# Load data

In [5]:
import pandas as pd
import numpy as np

# Consumer table: the `credit_score` column is removed before anything else is derived.
cons_raw = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-consDF.pqt").drop(
    columns=["credit_score"]
)

# Rows with a missing DQ_TARGET are set aside as `testDF`; `consDF` keeps only
# rows with no missing value in any remaining column (so it excludes them).
testDF = cons_raw[cons_raw["DQ_TARGET"].isna()]
consDF = cons_raw.dropna()

# Account and transaction tables are restricted to the consumers kept in `consDF`.
kept_ids = consDF["prism_consumer_id"]

acctDF = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-acctDF.pqt")
acctDF = acctDF[acctDF["prism_consumer_id"].isin(kept_ids)]

trxnDF = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-trxnDF.pqt")
trxnDF = trxnDF[trxnDF["prism_consumer_id"].isin(kept_ids)]

# Category map CSV (provides the `category_id` column).
catmap = pd.read_csv("/uss/hdsi-prismdata/q2-ucsd-cat-map.csv")

# Define global vars (can change)

In [6]:
# Tunable notebook-level settings, passed to the feature builders further down.
# `income_cats`: presumably the category ids counted as income — confirm against the category map.
income_cats = [2, 3, 5, 7, 8, 9, 49]
# `windows`: window lengths; the resulting feature columns are suffixed 1m/3m/6m/9m.
windows = [1, 3, 6, 9]
# `categories`: every distinct category id present in the category map.
categories = catmap["category_id"].unique()

# Import feature module (`features.income_ratio_features`)

In [12]:
from features import income_ratio_features as irf

# Calling the functions

## Create netflow (income minus spending) features
- `m1` is an intermediate consumer-month-level df that can be reused for future feature extraction (not needed right now)
- `netflow_feats` is the resulting consumer-level netflow df

In [14]:
def _cashflows(window, consumer_level):
    """Call `irf.build_monthly_cashflows` on `trxnDF` with the shared `income_cats`."""
    return irf.build_monthly_cashflows(
        trxnDF,
        window=window,
        income_cats=income_cats,
        return_consumer_level=consumer_level,
    )


# EX: consumer-month-level tables (not consumer level) for the 1m and 3m windows
m1 = _cashflows(1, consumer_level=False)
m3 = _cashflows(3, consumer_level=False)

# Per-consumer totals taken from the 1m table
# --> cols: months_observed, total_income, total_spend, total_net_flow
monthly_by_consumer = m1.groupby("prism_consumer_id", as_index=False)
netflow_feats = monthly_by_consumer.agg(
    months_observed=("month", "nunique"),
    total_income=("income_1m", "sum"),
    total_spend=("spend_1m", "sum"),
    total_net_flow=("net_flow_1m", "sum"),
)

# Attach the consumer-level summary stats for each window in `windows`.
# The left join keeps every consumer already in `netflow_feats`, even when a
# given window's table has no row for them.
for months in windows:
    netflow_feats = netflow_feats.merge(
        _cashflows(months, consumer_level=True),
        on="prism_consumer_id",
        how="left",
    )

In [15]:
# consumer-month-level
m1

Unnamed: 0,month,income,spend,prism_consumer_id,income_1m,spend_1m,net_flow_1m
0,2021-03-01,1000.22,1999.35,0,1000.22,1999.35,-999.13
1,2021-04-01,2167.37,2379.93,0,2167.37,2379.93,-212.56
2,2021-05-01,966.68,765.50,0,966.68,765.50,201.18
3,2021-06-01,185.70,3652.37,0,185.70,3652.37,-3466.67
4,2021-07-01,2144.69,2276.21,0,2144.69,2276.21,-131.52
...,...,...,...,...,...,...,...
72063,2020-11-01,3415.00,3050.31,999,3415.00,3050.31,364.69
72064,2020-12-01,3524.88,4425.33,999,3524.88,4425.33,-900.45
72065,2021-01-01,4340.00,5485.97,999,4340.00,5485.97,-1145.97
72066,2021-02-01,4080.38,7895.62,999,4080.38,7895.62,-3815.24


In [16]:
m3

Unnamed: 0,month,income,spend,prism_consumer_id,income_3m,spend_3m,net_flow_3m
0,2021-03-01,1000.22,1999.35,0,,,
1,2021-04-01,2167.37,2379.93,0,,,
2,2021-05-01,966.68,765.50,0,4134.27,5144.78,-1010.51
3,2021-06-01,185.70,3652.37,0,3319.75,6797.80,-3478.05
4,2021-07-01,2144.69,2276.21,0,3297.07,6694.08,-3397.01
...,...,...,...,...,...,...,...
72063,2020-11-01,3415.00,3050.31,999,4339.00,4411.49,-72.49
72064,2020-12-01,3524.88,4425.33,999,7683.88,8045.11,-361.23
72065,2021-01-01,4340.00,5485.97,999,11279.88,12961.61,-1681.73
72066,2021-02-01,4080.38,7895.62,999,11945.26,17806.92,-5861.66


In [18]:
# consumer-level netflows
netflow_feats

Unnamed: 0,prism_consumer_id,months_observed,total_income,total_spend,total_net_flow,avg_1m_netflow,med_1m_netflow,sd_1m_netflow,min_1m_netflow,max_1m_netflow,...,avg_6m_netflow,med_6m_netflow,sd_6m_netflow,min_6m_netflow,max_6m_netflow,avg_9m_netflow,med_9m_netflow,sd_9m_netflow,min_9m_netflow,max_9m_netflow
0,0,7,9320.56,14908.41,-5587.85,-798.264286,-212.560,1363.160352,-3466.67,513.28,...,-5344.925,-5344.925,1069.435367,-6101.13,-4588.72,,,,,
1,1,7,13411.59,23098.37,-9686.78,-1383.825714,-1039.490,1234.856462,-3232.42,79.83,...,-8209.595,-8209.595,1732.715670,-9434.81,-6984.38,,,,,
2,10,7,15420.74,21766.60,-6345.86,-906.551429,-978.850,601.382838,-1915.14,-197.84,...,-5633.285,-5633.285,535.117199,-6011.67,-5254.90,,,,,
3,100,6,24411.78,39742.61,-15330.83,-2555.138333,-2087.665,2408.512923,-5515.09,624.68,...,-15330.830,-15330.830,,-15330.83,-15330.83,,,,,
4,1000,7,48378.60,77914.99,-29536.39,-4219.484286,-618.080,7192.050945,-18367.94,1348.32,...,-28927.255,-28927.255,12.650140,-28936.20,-28918.31,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11596,995,7,36558.09,31466.47,5091.62,727.374286,1065.550,2194.693702,-3745.92,3036.70,...,4245.425,4245.425,310.214816,4026.07,4464.78,,,,,
11597,996,7,4107.21,196249.61,-192142.40,-27448.914286,-26736.070,15990.433902,-56765.29,-7395.93,...,-171790.530,-171790.530,18322.466061,-184746.47,-158834.59,,,,,
11598,997,7,33797.46,81076.60,-47279.14,-6754.162857,-6003.820,2024.975140,-10818.18,-4970.93,...,-41756.060,-41756.060,167.867150,-41874.76,-41637.36,,,,,
11599,998,7,38813.02,66207.01,-27393.99,-3913.427143,-2019.060,5029.301050,-11758.49,560.53,...,-22484.075,-22484.075,7736.377511,-27954.52,-17013.63,,,,,


## Create category-to-income ratio features
- `monthly` is an intermediate consumer-month-level df that can be reused for future feature extraction (not needed right now)
- `cat_ratio_feats` is the consumer-level cat-income-ratio df

In [20]:
# Arguments common to both calls below.
shared_args = {"txn_df": trxnDF, "income_cats": income_cats}

# Consumer-month-level table, limited to categories 18 and 20 and the 3m/6m windows.
monthly = irf.build_monthly_category_to_income(
    category_ids=[18, 20],
    window=(3, 6),
    consumer_level=False,
    **shared_args,
)

# Consumer-level features for every category in `categories` and every window in `windows`.
cat_ratio_feats = irf.build_monthly_category_to_income(
    category_ids=categories,
    window=windows,
    consumer_level=True,
    **shared_args,
)

  return df.groupby(group_col, as_index=False).agg(**agg_spec)


In [21]:
monthly

Unnamed: 0,prism_consumer_id,month,income_1m,cat18_spend_1m,cat20_spend_1m,income_3m,cat18_spend_3m,cat20_spend_3m,cat18_to_income_ratio_3m,cat20_to_income_ratio_3m,income_6m,cat18_spend_6m,cat20_spend_6m,cat18_to_income_ratio_6m,cat20_to_income_ratio_6m
0,0,2021-03-01,1000.22,119.87,0.00,,,,,,,,,,
1,0,2021-04-01,2167.37,151.95,6.00,,,,,,,,,,
2,0,2021-05-01,966.68,0.00,35.41,4134.27,271.82,41.41,0.065748,0.010016,,,,,
3,0,2021-06-01,185.70,248.23,71.39,3319.75,400.18,112.80,0.120545,0.033978,,,,,
4,0,2021-07-01,2144.69,297.02,0.00,3297.07,545.25,106.80,0.165374,0.032392,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67515,999,2020-11-01,3415.00,60.56,215.06,4339.00,79.70,231.02,0.018368,0.053243,,,,,
67516,999,2020-12-01,3524.88,216.32,134.80,7683.88,296.02,365.82,0.038525,0.047609,,,,,
67517,999,2021-01-01,4340.00,119.00,20.98,11279.88,395.88,370.84,0.035096,0.032876,,,,,
67518,999,2021-02-01,4080.38,193.23,1516.96,11945.26,528.55,1672.74,0.044248,0.140034,16284.26,608.25,1903.76,0.037352,0.116908


In [22]:
cat_ratio_feats

Unnamed: 0,prism_consumer_id,avg_cat0_spend_1m,med_cat0_spend_1m,sd_cat0_spend_1m,min_cat0_spend_1m,max_cat0_spend_1m,avg_cat0_to_income_ratio_1m,med_cat0_to_income_ratio_1m,sd_cat0_to_income_ratio_1m,min_cat0_to_income_ratio_1m,...,avg_cat49_spend_9m,med_cat49_spend_9m,sd_cat49_spend_9m,min_cat49_spend_9m,max_cat49_spend_9m,avg_cat49_to_income_ratio_9m,med_cat49_to_income_ratio_9m,sd_cat49_to_income_ratio_9m,min_cat49_to_income_ratio_9m,max_cat49_to_income_ratio_9m
0,0,67.338571,43.64,61.742993,10.00,176.44,0.097667,0.045144,0.163582,0.009998,...,,,,,,,,,,
1,1,1300.428571,1065.00,696.831605,569.00,2619.00,0.753606,0.584402,0.506534,0.301338,...,,,,,,,,,,
2,10,271.507143,230.00,273.997385,0.00,600.55,0.141775,0.068539,0.160186,0.000000,...,,,,,,,,,,
3,100,1992.113333,1860.16,969.847252,1101.88,3160.16,0.527689,0.442395,0.341886,0.251854,...,,,,,,,,,,
4,1000,7054.987143,4546.66,7569.116043,1000.00,22899.63,1.046170,0.702107,0.784976,0.265232,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11511,995,0.148571,0.00,0.393083,0.00,1.04,0.000018,0.000000,0.000046,0.000000,...,,,,,,,,,,
11512,996,11337.808571,11500.00,8273.802795,0.00,23971.42,1.625856,1.625856,2.299308,0.000000,...,,,,,,,,,,
11513,997,4655.654286,4843.70,1034.651032,2454.07,5559.99,2.388590,1.027782,3.816735,0.584379,...,,,,,,,,,,
11514,998,2689.627143,900.00,3106.537795,0.00,7657.39,0.650571,0.125355,0.980806,0.000000,...,,,,,,,,,,


# Merge all feats into 1 DF and perform mean imputation

In [24]:
# Combine both consumer-level feature tables on the consumer id.
# NOTE(review): `merge` defaults to an inner join, so consumers missing from
# either table are dropped here — confirm that is intended.
mean_impute = netflow_feats.merge(cat_ratio_feats, on="prism_consumer_id")

# Fill missing values in each feature column with that column's own mean;
# the id column is left untouched.
feature_cols = [col for col in mean_impute.columns if col != "prism_consumer_id"]
for col in feature_cols:
    values = mean_impute[col]
    mean_impute[col] = values.fillna(values.mean())

mean_impute

Unnamed: 0,prism_consumer_id,months_observed,total_income,total_spend,total_net_flow,avg_1m_netflow,med_1m_netflow,sd_1m_netflow,min_1m_netflow,max_1m_netflow,...,avg_cat49_spend_9m,med_cat49_spend_9m,sd_cat49_spend_9m,min_cat49_spend_9m,max_cat49_spend_9m,avg_cat49_to_income_ratio_9m,med_cat49_to_income_ratio_9m,sd_cat49_to_income_ratio_9m,min_cat49_to_income_ratio_9m,max_cat49_to_income_ratio_9m
0,0,7,9320.56,14908.41,-5587.85,-798.264286,-212.560,1363.160352,-3466.67,513.28,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,7,13411.59,23098.37,-9686.78,-1383.825714,-1039.490,1234.856462,-3232.42,79.83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10,7,15420.74,21766.60,-6345.86,-906.551429,-978.850,601.382838,-1915.14,-197.84,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100,6,24411.78,39742.61,-15330.83,-2555.138333,-2087.665,2408.512923,-5515.09,624.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000,7,48378.60,77914.99,-29536.39,-4219.484286,-618.080,7192.050945,-18367.94,1348.32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11511,995,7,36558.09,31466.47,5091.62,727.374286,1065.550,2194.693702,-3745.92,3036.70,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11512,996,7,4107.21,196249.61,-192142.40,-27448.914286,-26736.070,15990.433902,-56765.29,-7395.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11513,997,7,33797.46,81076.60,-47279.14,-6754.162857,-6003.820,2024.975140,-10818.18,-4970.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11514,998,7,38813.02,66207.01,-27393.99,-3913.427143,-2019.060,5029.301050,-11758.49,560.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### `mean_impute` is the final df with all produced features and consumer id column