In [1]:
import numpy as np
import pandas as pd
import pyarrow
import fastparquet
from pandasql import sqldf
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder



# 1. Segment the users into segments that represent the ways that a users is paid. For instance bi-weekly vs pension vs gig economy. Ensure that you can model a user with multiple income streams. Please provide an example segmentation

### For step 1: i will segment data by kewords in column "name" and then by dates...

In [2]:
#Data pre processing
transactions = pd.read_parquet("data_science_take_home_transactions.parquet", engine='pyarrow')
advances = pd.read_parquet("data_science_take_home_advances.parquet", engine='pyarrow')

In [3]:
# Create a copy of the 'account_id' column as 'account_id_string'
transactions['account_id_string'] = transactions['account_id'].copy()
advances['account_id_string'] = advances['account_id'].copy()
# Create a label encoder object
label_encoder = LabelEncoder()

# Transform 'account_id' column to numerical values and replace the original column in 'y'
transactions['account_id'] = label_encoder.fit_transform(transactions['account_id'])
advances['account_id'] = label_encoder.fit_transform(advances['account_id'])

# Display the updated columns 'account_id' and 'account_id_string'
print(advances[['account_id', 'account_id_string']].head())

   account_id account_id_string
0          26     0adv34tf69c76
1         608     ahmdvxy8kq3gj
2         103     1jq4t1y123xd4
3        1177     k9tw3fj0c6fp0
4         650     bahm3606b8rse


In [4]:
#Only looking at postivie numbers and ignoring charges for now.
#Noticed that for payment_channel: "online" and "in-store" can be filtered out bc they seem to exclude salaries.
selected_data = transactions[(transactions['payment_channel'] != 'online') &
                             (transactions['payment_channel'] != 'in store') &
                             (transactions['f0_'] > 0)]

In [11]:
#Categorize transactions into income categories

def categorize_income(description):
    description_lower = description.lower()
    
    bi_weekly_keywords = ['bi-weekly', 'salary', 'paycheck', 'wage', 'payment', 'payroll']
    gig_keywords = ['gig', 'contractor', 'freelance', 'uber', 'lyft']
    pension_keywords = ['pension', 'retirement', 'annuity']
    
    non_income_keywords = ['atm', 'fee', 'charge', 'expense', 'tax', 'withdrawal']  # Add non-income keywords
    
    # Check if non-income keywords are present in the transaction name
    if any(keyword in description_lower for keyword in non_income_keywords):
        return 'Non-Income'  # Assign a label for non-income transactions
    
    if any(keyword in description_lower for keyword in bi_weekly_keywords):
        return 'Salaried'
    elif any(keyword in description_lower for keyword in gig_keywords):
        return 'Gig Economy'
    elif any(keyword in description_lower for keyword in pension_keywords):
        return 'Pension'
    else:
        return 'Other'

In [12]:
#Categorize transactions into income categories
selected_data['income_category'] = selected_data['name'].apply(categorize_income)  # Use function to categorize income types

#Aggregate transactions per user for income categories
user_income_summary = selected_data.groupby(['account_id', 'income_category']).agg({'f0_': ['count', 'sum']})
user_income_summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['income_category'] = selected_data['name'].apply(categorize_income)  # Use function to categorize income types


Unnamed: 0_level_0,Unnamed: 1_level_0,f0_,f0_
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
account_id,income_category,Unnamed: 2_level_2,Unnamed: 3_level_2
0,Other,45,10101.68
0,Salaried,62,20939.01
1,Non-Income,3,4280.00
1,Other,15,4920.27
2,Other,88,61244.64
...,...,...,...
1902,Non-Income,6,120.00
1902,Other,97,22612.11
1902,Salaried,1,70.00
1903,Non-Income,13,89.25


In [13]:
selected_data

Unnamed: 0,account_id,id,version,pending,removed,f0_,currency_code,date,authorized_date,name,city,region,payment_meta_payment_method,payment_meta_payment_processor,payment_channel,bank_account_id,account_id_string,income_category
4,1665,bQjKpjODw8cLVEZB6kD9fez4dN04ZQs9nVoz0,1.000000000,0,0,54.18,USD,2021-11-24,,"VISA TRANSFER, *****30100057355, AUT 112321 VI...",,,,,other,00kEXk41nDi5v4yjokE7TqDA4jxAeVhdDYMKz,w4d61f7p31574,Other
11,1665,Q87Qp7vXkJTxJjQZb97NtaMyELjyQDTJ85MxQ,1.000000000,0,0,200.00,USD,2021-11-10,,"ATM CASH DEPOSIT, *****3010005735",,,,,other,00kEXk41nDi5v4yjokE7TqDA4jxAeVhdDYMKz,w4d61f7p31574,Non-Income
14,1665,9589X8gvxATeNg3rY0pJhVjgoVOQ3niy1L4KL,1.000000000,0,0,60.00,USD,2022-09-01,,"ACH DEPOSIT, WISELY ACH P2PTRANSFR 8683261 J ROM",,,ACH,,other,00kEXk41nDi5v4yjokE7TqDA4jxAeVhdDYMKz,w4d61f7p31574,Other
16,1665,J04Op4qv1Ji873Ej6YVpIN0DeNkjmdskyavoX,1.000000000,0,0,1135.21,USD,2022-09-23,,"ACH DEPOSIT, CTC ACADEMY INC DIRECT DEP ****85...",,,ACH,,other,00kEXk41nDi5v4yjokE7TqDA4jxAeVhdDYMKz,w4d61f7p31574,Other
18,1665,XEoMpoJnX8t9bZqVMwOBsjxpQ73pq9i6EDyDD,1.000000000,0,0,300.00,USD,2022-03-01,,"ATM CASH DEPOSIT, *****3010005735",,,,,other,00kEXk41nDi5v4yjokE7TqDA4jxAeVhdDYMKz,w4d61f7p31574,Non-Income
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1962474,1201,4pne03LZwphYQDoBPNOySXj5AVeB8VC1vAw40,1.000000000,0,0,92.00,USD,2022-10-21,2022-10-21,Zayzoon Payout CHEYENNE WY 125560 10/2,Cheyenne,WY,,,other,Dzx6EBa4mziyMXKjV0ZxSngk5QNXJzHgx9jZK,kv8gyng97yyg2,Other
1962475,1201,Jz1BNvo4Lzij8J6QMdkri5kYZLx0aLIAe9jno,1.000000000,0,0,94.00,USD,2022-09-02,2022-09-02,Zayzoon Payout CHEYENNE WY 153665 09/0,Cheyenne,WY,,,other,Dzx6EBa4mziyMXKjV0ZxSngk5QNXJzHgx9jZK,kv8gyng97yyg2,Other
1962476,1201,XzXPRnNowziV9LMxp308H56PqVoxyVIO3rRZq,1.000000000,0,0,94.00,USD,2022-09-30,2022-09-30,Zayzoon Payout CHEYENNE WY 207757 09/3,Cheyenne,WY,,,other,Dzx6EBa4mziyMXKjV0ZxSngk5QNXJzHgx9jZK,kv8gyng97yyg2,Other
1962477,1201,6pe4EK17LphMrwY67NQ0uoM90xJOXxiV40P84,1.000000000,0,0,84.00,USD,2022-12-02,2022-12-02,Zayzoon Payout CHEYENNE WY 455830 12/0,Cheyenne,WY,,,other,Dzx6EBa4mziyMXKjV0ZxSngk5QNXJzHgx9jZK,kv8gyng97yyg2,Other


In [9]:
# Features: X (input variables)
# Assuming 'selected_data' contains all your data including features and target variable
# 'income_category' is your target variable, and other columns are features

X = selected_data.drop('income_category', axis=1)  # Drop the target variable to get features

# Target variable: y
y = selected_data[['name','date','income_category', 'account_id']]  # 'account_id' is part of the target variable

# Check shapes to ensure the split is correct
print("Shape of Features (X):", X.shape)
print("Shape of Target Variable (y):", y.shape)


Shape of Features (X): (353482, 17)
Shape of Target Variable (y): (353482, 4)


In [8]:
y
#Now lets try to segment this data based on dates. 
#since segmenting by key-words needs a more sofiticaded modeling...  

Unnamed: 0,name,date,income_category,account_id
4,"VISA TRANSFER, *****30100057355, AUT 112321 VI...",2021-11-24,Other Income,1665
11,"ATM CASH DEPOSIT, *****3010005735",2021-11-10,Non-Income,1665
14,"ACH DEPOSIT, WISELY ACH P2PTRANSFR 8683261 J ROM",2022-09-01,Other Income,1665
16,"ACH DEPOSIT, CTC ACADEMY INC DIRECT DEP ****85...",2022-09-23,Other Income,1665
18,"ATM CASH DEPOSIT, *****3010005735",2022-03-01,Non-Income,1665
...,...,...,...,...
1962474,Zayzoon Payout CHEYENNE WY 125560 10/2,2022-10-21,Other Income,1201
1962475,Zayzoon Payout CHEYENNE WY 153665 09/0,2022-09-02,Other Income,1201
1962476,Zayzoon Payout CHEYENNE WY 207757 09/3,2022-09-30,Other Income,1201
1962477,Zayzoon Payout CHEYENNE WY 455830 12/0,2022-12-02,Other Income,1201


In [8]:
from sklearn.preprocessing import LabelEncoder

# Create a copy of the 'account_id' column as 'account_id_string'
y['account_id_string'] = y['account_id'].copy()

# Create a label encoder object
label_encoder = LabelEncoder()

# Transform 'account_id' column to numerical values and replace the original column in 'y'
y['account_id'] = label_encoder.fit_transform(y['account_id'])

# Display the updated columns 'account_id' and 'account_id_string'
print(y[['account_id', 'account_id_string']].head())


    account_id account_id_string
4         1665     w4d61f7p31574
11        1665     w4d61f7p31574
14        1665     w4d61f7p31574
16        1665     w4d61f7p31574
18        1665     w4d61f7p31574


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['account_id_string'] = y['account_id'].copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['account_id'] = label_encoder.fit_transform(y['account_id'])


## 2 - How would you calculate the next pay day for each user? How would you test the performance of model? Please provide your answer as a set of rules or a proposed machine learning model (ensemble will also work). 