In [1]:
import numpy as np
import pandas as pd
import pyarrow
import fastparquet
from pandasql import sqldf
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import string

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer




In [2]:
# Data pre-processing (copied from the earlier notebook).
# Loads the raw transactions/advances tables, encodes account ids as integers,
# keeps only positive-amount transactions outside the 'online'/'in store'
# channels, drops unused columns and derives calendar features from 'date'.
transactions = pd.read_parquet("data_science_take_home_transactions.parquet", engine='pyarrow')
advances = pd.read_parquet("data_science_take_home_advances.parquet", engine='pyarrow')

### 1. Convert 'account_id' to numerical values
# Keep the original identifier next to the encoded one as 'account_id_string'.
transactions['account_id_string'] = transactions['account_id'].copy()
advances['account_id_string'] = advances['account_id'].copy()

label_encoder = LabelEncoder()  # Create a label encoder object

# Fit ONE encoder on the union of ids from both tables. Calling fit_transform
# separately on each table refits the encoder, so the same account could end
# up with different integer codes in `transactions` and `advances`.
all_account_ids = pd.concat(
    [transactions['account_id'], advances['account_id']],
    ignore_index=True,
)
label_encoder.fit(all_account_ids)

# Replace the original column with the shared integer encoding in both tables.
transactions['account_id'] = label_encoder.transform(transactions['account_id'])
advances['account_id'] = label_encoder.transform(advances['account_id'])

### 2. Only look at positive amounts and ignore charges (for now?).
# Rows with payment_channel 'online' or 'in store' are filtered out because
# they seem to exclude salaries.
selected_data = transactions[(transactions['payment_channel'] != 'online') &
                             (transactions['payment_channel'] != 'in store') &
                             (transactions['f0_'] > 0)]

### 3. Ignore irrelevant columns
# NOTE(review): 'selected_data' is the name of the variable above and may not
# be a real column; errors='ignore' silently skips labels that do not exist,
# so a typo here would go unnoticed - confirm the intended column name.
columns_to_remove = ['version', 'currency_code', 'pending', 'removed', 'selected_data', 'city', 'region', 'payment_meta_payment_processor',
                    'authorized_date', 'payment_meta_payment_method', 'payment_channel']
cleaned_data = selected_data.drop(columns=columns_to_remove, errors='ignore')

# Ensure 'date' is a datetime column before using the .dt accessor.
cleaned_data['date'] = pd.to_datetime(cleaned_data['date'])

# Feature engineering: calendar features derived from the transaction date.
cleaned_data['day_of_week'] = cleaned_data['date'].dt.dayofweek
cleaned_data['week_number'] = cleaned_data['date'].dt.isocalendar().week
cleaned_data['month'] = cleaned_data['date'].dt.month

cleaned_data

Unnamed: 0,account_id,id,f0_,date,name,bank_account_id,account_id_string,day_of_week,week_number,month
4,1665,bQjKpjODw8cLVEZB6kD9fez4dN04ZQs9nVoz0,54.18,2021-11-24,"VISA TRANSFER, *****30100057355, AUT 112321 VI...",00kEXk41nDi5v4yjokE7TqDA4jxAeVhdDYMKz,w4d61f7p31574,2,47,11
11,1665,Q87Qp7vXkJTxJjQZb97NtaMyELjyQDTJ85MxQ,200.00,2021-11-10,"ATM CASH DEPOSIT, *****3010005735",00kEXk41nDi5v4yjokE7TqDA4jxAeVhdDYMKz,w4d61f7p31574,2,45,11
14,1665,9589X8gvxATeNg3rY0pJhVjgoVOQ3niy1L4KL,60.00,2022-09-01,"ACH DEPOSIT, WISELY ACH P2PTRANSFR 8683261 J ROM",00kEXk41nDi5v4yjokE7TqDA4jxAeVhdDYMKz,w4d61f7p31574,3,35,9
16,1665,J04Op4qv1Ji873Ej6YVpIN0DeNkjmdskyavoX,1135.21,2022-09-23,"ACH DEPOSIT, CTC ACADEMY INC DIRECT DEP ****85...",00kEXk41nDi5v4yjokE7TqDA4jxAeVhdDYMKz,w4d61f7p31574,4,38,9
18,1665,XEoMpoJnX8t9bZqVMwOBsjxpQ73pq9i6EDyDD,300.00,2022-03-01,"ATM CASH DEPOSIT, *****3010005735",00kEXk41nDi5v4yjokE7TqDA4jxAeVhdDYMKz,w4d61f7p31574,1,9,3
...,...,...,...,...,...,...,...,...,...,...
1962474,1201,4pne03LZwphYQDoBPNOySXj5AVeB8VC1vAw40,92.00,2022-10-21,Zayzoon Payout CHEYENNE WY 125560 10/2,Dzx6EBa4mziyMXKjV0ZxSngk5QNXJzHgx9jZK,kv8gyng97yyg2,4,42,10
1962475,1201,Jz1BNvo4Lzij8J6QMdkri5kYZLx0aLIAe9jno,94.00,2022-09-02,Zayzoon Payout CHEYENNE WY 153665 09/0,Dzx6EBa4mziyMXKjV0ZxSngk5QNXJzHgx9jZK,kv8gyng97yyg2,4,35,9
1962476,1201,XzXPRnNowziV9LMxp308H56PqVoxyVIO3rRZq,94.00,2022-09-30,Zayzoon Payout CHEYENNE WY 207757 09/3,Dzx6EBa4mziyMXKjV0ZxSngk5QNXJzHgx9jZK,kv8gyng97yyg2,4,39,9
1962477,1201,6pe4EK17LphMrwY67NQ0uoM90xJOXxiV40P84,84.00,2022-12-02,Zayzoon Payout CHEYENNE WY 455830 12/0,Dzx6EBa4mziyMXKjV0ZxSngk5QNXJzHgx9jZK,kv8gyng97yyg2,4,48,12
