In [23]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
# Load the raw transaction log; the file is semicolon-separated.
transactions = pd.read_csv('../data/twm_transactions.csv', sep=';')
# Summarise column dtypes and non-null counts.
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77580 entries, 0 to 77579
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tran_id        77580 non-null  int64  
 1   acct_nbr       77580 non-null  int64  
 2   tran_amt       77580 non-null  float64
 3   principal_amt  77580 non-null  float64
 4   interest_amt   77580 non-null  float64
 5   new_balance    77580 non-null  float64
 6   tran_date      77580 non-null  object 
 7   tran_time      77580 non-null  int64  
 8   channel        77580 non-null  object 
 9   tran_code      77580 non-null  object 
dtypes: float64(4), int64(3), object(3)
memory usage: 5.9+ MB


In [25]:
# Count distinct tran_id values; the id column should be dropped (done in the cleaning cell).
transactions['tran_id'].nunique()

155

In [26]:
# Count distinct tran_code values; this categorical column should be one-hot encoded.
transactions['tran_code'].nunique()

13

In [27]:
# Count distinct channel values; this categorical column should be one-hot encoded.
transactions['channel'].nunique()

11

In [28]:
# Drop date and time (not important here); the transactions should also be grouped by account.

# Cleaning

In [29]:
# Remove the identifier and timestamp columns. Note that acct_nbr is removed
# here too, so no per-account grouping is possible after this cell.
cols_to_drop = ['tran_id', 'tran_date', 'tran_time', 'acct_nbr']
transactions = transactions.drop(columns=cols_to_drop)
transactions

Unnamed: 0,tran_amt,principal_amt,interest_amt,new_balance,channel,tran_code
0,0.00,0.00,0.0,3753.34,A,IQ
1,0.00,0.00,0.0,254.49,V,IQ
2,-97.57,-97.57,0.0,3819.56,P,WD
3,-0.15,-0.15,0.0,224.05,,FK
4,0.00,0.00,0.0,240.55,B,IQ
...,...,...,...,...,...,...
77575,-95.71,-95.71,0.0,95.71,A,TR
77576,-93.90,-93.90,0.0,824.36,P,WD
77577,-10.35,-10.35,0.0,10.36,P,WD
77578,-423.80,-423.80,0.0,162.28,E,WD


In [30]:
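# # group by acct_nbr (disabled: acct_nbr is dropped in the cleaning cell above, so this would raise a KeyError as written)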
# acct_transactions = transactions.groupby('acct_nbr').mean()#.drop('new_balance', axis=1)
# acct_transactions

In [31]:
# # add transaction counts (disabled: relies on acct_nbr, which is dropped in the cleaning cell above)
# acct_transactions['tran_count'] = transactions.groupby('acct_nbr').count()['tran_amt']
# acct_transactions

# Scaling

In [32]:
# Scaling
from sklearn.preprocessing import StandardScaler

# Select only the numeric columns. Filtering on "not object" would also let
# through any other non-numeric dtype (e.g. a dedicated string dtype) and then
# fail at .astype(float); selecting on 'number' avoids that.
# The categorical columns (channel, tran_code) are not part of the scaled frame.
num_feats = transactions.select_dtypes(include='number').columns.tolist()

# Standardise each numeric feature (StandardScaler with its default settings).
scaler = StandardScaler()

# fit_transform returns a bare array by default, so pass the feature names and
# the original row index explicitly; the frame is then correctly labelled as
# soon as it is created instead of depending on a later cell to rename columns.
scaled_transactions = pd.DataFrame(
    scaler.fit_transform(transactions[num_feats].astype(float)),
    columns=num_feats,
    index=transactions.index,
)

In [33]:
# Make sure the scaled frame's columns carry the numeric feature names,
# in the same order in which the features were scaled.
scaled_transactions.columns = list(num_feats)

# Display the labelled, scaled frame.
scaled_transactions

Unnamed: 0,tran_amt,principal_amt,interest_amt,new_balance
0,0.024780,0.027362,-0.172302,1.473768
1,0.024780,0.027362,-0.172302,-0.146869
2,-0.287262,-0.286364,-0.172302,1.504441
3,0.024300,0.026880,-0.172302,-0.160969
4,0.024780,0.027362,-0.172302,-0.153326
...,...,...,...,...
77575,-0.281313,-0.280383,-0.172302,-0.220415
77576,-0.275524,-0.274563,-0.172302,0.117090
77577,-0.008321,-0.005917,-0.172302,-0.259948
77578,-1.330588,-1.335319,-0.172302,-0.189580


In [34]:
# Export the scaled numeric features to CSV.
# The row index is written as the first (unnamed) column, so a reader should
# load this file with index_col=0 to avoid picking it up as a feature.
scaled_transactions.to_csv('../data/acct_transactions.csv', index=True)