In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/training.csv')
df.head()

Unnamed: 0,txkey,locdt,loctm,chid,cano,contp,etymd,mchno,acqic,mcc,...,stocn,scity,stscd,ovrlt,flbmk,hcefg,csmcu,csmam,flg_3dsmk,label
0,1c09727c939eb69ead2a4ce4072b8aa18992a64f01fcb4...,46,101812,84d2dc85d4da6a7fa284a11a4290d7e9a969163dcb4d82...,3dd5bf1e29e5e0baa789ce692fe5dbd34ff05173acf351...,5,1.0,cad752c5d05d2bdcc30d64fa4e68404c2d1f7be5d14d52...,8f6b3ff512a001e0d1988c6cd888ef8c74112fb71117e5...,375.0,...,0.0,15759.0,,0,0,6.0,70.0,1529,0,0
1,2043f245a93bc6328dac964d6dbc89f13a0346062c194d...,17,85509,9a8cf5d0afd729cb7876f6b3172152c7c9c6fabd40515c...,8cb13f9b38c7bbc02d210e580dcbbcbb6c95bf18bc3320...,5,8.0,4356c6642ef6e837543b577c7ee3ffa92b4b8fcfb57254...,379166ff4a62dac343b4b734188aa618716cc496e48b65...,282.0,...,0.0,15759.0,,0,0,8.0,70.0,101,0,0
2,e4853710290a8409279f3279f8032ae670824bd19aa173...,44,212641,dcc1389a5765d6f53152cf85970fbe78a83fd3d1c299b0...,1ec32868e5e1d5ff8df56737c2a91326cbfe3364382de6...,5,4.0,5b7eff061f8896aac4339ea35c25f8bb956a43bc486460...,8f6b3ff512a001e0d1988c6cd888ef8c74112fb71117e5...,288.0,...,0.0,15759.0,,0,0,6.0,70.0,116,0,0
3,74d811b1dbc28b22b73ba2c79bb6033791d913b6d27a25...,42,102702,577f2329d1eccd59ba0abaf6113bb78dcd575badcbc57f...,4359dca1ac6a835eceb2bc0dd6b0b710f030c3499126e9...,5,1.0,cad752c5d05d2bdcc30d64fa4e68404c2d1f7be5d14d52...,36684976be1f529e6e2a32c9edab4cf8e364b2b916ae2c...,375.0,...,0.0,15759.0,,0,0,6.0,70.0,1683,0,0
4,68ca182343969d429d79a34e532bc1ca7a3cc032c2ad81...,31,185737,fff6b4126c40620b1fbb11d4de02cd67b9e95071caa40b...,a3837f2905383f235a72679482c5f02e40f2a8ca29750d...,5,5.0,50d5b02ce3fc88723438c2a29cfdb04be4a1a11280ddb6...,379166ff4a62dac343b4b734188aa618716cc496e48b65...,406.0,...,0.0,15759.0,,0,0,6.0,70.0,14,0,0


# locdt loctm
Normalize loctm, for example 120000 => 0.5

In [3]:
df['normalized_loctm'] = df['loctm'] // 10000 * 3600 + (df['loctm'] % 10000)//100 * 60 + df['loctm'] % 100
df['normalized_loctm']=(df['normalized_loctm']-df['normalized_loctm'].min())/(df['normalized_loctm'].max()-df['normalized_loctm'].min()+1)

In [4]:
normalized_loctm = df['normalized_loctm']
df.drop(['loctm', 'normalized_loctm'], axis=1, inplace=True)
df.insert(1, 'loctm', normalized_loctm)
df.head()

Unnamed: 0,txkey,loctm,locdt,chid,cano,contp,etymd,mchno,acqic,mcc,...,stocn,scity,stscd,ovrlt,flbmk,hcefg,csmcu,csmam,flg_3dsmk,label
0,1c09727c939eb69ead2a4ce4072b8aa18992a64f01fcb4...,0.429306,46,84d2dc85d4da6a7fa284a11a4290d7e9a969163dcb4d82...,3dd5bf1e29e5e0baa789ce692fe5dbd34ff05173acf351...,5,1.0,cad752c5d05d2bdcc30d64fa4e68404c2d1f7be5d14d52...,8f6b3ff512a001e0d1988c6cd888ef8c74112fb71117e5...,375.0,...,0.0,15759.0,,0,0,6.0,70.0,1529,0,0
1,2043f245a93bc6328dac964d6dbc89f13a0346062c194d...,0.371632,17,9a8cf5d0afd729cb7876f6b3172152c7c9c6fabd40515c...,8cb13f9b38c7bbc02d210e580dcbbcbb6c95bf18bc3320...,5,8.0,4356c6642ef6e837543b577c7ee3ffa92b4b8fcfb57254...,379166ff4a62dac343b4b734188aa618716cc496e48b65...,282.0,...,0.0,15759.0,,0,0,8.0,70.0,101,0,0
2,e4853710290a8409279f3279f8032ae670824bd19aa173...,0.89353,44,dcc1389a5765d6f53152cf85970fbe78a83fd3d1c299b0...,1ec32868e5e1d5ff8df56737c2a91326cbfe3364382de6...,5,4.0,5b7eff061f8896aac4339ea35c25f8bb956a43bc486460...,8f6b3ff512a001e0d1988c6cd888ef8c74112fb71117e5...,288.0,...,0.0,15759.0,,0,0,6.0,70.0,116,0,0
3,74d811b1dbc28b22b73ba2c79bb6033791d913b6d27a25...,0.43544,42,577f2329d1eccd59ba0abaf6113bb78dcd575badcbc57f...,4359dca1ac6a835eceb2bc0dd6b0b710f030c3499126e9...,5,1.0,cad752c5d05d2bdcc30d64fa4e68404c2d1f7be5d14d52...,36684976be1f529e6e2a32c9edab4cf8e364b2b916ae2c...,375.0,...,0.0,15759.0,,0,0,6.0,70.0,1683,0,0
4,68ca182343969d429d79a34e532bc1ca7a3cc032c2ad81...,0.790012,31,fff6b4126c40620b1fbb11d4de02cd67b9e95071caa40b...,a3837f2905383f235a72679482c5f02e40f2a8ca29750d...,5,5.0,50d5b02ce3fc88723438c2a29cfdb04be4a1a11280ddb6...,379166ff4a62dac343b4b734188aa618716cc496e48b65...,406.0,...,0.0,15759.0,,0,0,6.0,70.0,14,0,0


# Original

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold


print('No Frauds', round(df['label'].value_counts()[0]/len(df) * 100,2), '% of the dataset')
print('Frauds', round(df['label'].value_counts()[1]/len(df) * 100,2), '% of the dataset')

X = df.drop('label', axis=1)
y = df['label']

sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]


No Frauds 99.63 % of the dataset
Frauds 0.37 % of the dataset
Train: [1722669 1722851 1722891 ... 8688523 8688524 8688525] Test: [      0       1       2 ... 1737758 1737759 1737760]
Train: [      0       1       2 ... 8688523 8688524 8688525] Test: [1722669 1722851 1722891 ... 3475467 3475468 3475469]
Train: [      0       1       2 ... 8688523 8688524 8688525] Test: [3461625 3461631 3461936 ... 5213249 5213250 5213251]
Train: [      0       1       2 ... 8688523 8688524 8688525] Test: [5175924 5176029 5176360 ... 6950945 6950946 6950947]
Train: [      0       1       2 ... 6950945 6950946 6950947] Test: [6919499 6919679 6919898 ... 8688523 8688524 8688525]


In [6]:
from sklearn.utils import resample

original_data = pd.concat([original_Xtrain,  original_ytrain], axis=1)
min_class_size = len(original_data[original_data['label']==1])

maj_class = original_data[original_data['label']==0]
min_class = original_data[original_data['label']==1]

min_samples = int(1000000 * (min_class_size/len(original_data)))
maj_samples = 1000000 - min_samples

maj = resample(maj_class, replace=False, n_samples=maj_samples, random_state=42)
min = resample(min_class, replace=False, n_samples=min_samples, random_state=42)

stratified = pd.concat([maj, min])
original_Xtrain = stratified.drop('label', axis=1)
original_ytrain = stratified['label']

# Transform object values

In [7]:
cats = ['txkey', 'chid', 'cano', 'mchno', 'acqic']

ord_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoded_data = ord_encoder.fit_transform(original_Xtrain[cats])

original_Xtrain[cats] = encoded_data

In [8]:
original_Xtrain.dtypes

txkey        float64
loctm        float64
locdt          int64
chid         float64
cano         float64
contp          int64
etymd        float64
mchno        float64
acqic        float64
mcc          float64
conam        float64
ecfg           int64
insfg          int64
iterm        float64
bnsfg          int64
flam1          int64
stocn        float64
scity        float64
stscd        float64
ovrlt          int64
flbmk          int64
hcefg        float64
csmcu        float64
csmam          int64
flg_3dsmk      int64
dtype: object

# Fill nan values

In [9]:
na = ['etymd', 'mcc', 'stocn', 'scity', 'stscd', 'hcefg', 'csmcu']

simple_imputer = SimpleImputer(strategy="most_frequent")
imputed_data = simple_imputer.fit_transform(original_Xtrain[na])

original_Xtrain[na] = imputed_data

In [10]:
original_Xtrain.isna().sum()

txkey        0
loctm        0
locdt        0
chid         0
cano         0
contp        0
etymd        0
mchno        0
acqic        0
mcc          0
conam        0
ecfg         0
insfg        0
iterm        0
bnsfg        0
flam1        0
stocn        0
scity        0
stscd        0
ovrlt        0
flbmk        0
hcefg        0
csmcu        0
csmam        0
flg_3dsmk    0
dtype: int64

# Feature selection
### numeric features: 
locdt, loctm, conam, iterm, flam1, csmam

## First see the relationship between 3 money-related features

## Summary
* conma and flam1 are highly correlated so we choose one

In [11]:
original_Xtrain[['conam', 'flam1', 'csmam']].corr()

Unnamed: 0,conam,flam1,csmam
conam,1.0,0.999919,0.093605
flam1,0.999919,1.0,0.093638
csmam,0.093605,0.093638,1.0


In [12]:
num_attribs = ['locdt', 'loctm', 'conam', 'iterm', 'csmam']

### categorical features
chid, cano, contp, etymd, mchno, acqic, mcc, ecfg, insfg, bnsfg, stocn, scity, stscd, ovrlt, flbmk, hcefg, csmcu, flg_3dsmk

## See if stocn and scity are correlated
### Summary
* p-value < 0.05
* 

In [13]:
from sklearn.feature_selection import chi2

chi2(original_Xtrain['stocn'].values.reshape(-1, 1), original_Xtrain['scity'])

(array([83241429.8041404]), array([0.]))

In [14]:
from sklearn.feature_selection import SelectKBest


cat_attribs = ['chid', 'cano', 'contp', 'etymd', 'mchno', 'acqic', 'mcc', 'ecfg', 'insfg', 'bnsfg', 'scity', 'stscd', 
               'ovrlt', 'flbmk', 'hcefg', 'csmcu', 'flg_3dsmk']
cat_original_Xtrain = original_Xtrain[cat_attribs]
selector = SelectKBest(chi2, k=len(cat_attribs)//2)
selector.fit(cat_original_Xtrain, original_ytrain)

In [15]:
cols_idxs = selector.get_support(indices=True)
new_cat_original_Xtrain = cat_original_Xtrain.iloc[:,cols_idxs]

In [16]:
selected_original_Xtrain = pd.concat([new_cat_original_Xtrain, original_Xtrain[num_attribs]], axis=1)
selected_columns = selected_original_Xtrain.columns
selected_columns

Index(['chid', 'cano', 'mchno', 'acqic', 'mcc', 'ecfg', 'scity', 'csmcu',
       'locdt', 'loctm', 'conam', 'iterm', 'csmam'],
      dtype='object')

# Smote

In [17]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from joblib import Parallel, delayed
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

print('Length of X (train): {} | Length of y (train): {}'.format(len(original_Xtrain), len(original_ytrain)))
print('Length of X (test): {} | Length of y (test): {}'.format(len(original_Xtest), len(original_ytest)))

# List to append the score and then find the average
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []

# Classifier with optimal parameters
# log_reg_sm = grid_log_reg.best_estimator_

selected_original_Xtrain = selected_original_Xtrain.values
selected_original_Xtest = original_Xtest[selected_columns].values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values



Length of X (train): 1000000 | Length of y (train): 1000000
Length of X (test): 1737705 | Length of y (test): 1737705


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

log_reg_params = {"penalty": ['l2'], 'C': [0.01, 0.1, 1, 10, 100]}

# rand_log_reg = RandomizedSearchCV(LogisticRegression(max_iter=1000, solver='saga', n_jobs=-1), log_reg_params, n_iter=3, n_jobs=-1)
log_reg = LogisticRegression(max_iter=5000, solver='saga', n_jobs=-1)

robust_scaler = RobustScaler()


for index, (train, test) in enumerate(sss.split(selected_original_Xtrain, original_ytrain)):
    print(index)
    pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy=0.75, k_neighbors=4), log_reg) # SMOTE happens during Cross Validation not before..
    X_train = robust_scaler.fit_transform(selected_original_Xtrain[train])
    model = pipeline.fit(X_train, original_ytrain[train])
    prediction = log_reg.predict(selected_original_Xtrain[test])
    
    accuracy_lst.append(pipeline.score(selected_original_Xtrain[test], original_ytrain[test]))
    precision_lst.append(precision_score(original_ytrain[test], prediction))
    recall_lst.append(recall_score(original_ytrain[test], prediction))
    f1_lst.append(f1_score(original_ytrain[test], prediction))
    auc_lst.append(roc_auc_score(original_ytrain[test], prediction))

0




1




2




3




4




In [19]:
from joblib import dump, load
dump(log_reg, 'trained_models/log_reg_v3_smote_0.8_k4.joblib') 

['trained_models/log_reg_v3_smote_0.8_k4.joblib']

In [20]:
print(f'accuracy {np.mean(accuracy_lst)}')
print(f'precision {np.mean(precision_lst)}')
print(f'recall {np.mean(recall_lst)}')
print(f'f1 {np.mean(f1_lst)}')
print(f'auc {np.mean(auc_lst)}')

accuracy 0.7230889999999999
precision 0.005646617196464971
recall 0.4167429666155549
f1 0.011131395453720374
auc 0.5704827764143134
