In [2]:
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import time
import matplotlib.patches as mpatches
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD

In [226]:
#%%
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections

#%%
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score,precision_recall_curve
from collections import Counter

from sklearn.model_selection import KFold,StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, RobustScaler,OneHotEncoder
from scipy.stats import norm
from sklearn.model_selection import cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve,classification_report, average_precision_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from mlxtend.evaluate import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix


import warnings
warnings.filterwarnings("ignore")

In [227]:
# load the dataset
df = pd.read_csv("data/training.csv")
test = pd.read_csv("data/test.csv")

In [228]:
train = df.drop("FraudResult", axis=1)
label = df["FraudResult"]

In [229]:
all_data = pd.concat([train, test],axis=0)

In [230]:
all_data["TransactionStartTime"] = pd.to_datetime(all_data.TransactionStartTime)

In [231]:
# value_std_scaler = StandardScaler()
# # amount_std_scaler = StandardScaler()
# all_data["scaled_value"] = value_std_scaler.fit_transform(all_data.Value.values.reshape(-1,1))

In [232]:
label_encoder = LabelEncoder()
all_data["ProductCategory"] = label_encoder.fit_transform(all_data.ProductCategory)

In [233]:

# all_data.drop(["Value", "Amount"], axis=1, inplace=True)

all_data.head(1)

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,0,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2


In [234]:
# create features from datetime
all_data["hour"] = all_data["TransactionStartTime"].dt.hour
all_data["day"] = all_data["TransactionStartTime"].dt.day
all_data["week"] = all_data["TransactionStartTime"].dt.week
all_data["month"] = all_data["TransactionStartTime"].dt.month
all_data["year"] = all_data["TransactionStartTime"].dt.year
all_data["minute"] = all_data["TransactionStartTime"].dt.minute

all_data.drop(['TransactionStartTime'], axis=1, inplace=True)
all_data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy,hour,day,week,month,year,minute
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,0,ChannelId_3,1000.0,1000,2,2,15,46,11,2018,18
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,2,ChannelId_2,-20.0,20,2,2,15,46,11,2018,19
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,0,ChannelId_3,500.0,500,2,2,15,46,11,2018,44
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,9,ChannelId_3,20000.0,21800,2,3,15,46,11,2018,32
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,2,ChannelId_2,-644.0,644,2,3,15,46,11,2018,34


In [248]:
all_data.columns
col_to_remove = ['BatchId', 'AccountId', 'SubscriptionId',
                'CustomerId','ProviderId', 'ProductId','ProviderId', 'ProductId'
                ,'ChannelId',"CurrencyCode","CountryCode"]


all_data.drop(col_to_remove, axis=1, inplace=True)

In [249]:
X = all_data[:len(train)]
test_data = all_data[len(test):]

In [250]:
new_df = X.drop("TransactionId", axis=1)

# df["FraudResult"] = label
# Splitting the dataset
y = df["FraudResult"]

In [251]:
x_train, x_test, y_train, y_test = train_test_split(new_df.values, y.values, 
                                        test_size=0.2, random_state=42)

In [252]:
x_train

array([[ 2.000e+00, -1.000e+02,  1.000e+02, ...,  1.000e+00,  2.019e+03,
         2.100e+01],
       [ 0.000e+00,  5.000e+03,  5.000e+03, ...,  1.200e+01,  2.018e+03,
         1.800e+01],
       [ 0.000e+00,  1.000e+03,  1.000e+03, ...,  1.000e+00,  2.019e+03,
         2.900e+01],
       ...,
       [ 1.000e+00,  2.000e+03,  2.000e+03, ...,  1.000e+00,  2.019e+03,
         3.900e+01],
       [ 9.000e+00, -1.000e+04,  1.120e+04, ...,  1.100e+01,  2.018e+03,
         5.100e+01],
       [ 0.000e+00, -2.500e+04,  2.500e+04, ...,  1.200e+01,  2.018e+03,
         1.600e+01]])

In [253]:
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []

log_reg_params = {"penalty": ['l1', 'l2'], 
                  'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

log_reg_sm = LogisticRegression()
rand_log_reg = RandomizedSearchCV(LogisticRegression(), log_reg_params, n_iter=4)

In [254]:
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

In [255]:
print('Length of X (train): {} | Length of y (train): {}'.format(len(x_train), len(y_train)))
print('Length of X (test): {} | Length of y (test): {}'.format(len(x_test), len(y_test)))

Length of X (train): 76529 | Length of y (train): 76529
Length of X (test): 19133 | Length of y (test): 19133


In [256]:
from tqdm import tqdm

In [257]:
for ss_train, ss_test in tqdm(sss.split(x_train, y_train)):
    pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'), cat) # SMOTE happens during Cross Validation not before..
    model = pipeline.fit(x_train[ss_train], y_train[ss_train])
    best_est = cat.best_estimator_
    prediction = best_est.predict(x_train[ss_test])
    
    accuracy_lst.append(pipeline.score(x_train[ss_test], y_train[ss_test]))
    precision_lst.append(precision_score(y_train[ss_test], prediction))
    recall_lst.append(recall_score(y_train[ss_test], prediction))
    f1_lst.append(f1_score(y_train[ss_test], prediction))
    auc_lst.append(roc_auc_score(y_train[ss_test], prediction))
    
print('---' * 45)
print('')
print("accuracy: {}".format(np.mean(accuracy_lst)))
print("precision: {}".format(np.mean(precision_lst)))
print("recall: {}".format(np.mean(recall_lst)))
print("f1: {}".format(np.mean(f1_lst)))
print('---' * 45)



0it [00:00, ?it/s][A[A

Learning rate set to 0.065277
0:	learn: 0.5281437	total: 159ms	remaining: 2m 38s
1:	learn: 0.3723228	total: 260ms	remaining: 2m 9s
2:	learn: 0.2626170	total: 344ms	remaining: 1m 54s
3:	learn: 0.1919052	total: 452ms	remaining: 1m 52s
4:	learn: 0.1373545	total: 542ms	remaining: 1m 47s
5:	learn: 0.1002033	total: 628ms	remaining: 1m 44s
6:	learn: 0.0753135	total: 739ms	remaining: 1m 44s
7:	learn: 0.0589824	total: 825ms	remaining: 1m 42s
8:	learn: 0.0468971	total: 912ms	remaining: 1m 40s
9:	learn: 0.0387835	total: 1.02s	remaining: 1m 40s
10:	learn: 0.0328131	total: 1.1s	remaining: 1m 38s
11:	learn: 0.0276959	total: 1.19s	remaining: 1m 37s
12:	learn: 0.0242984	total: 1.29s	remaining: 1m 38s
13:	learn: 0.0216912	total: 1.38s	remaining: 1m 37s
14:	learn: 0.0190684	total: 1.47s	remaining: 1m 36s
15:	learn: 0.0172398	total: 1.57s	remaining: 1m 36s
16:	learn: 0.0158333	total: 1.66s	remaining: 1m 35s
17:	learn: 0.0142814	total: 1.74s	remaining: 1m 35s
18:	learn: 0.0131828	total: 1.88s	remaining: 1

AttributeError: 'CatBoostClassifier' object has no attribute 'best_estimator_'

In [163]:
labels = ['No Fraud', 'Fraud']
smote_prediction = best_est.predict(x_test)
print(classification_report(y_test, smote_prediction, target_names=labels))

              precision    recall  f1-score   support

    No Fraud       1.00      1.00      1.00     19097
       Fraud       0.27      1.00      0.43        36

    accuracy                           1.00     19133
   macro avg       0.64      1.00      0.71     19133
weighted avg       1.00      1.00      1.00     19133



In [247]:
# SMOTE Technique (OverSampling) After splitting and Cross Validating
sm = SMOTE(ratio='minority', random_state=42)
# Xsm_train, ysm_train = sm.fit_sample(X_train, y_train)


# This will be the data were we are going to 
Xsm_train, ysm_train = sm.fit_sample(new_df, y)

ValueError: could not convert string to float: 'BatchId_36123'

In [200]:
grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params)

grid_log_reg.fit(new_df,y)

In [202]:
# Logistic Regression
t0 = time.time()
log_reg_sm = grid_log_reg.best_estimator_
log_reg_sm.fit(Xsm_train, ysm_train)
t1 = time.time()
print("Fitting oversample data took :{} sec".format(t1 - t0))

Fitting oversample data took :1.5764820575714111 sec


In [245]:
cat = CatBoostClassifier()

In [213]:
cat.fit(Xsm_train, ysm_train)

Learning rate set to 0.074066
0:	learn: 0.4706906	total: 257ms	remaining: 4m 16s
1:	learn: 0.3056429	total: 383ms	remaining: 3m 11s
2:	learn: 0.2106434	total: 545ms	remaining: 3m 1s
3:	learn: 0.1480864	total: 672ms	remaining: 2m 47s
4:	learn: 0.1049032	total: 835ms	remaining: 2m 46s
5:	learn: 0.0780357	total: 1.01s	remaining: 2m 47s
6:	learn: 0.0592801	total: 1.17s	remaining: 2m 45s
7:	learn: 0.0464639	total: 1.29s	remaining: 2m 39s
8:	learn: 0.0378209	total: 1.43s	remaining: 2m 37s
9:	learn: 0.0316594	total: 1.55s	remaining: 2m 33s
10:	learn: 0.0275115	total: 1.69s	remaining: 2m 31s
11:	learn: 0.0233265	total: 1.81s	remaining: 2m 29s
12:	learn: 0.0208331	total: 1.95s	remaining: 2m 27s
13:	learn: 0.0187466	total: 2.25s	remaining: 2m 38s
14:	learn: 0.0167009	total: 2.37s	remaining: 2m 35s
15:	learn: 0.0150467	total: 2.6s	remaining: 2m 39s
16:	learn: 0.0139824	total: 2.83s	remaining: 2m 43s
17:	learn: 0.0128264	total: 3.02s	remaining: 2m 44s
18:	learn: 0.0120047	total: 3.23s	remaining: 2

<catboost.core.CatBoostClassifier at 0x7fb38f28ac50>

In [214]:
print(classification_report(y, cat.predict(new_df)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     95469
           1       0.96      0.99      0.97       193

    accuracy                           1.00     95662
   macro avg       0.98      0.99      0.99     95662
weighted avg       1.00      1.00      1.00     95662



In [215]:
new_test = all_data[len(train):]

In [216]:
all_data.shape[0] - len(new_test)

95662

In [217]:
id = new_test["TransactionId"]

In [222]:
pred = cat.predict(new_test.drop("TransactionId", axis=1))

In [223]:
submission = pd.DataFrame(data=id,columns=["TransactionId"])
submission["FraudResult"] = pred

In [224]:
submission.head(2)



Unnamed: 0,TransactionId,FraudResult
0,TransactionId_50600,0.0
1,TransactionId_95109,0.0


In [225]:
submission.to_csv("submission/catboost_results.csv",index=False)