# To Do (project)
- [x] Environment preparation
- [x] Load Data
- [x] EDA
- [ ] Prepare train and test datasets
- [ ] Model selection
- [ ] Model construction and training
- [ ] Evaluate model

# Preparation

In [1]:
# These are my default settings
import warnings
warnings.filterwarnings("ignore")

import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import joblib

plt.rcParams["figure.figsize"] = (12, 6)
sns.set()
pd.set_option("display.max_columns", None)

import zipfile
import shutil

# sklearnex is Intel's accelerated scikit-learn extension.
# NOTE(review): unpatch_sklearn() reverts that patching, i.e. the stock scikit-learn
# implementations are used from here on — confirm that disabling it is intended.
from sklearnex import unpatch_sklearn
unpatch_sklearn()

# Loading data

In [2]:
df = pd.read_csv("creditcard.csv")
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.551600,-0.617801,-0.991390,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.119670,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,-1.593105,2.711941,-0.689256,4.626942,-0.924459,1.107641,1.991691,0.510632,-0.682920,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,-0.150189,0.915802,1.214756,-0.675143,1.164931,-0.711757,-0.025693,-1.221179,-1.545556,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,0.411614,0.063119,-0.183699,-0.510602,1.329284,0.140716,0.313502,0.395652,-0.577252,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,-1.933849,-0.962886,-1.042082,0.449624,1.962563,-0.608577,0.509928,1.113981,2.897849,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


# EDA

In [3]:
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
def check_dataframe(df):
    """Print a short null-value report for ``df``.

    Prints whether ``df`` contains any null value and, when it does, prints
    the affected rows through ``show_null_rows``. Returns nothing.
    """
    print("Checking for df...")
    # Run the check once and branch on its result; the check prints its own
    # report line, so calling it twice would duplicate that line.
    if check_null_values(df):
        show_null_rows(df)


def check_null_values(df):
    """Report whether ``df`` holds at least one null value.

    Prints ``Null values: True`` or ``Null values: False`` and returns the
    same flag as a plain ``bool`` so that callers can branch on it.
    """
    null = bool(df.isnull().values.any())
    print(f"Null values: {null}")
    return null

def show_null_rows(df):
    """Print every row of ``df`` that has a null value in at least one column."""
    has_null = df.isnull().any(axis=1)
    rows_with_null = df[has_null]
    print(f"Null rows: {rows_with_null}")


In [5]:
test_df = pd.DataFrame({
    'column1': [1, 2, np.nan, 4, 5],
    'column2': [np.nan, 7, 8, 9, 10],
    'column3': [11, 12, 13, np.nan, 15]
    })
show_null_rows(test_df)

Null rows:    column1  column2  column3
0      1.0      NaN     11.0
2      NaN      8.0     13.0
3      4.0      9.0      NaN


In [6]:
check_dataframe(df)

Checking for df...
Null values: False
Null values: False


In [7]:
df["Class"].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [8]:
df.describe(include="all")

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,2.239053e-15,1.673327e-15,-1.247012e-15,8.190001e-16,1.207294e-15,4.887456e-15,1.437716e-15,-3.772171e-16,9.564149e-16,1.039917e-15,6.406204e-16,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,1.08885,1.020713,0.9992014,0.9952742,0.9585956,0.915316,0.8762529,0.8493371,0.8381762,0.8140405,0.770925,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,-24.58826,-4.797473,-18.68371,-5.791881,-19.21433,-4.498945,-14.12985,-25.1628,-9.498746,-7.213527,-54.49772,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,-0.5354257,-0.7624942,-0.4055715,-0.6485393,-0.425574,-0.5828843,-0.4680368,-0.4837483,-0.4988498,-0.4562989,-0.2117214,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,-0.09291738,-0.03275735,0.1400326,-0.01356806,0.05060132,0.04807155,0.06641332,-0.06567575,-0.003636312,0.003734823,-0.06248109,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,0.4539234,0.7395934,0.618238,0.662505,0.4931498,0.6488208,0.5232963,0.399675,0.5008067,0.4589494,0.1330408,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,23.74514,12.01891,7.848392,7.126883,10.52677,8.877742,17.31511,9.253526,5.041069,5.591971,39.4209,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [9]:
# df.iloc[:,1:-1].plot()

In [10]:
# Count each class once and reuse the result for the absolute and the relative figures
# (the original recomputed value_counts() four times over the full frame).
class_counts = df["Class"].value_counts()
n_rows = len(df)
normal_qty, fraud_qty = class_counts[0], class_counts[1]
print("Qty of Normal Transactions:", normal_qty, round(normal_qty / n_rows * 100, 2), "%",
      "\nQty of Fraud Transactions :", fraud_qty, "   ", round(fraud_qty / n_rows * 100, 2), "%")

Qty of Normal Transactions: 284315 99.83 % 
Qty of Fraud Transactions : 492     0.17 %


# Mini summary of the first data analysis
- Features V1–V28 are already PCA-transformed and centred; `Time` and `Amount` are still in their original units
- There are no null values
- It contains normal transactions labeled "0" and fraud transactions labeled "1"
- Imbalanced dataset: only 0.17% of the rows are fraud.

- Now it is time to analyze. As we can observe, the dataset is heavily imbalanced.

To Do
- Prepare train dataset with ONLY non-fraud data
- Prepare test dataset with 50/50 fraud/non-fraud data
- Pre-processing

# Preparing datasets

### Pre-processing

In [11]:
def drop_duplicated_values(dataframe):
    """Return a copy of ``dataframe`` without fully duplicated rows.

    The first occurrence of each row is kept and the original index labels
    are preserved; the input frame itself is left untouched.
    """
    return dataframe.drop_duplicates()

In [12]:
df_2 = drop_duplicated_values(df)
df_2

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.551600,-0.617801,-0.991390,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.119670,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,-1.593105,2.711941,-0.689256,4.626942,-0.924459,1.107641,1.991691,0.510632,-0.682920,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,-0.150189,0.915802,1.214756,-0.675143,1.164931,-0.711757,-0.025693,-1.221179,-1.545556,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,0.411614,0.063119,-0.183699,-0.510602,1.329284,0.140716,0.313502,0.395652,-0.577252,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,-1.933849,-0.962886,-1.042082,0.449624,1.962563,-0.608577,0.509928,1.113981,2.897849,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [13]:
df_2.duplicated().sum()

0

In [14]:
# Drop the "Time" column. errors="ignore" keeps this self-referential cell safe to
# re-run after the column is already gone (a plain drop would raise KeyError).
df_2 = df_2.drop(columns="Time", errors="ignore")

In [15]:
# Relabel the target: normal 0 -> 1, fraud 1 -> -1
# (presumably the +1 inlier / -1 outlier convention of scikit-learn outlier detectors — confirm).
# The new labels are derived from the untouched `df` (aligned on df_2's index) and assigned
# back, instead of calling replace(..., inplace=True) on the selected column: the in-place
# form turns every label into -1 when the cell is re-run, and it does not modify df_2 at all
# under pandas Copy-on-Write.
df_2["Class"] = df.loc[df_2.index, "Class"].map({0: 1, 1: -1})

In [16]:
print(len(df_2[df_2["Class"]==1]))
print(len(df_2[df_2["Class"]==-1]))

283253
473


In [19]:
def inbalanced_dataset_separator(dataframe, random_state=42):
    """
    Split an imbalanced, relabelled frame into a normal-only train set and a balanced test set.

    The frame must have a "Class" column that uses 1 for normal rows and -1 for fraud rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing the "Class" column.
    random_state : int or None, default 42
        Seed used to draw the normal rows that go into the test set. The default
        matches the seed used by the other sampling steps of this notebook, so a
        fresh run reproduces the same split; pass None for a new random draw.

    Returns
    -------
    train_data : pandas.DataFrame
        Normal rows (Class == 1) that were not drawn for the test set; keeps the
        original index labels.
    test_data : pandas.DataFrame
        As many sampled normal rows as there are fraud rows, followed by all fraud
        rows (50/50 ratio), with a fresh 0..n-1 index.
    """
    class_labels = dataframe["Class"]
    fraud = dataframe[class_labels == -1]
    normal = dataframe[class_labels == 1]

    # Seeded draw: without a seed the train/test membership changes on every run.
    normal_for_test = normal.sample(len(fraud), random_state=random_state)
    train_data = normal.drop(normal_for_test.index)
    test_data = pd.concat([normal_for_test, fraud], axis=0, ignore_index=True)

    return train_data, test_data


train, test = inbalanced_dataset_separator(df_2)
print("Class distribution for train dataset: ",train["Class"].value_counts())
print("Class distribution for test dataset: ",test["Class"].value_counts())

Class distribution for train dataset:  Class
1    282780
Name: count, dtype: int64
Class distribution for test dataset:  Class
 1    473
-1    473
Name: count, dtype: int64


- To limit the training time, I use only 12,000 rows of the train data (10,000 for training and 2,000 for validation).

In [20]:
mini_train = train.sample(12000, ignore_index=True, random_state=42)
mini_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      12000 non-null  float64
 1   V2      12000 non-null  float64
 2   V3      12000 non-null  float64
 3   V4      12000 non-null  float64
 4   V5      12000 non-null  float64
 5   V6      12000 non-null  float64
 6   V7      12000 non-null  float64
 7   V8      12000 non-null  float64
 8   V9      12000 non-null  float64
 9   V10     12000 non-null  float64
 10  V11     12000 non-null  float64
 11  V12     12000 non-null  float64
 12  V13     12000 non-null  float64
 13  V14     12000 non-null  float64
 14  V15     12000 non-null  float64
 15  V16     12000 non-null  float64
 16  V17     12000 non-null  float64
 17  V18     12000 non-null  float64
 18  V19     12000 non-null  float64
 19  V20     12000 non-null  float64
 20  V21     12000 non-null  float64
 21  V22     12000 non-null  float64
 22

In [21]:
mini_train.iloc[:,:-1]

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-0.708395,1.352380,-1.161618,-1.334094,1.431106,-1.260947,1.897063,-0.615065,0.319751,0.650948,-0.930312,-0.251960,-0.169452,0.257465,-0.036609,-0.712032,-0.639491,-0.366023,-0.128980,0.390074,0.070116,0.793768,-0.230717,0.623117,-0.120701,0.050731,0.531330,0.309823,30.27
1,-0.612656,-0.024648,1.992435,-0.230090,-0.394007,-0.436597,-0.471168,-0.026024,0.275014,0.032285,0.661783,-3.462034,0.769655,1.126601,-0.527588,-0.304064,2.244748,-1.664424,1.437391,0.032406,-0.044812,0.195288,-0.168174,0.361664,-0.172597,-0.158272,0.018760,0.087493,5.00
2,-0.929516,0.887188,1.964000,0.203124,0.326279,0.168336,0.986008,0.181211,-0.727805,-0.787465,1.176521,1.094447,-0.094719,0.070941,-1.422769,-0.443962,-0.147725,-0.477567,-0.658339,-0.076921,0.038254,0.159410,-0.274250,0.224808,0.641742,-0.428766,0.041691,0.051748,45.00
3,1.190386,0.012488,-0.131697,1.468835,1.746545,4.321814,-1.132499,1.133170,-0.135138,0.617365,-0.634402,-0.128592,0.064061,-0.173426,0.326777,1.111171,-1.073914,0.515597,-0.691107,-0.028026,0.050113,0.112778,-0.080734,1.012680,0.553668,0.122146,0.041282,0.027281,1.00
4,1.031955,-1.149606,0.359049,-0.770108,-1.043439,0.054815,-0.704699,0.153923,-0.820395,0.683561,1.288322,-0.316567,-0.612327,0.177602,0.592940,1.375145,0.129346,-1.140591,0.533503,0.330889,0.134111,-0.119456,-0.050851,-0.325272,0.098555,-0.392976,-0.005102,0.033088,166.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,-0.930809,1.219515,1.221082,1.464787,-0.856444,0.263613,0.508965,0.445400,-0.286895,-0.330945,-0.473809,0.730550,0.444116,0.017660,-0.014963,-1.168975,0.872649,-0.680015,0.545323,0.030725,0.120178,0.650664,0.004084,0.441749,-0.176453,-0.199485,0.243304,0.131103,107.00
11996,1.219641,0.585286,-0.468344,0.809083,0.037295,-1.258920,0.287634,-0.163628,-0.386160,-0.533462,2.013398,0.421080,-0.569673,-1.016487,0.292416,0.704630,0.880818,0.739075,-0.151065,-0.088904,-0.092979,-0.265359,-0.066308,0.413849,0.501237,0.340027,-0.036594,0.033989,0.76
11997,-0.558140,0.841809,1.280613,-0.064192,0.570490,0.846411,0.322914,0.566079,-0.271254,-0.674845,0.852254,-0.179693,-1.106053,-0.071435,2.079568,-0.926613,1.393080,-2.053627,-1.758542,-0.094821,-0.132919,-0.223064,0.175462,-0.717897,-0.554185,0.183793,0.308964,0.084784,7.23
11998,1.261716,-0.943690,0.932851,-0.762985,-1.463898,0.015203,-1.324983,0.331541,-0.355954,0.786528,1.255096,-0.936537,-1.813099,0.182133,0.888933,1.479504,0.199568,-0.556509,0.163954,-0.033567,0.445385,1.111946,-0.073041,0.006358,0.250942,-0.031340,0.034575,0.010134,24.99


In [22]:
from sklearn.model_selection import train_test_split
# Select the features by name instead of by position, so the split stays correct
# even if "Class" is not the last column of the frame.
X_train, X_valid, y_train, y_valid = train_test_split(
    mini_train.drop(columns="Class"), mini_train["Class"], test_size=2000, random_state=42
)
X_test, y_test = test.drop(columns="Class"), test["Class"]
print(X_train.shape, X_valid.shape, X_test.shape)
print(y_train.shape, y_valid.shape, y_test.shape)

(10000, 29) (2000, 29) (946, 29)
(10000,) (2000,) (946,)


### Scaling

In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to the validation and test splits.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_valid = scaler.transform(X_valid)
scaled_X_test = scaler.transform(X_test)

# Save processed data

In [25]:
# Directory (inside the current working directory) that receives the processed arrays.
# os.path.join picks the correct separator on every OS; a hard-coded "\\" only works on Windows.
pwd = os.getcwd()
save_file = os.path.join(pwd, "save_file")
# exist_ok=True makes the cell safe to re-run when the directory already exists.
os.makedirs(save_file, exist_ok=True)
    
def save_npy_files(path, **arrays):
    """Write each keyword argument to ``<path>/<keyword>.npy`` with ``np.save``.

    ``path`` is the target directory; every keyword name becomes the file
    name and its value is the array-like object that gets stored.
    """
    for key in arrays:
        destination = os.path.join(path, f"{key}.npy")
        np.save(destination, arrays[key])
        

In [26]:
save_npy_files(save_file, X_train=scaled_X_train, X_valid=scaled_X_valid, X_test=scaled_X_test, y_train=y_train, y_valid=y_valid, y_test=y_test)