In [118]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [119]:
train = pd.read_csv(r'.\data\train.csv')
test = pd.read_csv(r'.\data\test.csv')

In [120]:
train.info()
train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219129 entries, 0 to 219128
Data columns (total 32 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      219129 non-null  int64  
 1   Time    219129 non-null  float64
 2   V1      219129 non-null  float64
 3   V2      219129 non-null  float64
 4   V3      219129 non-null  float64
 5   V4      219129 non-null  float64
 6   V5      219129 non-null  float64
 7   V6      219129 non-null  float64
 8   V7      219129 non-null  float64
 9   V8      219129 non-null  float64
 10  V9      219129 non-null  float64
 11  V10     219129 non-null  float64
 12  V11     219129 non-null  float64
 13  V12     219129 non-null  float64
 14  V13     219129 non-null  float64
 15  V14     219129 non-null  float64
 16  V15     219129 non-null  float64
 17  V16     219129 non-null  float64
 18  V17     219129 non-null  float64
 19  V18     219129 non-null  float64
 20  V19     219129 non-null  float64
 21  V20     21

Unnamed: 0,id,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,219129.0,219129.0,219129.0,219129.0,219129.0,219129.0,219129.0,219129.0,219129.0,219129.0,...,219129.0,219129.0,219129.0,219129.0,219129.0,219129.0,219129.0,219129.0,219129.0,219129.0
mean,109564.0,62377.415376,0.096008,0.048345,0.592102,0.069273,-0.161555,0.133688,-0.128224,0.149534,...,-0.031064,-0.050852,-0.050531,-0.002992,0.124005,0.009881,0.014034,0.017313,66.359803,0.00214
std,63257.237906,25620.348569,1.395425,1.159805,1.132884,1.253125,1.06953,1.202411,0.817207,0.716212,...,0.422777,0.597812,0.318175,0.5931,0.406741,0.473867,0.233355,0.164859,150.795017,0.046214
min,0.0,0.0,-29.807725,-44.247914,-19.722872,-5.26365,-37.591259,-25.65975,-31.179799,-28.903442,...,-14.689621,-8.748979,-11.958588,-2.836285,-3.958591,-1.858672,-9.234767,-4.55168,0.0,0.0
25%,54782.0,47933.0,-0.846135,-0.573728,-0.027154,-0.769256,-0.847346,-0.631835,-0.64673,-0.095948,...,-0.190418,-0.473099,-0.174478,-0.33254,-0.12608,-0.31833,-0.050983,-0.009512,5.99,0.0
50%,109564.0,63189.0,0.385913,0.046937,0.735895,0.064856,-0.229929,-0.087778,-0.09897,0.111219,...,-0.042858,-0.032856,-0.063307,0.038708,0.145934,-0.086388,0.015905,0.022163,21.9,0.0
75%,164346.0,77519.0,1.190661,0.814145,1.30611,0.919353,0.356856,0.482388,0.385567,0.390976,...,0.109187,0.35491,0.060221,0.394566,0.402926,0.253869,0.076814,0.066987,68.93,0.0
max,219128.0,120580.0,2.430494,16.068473,6.145578,12.547997,34.58126,16.233967,39.824099,18.270586,...,22.062945,6.163541,12.734391,4.572739,3.111624,3.402344,13.123618,23.263746,7475.0,1.0


In [121]:
print(len(train.loc[train['Class'] == 0]))
print(len(train.loc[train['Class'] == 1]))

for col in train.columns.unique():
    missing = sum(train[col].isna())
    if missing !=0:
        print(f"{col} - {missing}")

218660
469


In [122]:
features = []
for i in range(1,29):
    features.append(f"V{i}")

features = ['Time', 'Amount'] + features

In [123]:
X, Y = train[features], train[['Class']]
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.8)
scaler = StandardScaler()

for feat in features:
    X_train[feat] = scaler.fit_transform(X_train[feat].to_numpy().reshape(-1,1))
    X_val[feat] = scaler.fit_transform(X_val[feat].to_numpy().reshape(-1,1))
    test[feat] = scaler.fit_transform(test[feat].to_numpy().reshape(-1,1))



def doPCA(data, n_components):
    return PCA(n_components=n_components).fit_transform(data)


def doSMOTE(X,Y, random_state=0):
    smote = SMOTE(random_state=random_state)
    A,B = smote.fit_resample(X, Y)
    return (A,B)


In [124]:
for i in range(1, 28):
    x_train, y_train = X_train, Y_train
    x_val, y_val = X_val, Y_val

    x_train_pca = doPCA(x_train, n_components=i)
    x_val_pca = doPCA(x_val, n_components=i)

    x_t, y_t = doSMOTE(x_train_pca, y_train)
    x_v, y_v = doSMOTE(x_val_pca, y_val)

    dt = DecisionTreeClassifier()

    dt.fit(x_t, y_t)

    y_p = dt.predict(x_v)

    print(f"\nPCA: {i}")
    print(classification_report(y_v, y_p))


PCA: 1
              precision    recall  f1-score   support

           0       0.52      0.62      0.56     43733
           1       0.52      0.41      0.46     43733

    accuracy                           0.52     87466
   macro avg       0.52      0.52      0.51     87466
weighted avg       0.52      0.52      0.51     87466


PCA: 2
              precision    recall  f1-score   support

           0       0.51      0.87      0.65     43733
           1       0.57      0.17      0.26     43733

    accuracy                           0.52     87466
   macro avg       0.54      0.52      0.46     87466
weighted avg       0.54      0.52      0.46     87466


PCA: 3
              precision    recall  f1-score   support

           0       0.50      0.95      0.66     43733
           1       0.55      0.06      0.11     43733

    accuracy                           0.51     87466
   macro avg       0.53      0.51      0.39     87466
weighted avg       0.53      0.51      0.39     87

In [126]:
x_t = doPCA(X_train, n_components=18)

x_t, y_t = doSMOTE(x_t, Y_train)

x_ = doPCA(test[features], 18)

dt = DecisionTreeClassifier()

dt.fit(x_t, y_t)

y_p = dt.predict(x_)

In [138]:
y_pp = pd.DataFrame()
y_pp['id'] = test['id']
y_pp['Class'] = y_p
y_pp.to_csv('attempt1.csv', index=False)

print(len(y_pp))
print(sum(y_p))

146087
4688
