In [12]:
import xgboost as xgb
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import classification_report

import pandas as pd

# Machine Learning approach

## Read the dataset

In [3]:
train_dataset = pd.read_csv('data/train_dataset.csv')
test_dataset = pd.read_csv('data/test_dataset.csv')

In [17]:
data_columns = ["V"+str(i) for i in range(1,29)]+["Amount"]

label_column ="Class"

In [18]:
X_train = train_dataset[data_columns]
X_test  = test_dataset[data_columns]

y_train = train_dataset[label_column]
y_test  = test_dataset[label_column]

## Simple Logistic + GridCV

In [19]:
lr_model = LogisticRegression()
params = {
    'tol':[1e-4, 1e-5],
    'C':[1e-2,1e-1,1]
}
lr_gridcv_model = GridSearchCV(lr_model, params)

In [21]:
lr_gridcv_model.fit(X_train,y_train)

In [22]:
lr_gridcv_model.best_estimator_

In [24]:
y_pred = lr_gridcv_model.predict(X_test)
report = classification_report(y_pred,y_test)
print(report)

              precision    recall  f1-score   support

         0.0       0.98      0.95      0.97     58179
         1.0       0.95      0.98      0.96     55547

    accuracy                           0.97    113726
   macro avg       0.97      0.97      0.97    113726
weighted avg       0.97      0.97      0.97    113726



## PCA + Logistic + GridCV
If dataset have more dimension :) . Just imagine this dataset is very large

In [49]:
pca = PCA(n_components=20)
pca.fit(X_train)

In [50]:
pca_train = pca.transform(X_train)
pca_test  = pca.transform(X_test)

In [51]:
lr_model = LogisticRegression()
params = {
    'tol':[1e-4, 1e-5],
    'C':[1e-2,1e-1,1]
}
lr_pca_gridcv_model = GridSearchCV(lr_model, params)
lr_pca_gridcv_model.fit(pca_train,y_train)

In [52]:
y_pred = lr_pca_gridcv_model.predict(pca_test)
report = classification_report(y_pred,y_test)
print(report)

              precision    recall  f1-score   support

         0.0       0.98      0.95      0.96     58633
         1.0       0.94      0.98      0.96     55093

    accuracy                           0.96    113726
   macro avg       0.96      0.96      0.96    113726
weighted avg       0.96      0.96      0.96    113726



If the dataset is larger, this appoarch might be very good.

## XGBoost or LightGBM
These model require lots of computation. Try this if your PC is good.

### XGBoost

In [None]:
import torch
device = 'cuda' if torch.cuda.is_avaliable() else 'cpu'

In [54]:
xgb_core = xgb.XGBClassifier(device = device)
params = {
    'n_estimators':[100,200,300],
    'learning_rate':[0.01,0.05,0.1,0.2],
    'max_depth':[4,5,6]
}
xgb_class = RandomizedSearchCV(xgb_core,param_grid=params)

In [None]:
xgb_class.fit(X_train,y_train)

In [None]:
xgb_class.best_estimator_

In [None]:
y_pred = xgb_class.predict(X_test)
report = classification_report(y_pred,y_test)
print(report)

### LightGBM

In [None]:
lightgbm = lgb.LGBMClassifier()
params = {
    'n_estimators':[100,200,300],
    'learning_rate':[0.01,0.05,0.1,0.2],
    'max_depth':[4,5,6],
    'num_leaves':[31,48]
}
lightgbm_class = RandomizedSearchCV(lightgbm,params)

In [None]:
lightgbm_class.best_estimator_

In [None]:
y_pred = lightgbm_class.predict(X_test)
report = classification_report(y_pred,y_test)
print(report)