# Machine Learning

## Preparation

### Loading the Dataset

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from a path relative to the notebook's working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column holding 0, 1, 2, ...
# -- presumably a row index that was written into the CSV rather than a real
# feature; confirm how the file was produced.
df = pd.read_csv("datasets/creditcard_balanced.csv")

# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,V4,V10,V11,V12,V14,V17,Class
0,1.378155,0.090794,-0.5516,-0.617801,-0.311169,0.207971,0
1,0.448154,-0.166974,1.612727,1.065235,-0.143772,-0.114805,0
2,0.37978,0.207643,0.624501,0.066084,-0.165946,1.109969,0
3,-0.863291,-0.054952,-0.226487,0.178228,-0.287924,-0.684093,0
4,0.403034,0.753074,-0.822843,0.538196,-1.11967,-0.237033,0


### Selecting Features and Target

In [3]:
# "Unnamed: 0" (visible in the df.head() preview) holds 0, 1, 2, ... and is
# the row index that was saved into the CSV, not a measured feature. Leaving it
# in x lets models key on row order, which can leak the label and inflate test
# scores, and it would force callers of the exported models to supply a
# meaningless column. It is therefore dropped together with the target.
# errors="ignore" keeps this cell working if the index column is absent.
x = df.drop(columns=["Class", "Unnamed: 0"], errors="ignore")

# Target: the binary "Class" label.
y = df["Class"]

### Splitting the Data into Train and Test Sets

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; random_state=42 pins the split so
# re-running the notebook yields the same train/test partition.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

### Classification Metrics

In [6]:
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    # recall_score,
    # precision_score,
    # f1_score,
    confusion_matrix,
    roc_auc_score,
)

# from sklearn.metrics._plot.confusion_matrix import (
#     confusion_matrix,
# )


def print_metrics(y_true, y_pred, y_score=None):
    """Print a standard set of classification metrics.

    Prints, in order: the per-class classification report, the overall
    accuracy, the ROC AUC and the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels (hard predictions, e.g. from ``predict``).
    y_score : array-like, optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(x)[:, 1]`` or ``decision_function`` output.
        When given, ROC AUC is computed from these scores, which is what the
        metric is designed for. When omitted, ROC AUC falls back to
        ``y_pred``; on hard labels that value reflects only a single
        decision threshold rather than the model's ranking quality.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Prefer real scores for the ROC curve; keep the label-based fallback so
    # existing two-argument calls behave exactly as before.
    auc_input = y_pred if y_score is None else y_score

    print("Classification Report: \n", classification_report(y_true, y_pred))
    print("Model Accuracy: ", accuracy_score(y_true, y_pred))
    print("ROC AUC: ", roc_auc_score(y_true, auc_input))
    print("Confusion matrix: \n", confusion_matrix(y_true, y_pred))

### Grid Search for Hyperparameter Tuning

In [7]:
from sklearn.model_selection import GridSearchCV


def grid_search(model, params, scoring="accuracy", cv=5):
    """Tune ``model`` with an exhaustive, cross-validated grid search.

    The search is fitted on the notebook-level ``x_train`` / ``y_train``
    split, the winning parameter combination is printed, and the
    corresponding estimator is returned.

    Parameters
    ----------
    model : estimator
        The scikit-learn estimator to tune.
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    scoring : str, default "accuracy"
        Metric used to rank parameter combinations.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring=scoring,
        cv=cv,
    )
    # Relies on the training split defined earlier in the notebook.
    search.fit(x_train, y_train)

    winning_params = search.best_params_
    print("Best params: ", winning_params)
    return search.best_estimator_

## Logistic Regression

### Model Training

In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default settings; only random_state is pinned.
model_lr = LogisticRegression(random_state=42)

# Fit on the training split. As the cell's last expression, the fitted
# estimator is also what the cell displays.
model_lr.fit(x_train, y_train)

### Model Prediction & Metrics

In [9]:
# Predict class labels for the held-out test split.
y_pred_lr = model_lr.predict(x_test)

# Report the metrics against the true test labels.
print_metrics(y_test, y_pred_lr)

Classification Report: 
               precision    recall  f1-score   support

           0       0.75      0.77      0.76     71199
           1       0.76      0.74      0.75     71179

    accuracy                           0.76    142378
   macro avg       0.76      0.76      0.76    142378
weighted avg       0.76      0.76      0.76    142378

Model Accuracy:  0.7562685246316144
ROC AUC:  0.7562664479474004
Confusion matrix: 
 [[54898 16301]
 [18401 52778]]


## KNN

### Model Training & Hyperparameter Tuning

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate grid: neighbourhood sizes 3/5/7/9 and the Minkowski power
# parameter p set to 1 or 2.
params = {"n_neighbors": [3, 5, 7, 9], "p": [1, 2]}

# grid_search prints the best combination and returns the best estimator.
# NOTE(review): the reported best n_neighbors=3 is the smallest value in the
# grid, so smaller values may be worth checking. No feature-scaling step is
# visible in this notebook before this distance-based model -- confirm the
# inputs were scaled upstream.
model_knn = grid_search(KNeighborsClassifier(), params)

Best params:  {'n_neighbors': 3, 'p': 2}


### Model Prediction & Metrics

In [11]:
# Predict class labels for the held-out test split with the tuned KNN model.
y_pred_knn = model_knn.predict(x_test)

# Report the metrics against the true test labels.
print_metrics(y_test, y_pred_knn)

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     71199
           1       1.00      1.00      1.00     71179

    accuracy                           1.00    142378
   macro avg       1.00      1.00      1.00    142378
weighted avg       1.00      1.00      1.00    142378

Model Accuracy:  0.9981949458483754
ROC AUC:  0.9981951993707776
Confusion matrix: 
 [[70942   257]
 [    0 71179]]


## Decision Tree Classifier

### Model Training & Hyperparameter Tuning

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Candidate grid: both split criteria and tree depths 3 through 10.
params = {"criterion": ["entropy", "gini"], "max_depth": range(3, 11)}

# grid_search prints the best combination and returns the best estimator.
# NOTE(review): the reported best max_depth=10 is the largest depth tried, so
# the optimum may lie beyond this grid.
model_dtc = grid_search(DecisionTreeClassifier(random_state=42), params)

Best params:  {'criterion': 'gini', 'max_depth': 10}


### Model Prediction & Metrics

In [13]:
# Predict class labels for the held-out test split with the tuned decision tree.
y_pred_dtc = model_dtc.predict(x_test)

# Report the metrics against the true test labels.
print_metrics(y_test, y_pred_dtc)

Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.91      0.95     71199
           1       0.92      0.99      0.95     71179

    accuracy                           0.95    142378
   macro avg       0.95      0.95      0.95    142378
weighted avg       0.95      0.95      0.95    142378

Model Accuracy:  0.9516568570987091
ROC AUC:  0.9516623663453427
Confusion matrix: 
 [[64965  6234]
 [  649 70530]]


## Random Forest

### Model Training & Hyperparameter Tuning

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Candidate grid: forest sizes of 370, 375 and 380 trees.
# NOTE(review): the grid is very narrow and the reported best value (380) is
# its upper edge -- consider a wider, coarser range.
params = {"n_estimators": [370, 375, 380]}

# grid_search prints the best combination and returns the best estimator.
model_rfc = grid_search(RandomForestClassifier(random_state=42), params)

Best params:  {'n_estimators': 380}


### Model Prediction & Metrics

In [15]:
# Predict class labels for the held-out test split with the tuned random forest.
y_pred_rfc = model_rfc.predict(x_test)

# Report the metrics against the true test labels.
print_metrics(y_test, y_pred_rfc)

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     71199
           1       1.00      1.00      1.00     71179

    accuracy                           1.00    142378
   macro avg       1.00      1.00      1.00    142378
weighted avg       1.00      1.00      1.00    142378

Model Accuracy:  0.9997190577195915
ROC AUC:  0.9997190971783311
Confusion matrix: 
 [[71159    40]
 [    0 71179]]


# Exporting Models

In [4]:
import pickle
import joblib

### Logistic Regression

In [6]:
# Serialize the logistic regression model with pickle (binary write mode).
# NOTE(review): assumes the models/ directory already exists -- open() does
# not create missing directories.
with open("models/logistic_regression_model.pkl", "wb") as file:
    pickle.dump(model_lr, file)

In [7]:
# Save the same model a second time in joblib format; the returned list of
# written filenames is what the cell displays.
joblib.dump(model_lr, "models/logistic_regression_model.joblib")

['models/logistic_regression_model.joblib']

### KNN

In [8]:
# Serialize the tuned KNN model with pickle (binary write mode).
with open("models/knn_model.pkl", "wb") as file:
    pickle.dump(model_knn, file)

In [9]:
# Save the same KNN model in joblib format; returns the list of written filenames.
joblib.dump(model_knn, "models/knn_model.joblib")

['models/knn_model.joblib']

### Decision Tree Classifier

In [10]:
# Serialize the tuned decision tree model with pickle (binary write mode).
with open("models/dtc_model.pkl", "wb") as file:
    pickle.dump(model_dtc, file)

In [11]:
# Save the same decision tree model in joblib format; returns the list of written filenames.
joblib.dump(model_dtc, "models/dtc_model.joblib")

['models/dtc_model.joblib']

### Random Forest

In [12]:
# Serialize the tuned random forest model with pickle (binary write mode).
with open("models/rfc_model.pkl", "wb") as file:
    pickle.dump(model_rfc, file)

In [13]:
# Save the same random forest model in joblib format; returns the list of written filenames.
joblib.dump(model_rfc, "models/rfc_model.joblib")

['models/rfc_model.joblib']