In [17]:
import pickle

import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.base import clone, BaseEstimator

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, precision_recall_curve, roc_curve, roc_auc_score

import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit

In [8]:
# Load the preprocessed feature matrix and label vector from their .npy files.
# np.load accepts a path directly and opens/closes the file itself.
X = np.load('../data_processed/X.npy')
y = np.load('../data_processed/y.npy')


In [18]:
def train_test_split(X, y, test_size=0.2, random_state=42):
    """Split ``X`` and ``y`` into a single stratified train/test partition.

    This notebook-local helper shares its name with
    ``sklearn.model_selection.train_test_split`` but is a different function:
    it always stratifies on ``y`` and returns exactly one split.

    Parameters
    ----------
    X : indexable
        Samples. Must support indexing with an integer index array
        (e.g. a ``numpy.ndarray``).
    y : indexable
        Labels aligned with ``X``; also used as the stratification target.
    test_size : float or int, default 0.2
        Forwarded to ``StratifiedShuffleSplit`` as ``test_size``.
    random_state : int or None, default 42
        Forwarded to ``StratifiedShuffleSplit`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )

    # n_splits=1, so only the first (train, test) index pair is needed.
    train_index, test_index = next(splitter.split(X, y))

    return X[train_index], X[test_index], y[train_index], y[test_index]

# Build the hold-out split with the notebook-local helper defined above
# (it shares its name with sklearn's train_test_split but is not that function).
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
# Linear classifier fitted by stochastic gradient descent; random_state is fixed
# for reproducibility.
sgd_clf = SGDClassifier(random_state=42)

In [13]:
def my_cross_val_score(
    clf,
    X: np.ndarray,
    y: np.ndarray,
    n_splits: int = 3,
    random_state: int = 42,
) -> np.ndarray:
    """Hand-rolled stratified k-fold cross-validation reporting accuracy.

    For every fold a fresh clone of ``clf`` is fitted on the training part and
    scored on the held-out part. Each fold's accuracy is printed (one value per
    line) and all of them are returned.

    Parameters
    ----------
    clf : estimator
        Object accepted by ``sklearn.base.clone`` that provides ``fit`` and
        ``predict``. It is never fitted itself; only clones are.
    X : numpy.ndarray
        Samples; indexed with integer index arrays.
    y : numpy.ndarray
        Labels aligned with ``X``; also used for stratification.
    n_splits : int, default 3
        Number of folds passed to ``StratifiedKFold``.
    random_state : int, default 42
        Seed for the fold shuffling.

    Returns
    -------
    numpy.ndarray
        Accuracy of each fold, in fold order.
    """
    skfolds = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    fold_scores = []

    for train_index, test_index in skfolds.split(X, y):
        # Unfitted copy per fold so nothing learned on one fold carries over.
        fold_clf = clone(clf)
        fold_clf.fit(X[train_index], y[train_index])

        y_pred = fold_clf.predict(X[test_index])
        # Vectorised count; the builtin sum() would iterate the array in Python.
        n_correct = np.count_nonzero(y_pred == y[test_index])
        accuracy = n_correct / len(y_pred)

        print(accuracy)
        fold_scores.append(accuracy)

    return np.array(fold_scores)

In [14]:
# Per-fold accuracy from the hand-rolled cross-validation loop.
# NOTE(review): this runs on the full X / y rather than X_train / y_train, so rows
# held out as X_test take part in the evaluation — confirm this is intended.
my_cross_val_score(sgd_clf, X, y)

0.9914113277623027
0.9900701537191788
0.990457030847003


In [16]:
# Same evaluation through scikit-learn. With an integer cv and a classifier,
# scikit-learn uses StratifiedKFold without shuffling, whereas my_cross_val_score
# shuffles its folds, so the two sets of scores are not expected to match exactly.
# NOTE(review): also evaluated on the full X / y, not X_train / y_train — confirm.
cross_val_score(sgd_clf, X, y, cv=3, scoring='accuracy')

array([0.98718147, 0.98978644, 0.98883215])

In [19]:
# Out-of-fold predictions: every training sample is predicted by a model that was
# fitted on the other folds, not on that sample.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)

In [20]:
# Rows are true classes, columns are predicted classes (scikit-learn convention).
confusion_matrix(y_train, y_train_pred)

array([[80186,    93],
       [ 1262, 11511]])

# Precision, Recall, and F1 Score

In [21]:
# Precision = TP / (TP + FP): the share of predicted positives that are correct.
precision_score(y_train, y_train_pred)

0.9919855222337125

In [22]:
# Recall = TP / (TP + FN): the share of actual positives that were detected.
recall_score(y_train, y_train_pred)

0.9011978391920458

In [23]:
# F1 = harmonic mean of precision and recall.
f1_score(y_train, y_train_pred)

0.9444148172457645

# Using another model: Random Forest

In [24]:
# Initialize the RandomForestClassifier (seed fixed for reproducibility)
rf_clf = RandomForestClassifier(random_state=42)

# Fit rf_clf on the whole training set. This fitted instance is the one used by
# the later rf_clf.predict(...) cell; cross_val_predict below does not rely on it.
rf_clf.fit(X_train, y_train)

# Out-of-fold predictions for the training set: cross_val_predict fits its own
# clones of rf_clf on each of the 3 folds instead of using the model fitted above.
y_train_pred_rf = cross_val_predict(rf_clf, X_train, y_train, cv=3)

# Score the out-of-fold predictions against the true training labels
rf_confusion_matrix = confusion_matrix(y_train, y_train_pred_rf)
rf_precision = precision_score(y_train, y_train_pred_rf)
rf_recall = recall_score(y_train, y_train_pred_rf)
rf_f1 = f1_score(y_train, y_train_pred_rf)

In [27]:
# Print the random-forest metrics, one "label value" line per entry.
print("RandomForestClassifier Performance:")
rf_report = (
    ("Confusion Matrix:\n", rf_confusion_matrix),
    ("Precision:", rf_precision),
    ("Recall:", rf_recall),
    ("F1 Score:", rf_f1),
)
for metric_label, metric_value in rf_report:
    print(metric_label, metric_value)

RandomForestClassifier Performance:
Confusion Matrix:
 [[80227    52]
 [  362 12411]]
Precision: 0.9958276498435369
Recall: 0.9716589681359117
F1 Score: 0.9835948644793152


In [44]:
# Spot-check on one held-out sample. The sample is wrapped in a list so that
# predict receives a 2-D input containing a single row.
rf_clf.predict([X_test[0]])

array([False])