In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from helper_methods import produce_synthetic_data
from training_methods import k_fold_cross_validation
from logistic_regression import LogisticRegression
from support_vector_machine import SVM
from k_nearest_neighbours import KNN
from majority_vote import MajorityVote

In [None]:
# Load variables from a .env file (if any) and read the CSV whose location
# is given by the DATASET_PATH environment variable.
load_dotenv()
dataset_path = os.environ['DATASET_PATH']
df = pd.read_csv(dataset_path)
df

In [None]:
# Separate fraudulent (Class == 1) and non-fraudulent (Class == 0) transactions
# and extract their feature vectors as NumPy arrays.
class_labels = df['Class']
feature_columns = slice(2, 29)  # positional column range used as the feature vector

fraudulent_features = df.loc[class_labels == 1].iloc[:, feature_columns].to_numpy()
non_fraudulent_features = df.loc[class_labels == 0].iloc[:, feature_columns].to_numpy()

for feature_matrix in (fraudulent_features, non_fraudulent_features):
    print(feature_matrix.shape)

In [None]:
""" Produce synthetic data of fraudulent, so the models do not overfit on non-fraudulent 
transactions since they dominate the dataset by a large factor
"""
# Number of extra fraudulent rows needed to match the non-fraudulent row count.
difference_between_fraudulent_and_non = non_fraudulent_features.shape[0] - fraudulent_features.shape[0]
fraudulent_synthetic_features = produce_synthetic_data(
    difference_between_fraudulent_and_non, fraudulent_features)

# Row order in X: real fraudulent, synthetic fraudulent, then non-fraudulent.
X = np.concatenate(
    (fraudulent_features, fraudulent_synthetic_features, non_fraudulent_features), axis=0)

# Build the labels from the row counts that were actually stacked into X
# (1 = fraudulent, 0 = non-fraudulent). Sizing the block of ones from the
# non-fraudulent count is only correct if produce_synthetic_data returns
# exactly the requested number of rows; deriving it from the arrays keeps
# X and Y aligned regardless.
num_fraudulent_rows = fraudulent_features.shape[0] + len(fraudulent_synthetic_features)
num_non_fraudulent_rows = non_fraudulent_features.shape[0]
Y = np.concatenate((np.ones(num_fraudulent_rows), np.zeros(num_non_fraudulent_rows)), axis=0)

# Fail loudly rather than train on misaligned features and labels.
assert X.shape[0] == Y.shape[0], (
    f'X has {X.shape[0]} rows but Y has {Y.shape[0]} labels')

# Reorder the dataset randomly, applying the same permutation to X and Y.
randomise = np.random.permutation(X.shape[0])

X = X[randomise]
Y = Y[randomise]

In [None]:
# Initialise a logistic regression model, passing it the number of feature
# columns in X.
logistic_reg = LogisticRegression(X.shape[1])

In [None]:
# Perform k-fold cross validation on the logistic regression model.
# NOTE(review): the positional arguments (5, ..., 10, 0.001) are defined by
# training_methods.k_fold_cross_validation -- presumably number of folds,
# epochs and learning rate; confirm against that module.
k_fold_cross_validation(logistic_reg, 5, X, Y, 10, 0.001)

In [None]:
""" Replace class label 0 with -1, so it is compatible 
    with the support vector machine classifier
"""
# In-place relabel: every 0 entry of Y becomes -1 (Y itself is mutated).
np.putmask(Y, Y == 0, -1)

In [None]:
# Initialise a support vector machine model, passing it the number of feature
# columns in X.
# NOTE(review): the meaning of the second argument (1) is defined by
# support_vector_machine.SVM -- confirm there.
svm = SVM(X.shape[1], 1)

In [None]:
# Perform k-fold cross validation on the support vector machine model.
# Y holds -1/1 labels at this point (0 was replaced by -1 in the previous cell).
# NOTE(review): the trailing arguments (1, -1, True) are presumably the
# positive label, negative label and a flag -- confirm against
# training_methods.k_fold_cross_validation.
k_fold_cross_validation(svm, 5, X, Y, 10, 0.01, 1, -1, True)

In [None]:
""" Revert class label -1 back to 0, so it is compatible 
    with the rest of the models
"""
# In-place relabel: every -1 entry of Y goes back to 0 (Y itself is mutated).
np.putmask(Y, Y == -1, 0)

In [None]:
# Initialise a k nearest neighbours model.
# NOTE(review): the argument (5) is presumably the number of neighbours k --
# confirm against k_nearest_neighbours.KNN.
knn = KNN(5)

In [None]:
# Perform k-fold cross validation on the k nearest neighbours model.
# Only the first 10000 rows of the (already shuffled) X and Y are used here.
# NOTE(review): the trailing arguments (1, 0, True) are presumably the
# positive label, negative label and a flag -- confirm against
# training_methods.k_fold_cross_validation.
k_fold_cross_validation(knn, 5, X[:10000], Y[:10000], 10, 0.002, 1, 0, True)

In [None]:
""" Generating random models
"""
# Random search over majority-vote ensembles: each trial builds an ensemble of
# 1-5 randomly chosen models, each restricted to its own random feature
# subset, cross-validates it, and keeps the ensemble with the highest score.
best_model = None
best_accuracy = -np.inf

for i in range(100):
    num_of_models = np.random.randint(1, 6)
    models = []
    features_considered = []
    class_vals = []
    cap_training = []

    for j in range(num_of_models):
        # 0 -> logistic regression, 1 -> SVM, 2 -> k nearest neighbours
        model_idx = np.random.randint(0, 3)

        # Random 0/1 choice per feature column of X.
        random_feature_choice = np.random.randint(0, 2, X.shape[1])
        if not random_feature_choice.any():
            # An all-zero draw would select no columns and build a model with
            # zero input features; force one randomly chosen feature instead.
            random_feature_choice[np.random.randint(0, X.shape[1])] = 1
        features = random_feature_choice.astype(bool)
        # Plain Python int rather than a NumPy scalar for the constructors.
        num_features = int(random_feature_choice.sum())

        if model_idx == 0:
            model = LogisticRegression(num_features)
            labels, training_cap = (0, 1), X.shape[0]
        elif model_idx == 1:
            model = SVM(num_features, 1)
            # The SVM is given (-1, 1) as its class values instead of (0, 1).
            labels, training_cap = (-1, 1), X.shape[0]
        else:
            model = KNN(np.random.randint(3, 11))
            # KNN gets a training cap of 500 instead of the full row count.
            labels, training_cap = (0, 1), 500

        features_considered.append(features)
        class_vals.append(labels)
        cap_training.append(training_cap)
        models.append(model)

    majority_vote = MajorityVote(models, class_vals, cap_training, features_considered)
    acc = k_fold_cross_validation(majority_vote, 5, X, Y, 10, 0.01)

    # Track the best-scoring ensemble seen so far.
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = majority_vote

    print(f'Model #{i}:')
    print(f'Models given: {models}')
    print(f'Overall Accuracy: {acc}')

In [None]:
# Display the highest cross-validation score recorded during the random search.
best_accuracy

In [None]:
# Display the member models of the best-scoring ensemble.
# NOTE(review): `.models` is presumably set by MajorityVote -- confirm in majority_vote.
best_model.models