In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import mysql.connector
from sklearn.svm import SVC
from datetime import datetime
import matplotlib.pyplot as plt
from mysql.connector import Error
from sklearn.utils import shuffle
from urllib.parse import quote_plus
from sqlalchemy import create_engine
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score

In [2]:
# creation of connection and engine
def connection(host_name, user_name, user_password, dbname):
    """Open a MySQL connection through ``mysql.connector``.

    The password is handed to the driver exactly as received. URL-style
    percent-encoding (``quote_plus``) is only meaningful when credentials are
    embedded in a connection URL; applied here it would rewrite characters
    such as ``@``, ``/`` or a space and the value would no longer match the
    real password.

    Args:
        host_name: Host of the MySQL server.
        user_name: Account used to log in.
        user_password: Plain-text password of that account.
        dbname: Database selected for the session.

    Returns:
        The open connection, or ``None`` when the driver raised ``Error``
        (the error is printed, not re-raised).
    """
    db_connection = None
    try:
        db_connection = mysql.connector.connect(
            host=host_name,
            user=user_name,
            passwd=user_password,
            database=dbname,
        )
    except Error as e:
        print(f"The error {e} has occured")
    return db_connection

def engine(host_name, user_name, user_password, dbname, port):
    """Create a SQLAlchemy engine for a MySQL database using the PyMySQL driver.

    The user name and password are percent-encoded before being placed in the
    URL, so reserved characters (for example ``@``, ``:`` or ``/``) inside the
    credentials cannot be mistaken for URL delimiters.

    Args:
        host_name: Host of the MySQL server.
        user_name: Account used to log in.
        user_password: Plain-text password of that account.
        dbname: Database the engine connects to.
        port: TCP port of the MySQL server.

    Returns:
        The engine returned by ``create_engine``.
    """
    safe_user = quote_plus(user_name)
    safe_password = quote_plus(user_password)
    url = f'mysql+pymysql://{safe_user}:{safe_password}@{host_name}:{port}/{dbname}'
    return create_engine(url)

# extraction of transformed bank data
def _fetch_table(query):
    """Run ``query`` on the ``fraudulent_activities`` database and return the rows as a DataFrame.

    The cursor and the connection are always closed, even when the query
    fails, so repeated calls do not leave connections open.

    Args:
        query: SQL statement to execute.

    Returns:
        A DataFrame whose columns are named after the cursor description.

    Raises:
        ConnectionError: If ``connection`` returned ``None`` (it prints the
            driver error and returns ``None`` instead of raising).
    """
    # NOTE(review): credentials are hard-coded in the notebook; consider
    # reading them from the environment or a secrets store instead.
    conn = connection("localhost", "root", "Layaldbroot1997", "fraudulent_activities")
    if conn is None:
        raise ConnectionError("Could not connect to the fraudulent_activities database")
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(query)
            data = cursor.fetchall()
            column_names = [i[0] for i in cursor.description]
        finally:
            cursor.close()
    finally:
        conn.close()
    return pd.DataFrame(data, columns=column_names)


def extract_bank():
    """Return every row of ``fraudulent_activities.bank`` as a DataFrame."""
    return _fetch_table("SELECT * FROM fraudulent_activities.bank")

# extraction of transformed credit card data
def extract_credit_card():
    """Return every row of ``fraudulent_activities.credit_card`` as a DataFrame."""
    return _fetch_table("SELECT * FROM fraudulent_activities.credit_card")

In [3]:
def process_data(random_state=None):
    """Load the bank and credit-card tables and merge them into one shuffled DataFrame.

    Columns listed as unnecessary are dropped from both sources, the bank
    columns are renamed to the credit-card names, the bank frame is reordered
    to the credit-card column order, and the two frames are concatenated,
    shuffled and re-indexed from 0. ``dob`` is converted to datetime.

    Args:
        random_state: Seed forwarded to ``shuffle``. The default ``None``
            keeps the previous behaviour (a different order on every call);
            pass an integer to make the row order, and therefore any later
            train/test split, reproducible.

    Returns:
        The combined DataFrame.
    """
    dfb = extract_bank()
    dfcc = extract_credit_card()

    # drop unnecessary columns from bank data
    cols = ['TransactionID', 'cc_num', 'Location', 'CustomerID', 'gender', 'Address', 'trans_timestamp','Day_of_Week', 'c_zip', 'm_zip']
    new_dfb = dfb.drop(cols, axis = 1)

    # drop unnecessary columns from credit card data
    ccols = ['trans_date_trans_time', 'cc_num', 'first', 'last', 'gender', 'lat', 'long', 'city_pop', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'Day_of_Week', 'zip', 'm_zip']
    new_dfcc = dfcc.drop(ccols, axis = 1)

    # rename bank columns so both frames share the same names
    # ('c_zip' is not listed: it was already dropped above)
    new_dfb = new_dfb.rename(columns = {
        'Category':'category',
        'TransactionAmount':'amt',
        'MerchantName': 'merchant',
        'CustomerName': 'customer_name',
        'FraudIndicator': 'is_fraud',
        'c_street': 'street',
        'c_city':'city',
        'c_state': 'state',
    })

    # order the bank columns like the credit card ones, concatenate and shuffle
    new_dfb = new_dfb[new_dfcc.columns]
    df = pd.concat([new_dfb, new_dfcc])
    df = shuffle(df, random_state=random_state).reset_index(drop=True)
    df['dob'] = pd.to_datetime(df['dob'])
    return df

def bin_data(df):
    """Replace ``dob`` and ``amt`` with binned ``age_category`` / ``amt_category`` columns.

    Age is the number of full years between ``dob`` and the transaction date
    given by ``trans_year`` / ``trans_month`` / ``trans_day``. The input frame
    is left untouched; a new frame is returned.

    Args:
        df: DataFrame with ``dob``, ``trans_year``, ``trans_month``,
            ``trans_day`` and ``amt`` columns.

    Returns:
        A copy of ``df`` without ``dob`` and ``amt`` (and without any ``age``
        column), with ``age_category`` and ``amt_category`` added. Values
        outside the bin edges become NaN.
    """
    # work on a copy so the caller's frame does not gain helper columns
    binned = df.copy()

    # calculate the age at the time of the transaction (vectorised):
    # subtract one year when the birthday has not yet occurred that year
    dob = pd.to_datetime(binned['dob'])
    birth_month = dob.dt.month
    birthday_pending = (binned['trans_month'] < birth_month) | (
        (binned['trans_month'] == birth_month) & (binned['trans_day'] < dob.dt.day)
    )
    binned['age'] = binned['trans_year'] - dob.dt.year - birthday_pending.astype(int)

    # binning age; include_lowest keeps age 10 inside the '10-19' bin
    age_bins = [10, 19, 29, 39, 49, 59, 69, 79, float('inf')]
    age_labels = ['10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80+']
    binned['age_category'] = pd.cut(binned['age'], bins=age_bins, labels=age_labels, right=True, include_lowest=True)

    # binning amount
    amt_bins = [1, 500, 1000, 2000, 3000, 4000, 5000]
    amt_labels = ['1-500', '501-1000', '1001-2000', '2001-3000', '3001-4000', '4001-5000']
    binned['amt_category'] = pd.cut(binned['amt'], bins=amt_bins, labels=amt_labels, right=True, include_lowest=True)

    # drop binned attributes
    return binned.drop(['age', 'dob', 'amt'], axis=1)

In [4]:
def encode_data(df):
    """Label-encode the categorical columns of ``df`` in place.

    Each listed column is overwritten with the integer codes produced by
    ``LabelEncoder.fit_transform``; the encoder is refitted for every column.

    Args:
        df: DataFrame containing all of the columns listed below.

    Returns:
        The same DataFrame object, with the listed columns encoded.
    """
    categorical_columns = (
        'merchant',
        'category',
        'street',
        'city',
        'state',
        'job',
        'customer_name',
        'm_street',
        'm_city',
        'm_state',
        'age_category',
        'amt_category',
    )

    # a single encoder is reused; fit_transform refits it for each column
    encoder = LabelEncoder()
    for column in categorical_columns:
        df[column] = encoder.fit_transform(df[column])
    return df

In [5]:
def evaluate_classification_models():
    """Train several classifiers on the prepared data and collect their test metrics.

    The data is loaded, binned and encoded, split 80/20 into train and test
    sets and standardised with a scaler fitted on the training part only.

    Returns:
        A dict mapping each model's display name to a dict with accuracy,
        precision, recall, F1 score and ROC AUC computed on the test split.
    """
    # get the data
    df = process_data()

    # binning data
    df = bin_data(df)

    # encode categorical attributes
    df = encode_data(df)

    # split features from target labels
    x = df.drop(['is_fraud'], axis=1)
    y = df['is_fraud']
    # split the data into training and testing sets and scale it
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    # dictionary of classification models
    models = {
        "Decision Tree Classifier": DecisionTreeClassifier(),
        "Logistic regression classifier": LogisticRegression(solver='liblinear',max_iter=500),
        "Random Forest Classifier": RandomForestClassifier(),
        "Support Vector Machine (SVM)": SVC(),
        "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),
        "Gradient Boosting Classifier": GradientBoostingClassifier()
    }
    results = {}
    for model_name, model in models.items():
        # fit model to data
        model.fit(x_train_scaled, y_train)

        # predict using trained model
        y_pred = model.predict(x_test_scaled)

        # ROC AUC needs a continuous score, not the thresholded labels:
        # use the positive-class probability when the model exposes it,
        # otherwise (SVC without probability=True) the decision function
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(x_test_scaled)[:, 1]
        else:
            y_score = model.decision_function(x_test_scaled)

        # calculate the metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_score)
        results[model_name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC AUC (Receiver Operating Characteristic - Area Under the Curve)": roc_auc
        }

    return results

In [6]:
def best_model_tuning_and_evaluation():
    """Grid-search a random forest and report its performance on a held-out test split.

    The data is loaded, binned and encoded, split 80/20 and standardised with
    a scaler fitted on the training part. ``GridSearchCV`` (5-fold, F1
    scoring) selects the hyperparameters; the chosen parameters, the test
    metrics and the confusion matrix are printed, and the confusion matrix is
    plotted as a heatmap of fractions of all test samples.

    Returns:
        None. Results are printed and plotted.
    """
    # get the data
    df = process_data()

    # binning data
    df = bin_data(df)

    # encode categorical attributes
    df = encode_data(df)

    # split features from target labels
    x = df.drop(['is_fraud'], axis=1)
    y = df['is_fraud']
    model = RandomForestClassifier(random_state=42)

    # the range of hyperparameters to search
    param_grid = {
        'n_estimators': [50, 100, 150],  # nb of trees in the forest
        'max_depth': [None, 10, 20, 30],  # max depth of the trees
        'min_samples_split': [2, 5, 10],  # min  samples to split an internal node
        'min_samples_leaf': [1, 2, 4],  # min samples to be at a leaf node
    }
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)

    # split the data into training and testing sets and scale it
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    # Fit the grid search to the scaled training data
    grid_search.fit(x_train_scaled, y_train)

    # Get the best hyperparameters and corresponding model.
    # GridSearchCV (default refit=True) has already refitted best_estimator_
    # on the whole training set, so no further fit is needed.
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Make predictions on the testing data
    y_pred = best_model.predict(x_test_scaled)
    # positive-class probability: ROC AUC needs scores, not thresholded labels
    y_score = best_model.predict_proba(x_test_scaled)[:, 1]

    # Calculate and print various metrics to evaluate the best model's performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)
    cm = confusion_matrix(y_test, y_pred)
    # normalise by the actual number of test samples instead of a fixed count
    cm_normalized = cm.astype('float') / cm.sum()

    print("Best Hyperparameters:", best_params)
    print("Best Model Evaluation Metrics:")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("ROC AUC (Receiver Operating Characteristic - Area Under the Curve):", roc_auc)
    print("Confusion Matrix:")
    print(cm)

    plt.figure(figsize=(6,4))
    sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

In [8]:
def model_prediction(value):
    """Predict the fraud label of a single transaction record.

    The record is appended to the feature table so it is binned and
    label-encoded together with the training data, then separated again. A
    random forest with fixed hyperparameters is trained on the 80% training
    split of the standardised data and used to classify the record.

    Args:
        value: Mapping of feature name to value for one transaction, using
            the column names produced by ``process_data`` (without
            ``is_fraud``).

    Returns:
        The array returned by ``model.predict`` for the single record.
    """
    # get the data
    df = process_data()

    # split features from target attribute
    y = df['is_fraud']
    x = df.drop(['is_fraud'], axis=1)

    # add record to dataframe; align it to the feature columns so that extra
    # keys in `value` cannot introduce a new, all-NaN feature column
    df_value = pd.DataFrame([value]).reindex(columns=x.columns)
    new_df = pd.concat([x, df_value], ignore_index=True)

    # bin data
    new_df = bin_data(new_df)

    # encode data
    df_encoded = encode_data(new_df)

    # retrieve record after encoding it
    encoded_value = df_encoded.iloc[-1:]
    df_encoded = df_encoded.iloc[:-1]

    # initialize model by optimal parameters
    model = RandomForestClassifier(max_depth = 30, min_samples_leaf = 1, min_samples_split = 5, n_estimators = 100, random_state=42)

    # keep only the training part of the split; the test part is not needed
    # for a single prediction, so it is neither scaled nor stored
    x_train, _, y_train, _ = train_test_split(df_encoded, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    model.fit(x_train_scaled, y_train)

    # scale encoded value to be predicted
    encoded_value_scaled = scaler.transform(encoded_value)
    prediction = model.predict(encoded_value_scaled)
    return prediction