# Task 2 - Model Building and Training 


**Loading the datasets for modeling**




In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys
# Make the project's 'scripts' directory importable. The membership check keeps
# this cell idempotent: re-running it does not append a duplicate sys.path entry.
scripts_dir = os.path.abspath(os.path.join('..', 'scripts'))
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

# Set max rows and columns to display
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

# Project-local helpers (expected to live in scripts/, hence the path setup above)
from logger import SetupLogger
from load_data import LoadData

# Logger handed to the data loaders below, configured with the notebooks log file
logger = SetupLogger(log_file='../logs/notebooks.log').get_logger()


Load the datasets

In [2]:
# One LoadData reader per CSV, both reporting through the shared notebook logger
load_fraud = LoadData('../data/processed_fraud_data.csv', logger=logger)
load_credit = LoadData('../data/creditcard.csv', logger=logger)

# Read the fraud table and key it by user_id, then read the credit-card table
fraud_data = load_fraud.load_dataset()
fraud_data = fraud_data.set_index('user_id')
credit_data = load_credit.load_dataset()



In [3]:
# Preview the first five rows of the credit-card table
credit_data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Preview the first five rows of the e-commerce fraud table (indexed by user_id)
fraud_data.head(n=5)

Unnamed: 0_level_0,purchase_value,age,class,purchase_delay,hour_of_day,day_of_week,fraud_rate,user_transaction_frequency,device_transaction_frequency,user_transaction_velocity,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
247547,0.549607,-0.363124,0,-0.4138,-1.231124,1.487911,-0.233869,0.0,-0.259874,-0.232945,0.0,1.0,0.0,0.0,0.0,1.0,0.0
220737,-1.197335,0.101168,0,-1.180852,1.229002,-0.505034,-1.259767,0.0,-0.259874,-0.232943,0.0,1.0,0.0,0.0,0.0,0.0,0.0
390400,0.385831,-0.479197,0,-0.936126,1.663142,0.989675,-0.354204,0.0,0.116936,-0.232944,0.0,0.0,0.0,1.0,0.0,0.0,1.0
69592,0.986342,-0.363124,0,0.867086,0.650149,0.989675,-0.354204,0.0,-0.259874,-0.232946,1.0,0.0,0.0,0.0,0.0,0.0,0.0
174987,0.767974,0.449387,0,1.700633,-1.086411,-1.00327,-1.259767,0.0,-0.259874,-0.232946,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# (rows, columns) of the credit-card table followed by the fraud table
(credit_data.shape, fraud_data.shape)

((284807, 31), (129146, 17))

In [6]:
# Per-column count of missing values: credit-card table first, then the fraud table
for frame in (credit_data, fraud_data):
    print(frame.isna().sum())

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
purchase_value                  0
age                             0
class                           0
purchase_delay                  0
hour_of_day                     0
day_of_week                     0
fraud_rate                      0
user_transaction_frequency      0
device_transaction_frequency    0
user_transaction_velocity       0
source_Direct                   0
source_SEO                      0
browser_FireFox                 0
browser_IE                      0
browser_Opera                   0
browser_Safari                  0
sex_M                           0
dtype: int64


**Data Preparation:**

- Feature and Target Separation [‘Class’ (creditcard), ‘class’ (Fraud_Data)]
- Train-Test Split 

For creditcard dataset (target column 'Class'):

In [7]:
from data_preparation import DataPreparation

# Wrap the credit-card frame loaded above (`credit_data`); 'Class' is its target column
_creditcard = DataPreparation(credit_data, target_column='Class')
# Split with test_size=0.2 and a fixed random_state so the split is repeatable
_creditcard.train_test_split(random_state=42, test_size=0.2)

# Unpack the four train/test pieces produced by the split
(X_train_cc, X_test_cc,
 y_train_cc, y_test_cc) = _creditcard.get_train_test_data()


Data split into training and testing sets successfully.


For Fraud_Data dataset (target column 'class'):

In [8]:
# Wrap the fraud frame loaded above (`fraud_data`); 'class' is its target column
_fraud = DataPreparation(fraud_data, target_column='class')
# Split with test_size=0.2 and a fixed random_state so the split is repeatable
_fraud.train_test_split(random_state=42, test_size=0.2)

# Unpack the four train/test pieces produced by the split
(X_train_fd, X_test_fd,
 y_train_fd, y_test_fd) = _fraud.get_train_test_data()


Data split into training and testing sets successfully.


### Model Selection
- Import the ModelPipeline class from model_pipeline
- Train multiple models
- Tune hyperparameters
- Evaluate the models
- Compare the models

In [9]:
import os

# Disable CUDA by hiding every device; set before model_pipeline is imported
os.environ.update({'CUDA_VISIBLE_DEVICES': '-1'})

# Bring in the pipeline class used for training and evaluation below
from model_pipeline import ModelPipeline

2024-10-21 14:37:02.631206: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-21 14:37:05.253776: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-21 14:37:06.891666: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-21 14:37:08.810751: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-21 14:37:09.184020: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


**Train and Evaluate the models on the e-commerce fraud dataset**


In [10]:
# Build the pipeline from the e-commerce fraud train/test split
model_pipeline = ModelPipeline(X_train_fd, X_test_fd, y_train_fd, y_test_fd)

# NOTE(review): add_models() is left disabled here, yet the credit-card run
# further down calls it explicitly — confirm which form is intended.
# model_pipeline.add_models()
# Train and evaluate, logging with MLflow; yields the best model and its name
best_model, best_model_name = model_pipeline.train_and_evaluate()
# Save the best model, passing 'fraud' as the dataset tag
model_pipeline.save_best_models(best_model, best_model_name, 'fraud')

Tuning hyperparameters for Random Forest...
Random Forest best parameters: {'classifier__max_depth': 5, 'classifier__n_estimators': 100}
Tuning hyperparameters for Gradient Boosting...
Gradient Boosting best parameters: {'classifier__learning_rate': 0.01, 'classifier__n_estimators': 100}
Random Forest took 7.05 seconds to train


Successfully registered model 'random_forest'.
Created version '1' of model 'random_forest'.


Random Forest model trained and logged with MLflow
Gradient Boosting took 20.44 seconds to train


Successfully registered model 'gradient_boosting'.
Created version '1' of model 'gradient_boosting'.


Gradient Boosting model trained and logged with MLflow
[1m808/808[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step
LSTM took 148.67 seconds to train


Successfully registered model 'lstm'.
Created version '1' of model 'lstm'.


LSTM model trained and logged with MLflow
[1m808/808[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step




CNN took 29.64 seconds to train


Successfully registered model 'cnn'.
Created version '1' of model 'cnn'.


CNN model trained and logged with MLflow
Random Forest best model saved.


In [11]:
# Fetch the evaluation results the pipeline collected for the fraud run
(results_fraud, y_probs_fraud) = model_pipeline.get_results()


In [12]:
# One row per model, one column per metric
pd.DataFrame(results_fraud).transpose()

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Random Forest,0.956988,1.0,0.539195,0.70062,0.837107
Gradient Boosting,0.956988,1.0,0.539195,0.70062,0.83466
LSTM,0.957027,1.0,0.53961,0.70097,0.779031
CNN,0.956988,1.0,0.539195,0.70062,0.83315


**Train and Evaluate the models on the creditcard dataset**


In [13]:
# Build the pipeline from the credit-card train/test split.
# NOTE(review): this rebinds `model_pipeline`, replacing the fraud pipeline object;
# any later model_pipeline.get_results() call returns whatever this run produced.
model_pipeline = ModelPipeline(X_train_cc, X_test_cc, y_train_cc, y_test_cc)

# Register the models explicitly (the fraud run above leaves this call disabled —
# NOTE(review): confirm whether it is required)
model_pipeline.add_models()
# Train and evaluate, logging with MLflow; yields the best model and its name
best_model, best_model_name = model_pipeline.train_and_evaluate()
# Save the best model, passing 'creditcard' as the dataset tag
model_pipeline.save_best_models(best_model, best_model_name, 'creditcard')

Tuning hyperparameters for Random Forest...
Random Forest best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
Tuning hyperparameters for Gradient Boosting...
Gradient Boosting best parameters: {'classifier__learning_rate': 0.01, 'classifier__n_estimators': 100}
Random Forest took 317.20 seconds to train


Registered model 'random_forest' already exists. Creating a new version of this model...
Created version '2' of model 'random_forest'.


Random Forest model trained and logged with MLflow
Gradient Boosting took 519.13 seconds to train


Registered model 'gradient_boosting' already exists. Creating a new version of this model...
Created version '2' of model 'gradient_boosting'.


Gradient Boosting model trained and logged with MLflow


2024-10-21 16:00:03.795937: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 27341400 exceeds 10% of free system memory.


[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step




LSTM took 494.20 seconds to train


Registered model 'lstm' already exists. Creating a new version of this model...
Created version '2' of model 'lstm'.


LSTM model trained and logged with MLflow
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step




CNN took 65.73 seconds to train


Registered model 'cnn' already exists. Creating a new version of this model...
Created version '2' of model 'cnn'.


CNN model trained and logged with MLflow
LSTM best model saved.


In [14]:
# Fetch the evaluation results the pipeline collected for the credit-card run
results_creditcard, y_probs_creditcard = model_pipeline.get_results()
# Keep the original misspelled name bound so any existing reference still resolves
y_probs_credicard = y_probs_creditcard

In [15]:
# One row per model, one column per metric
pd.DataFrame(results_creditcard).transpose()

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Random Forest,0.999579,0.974359,0.77551,0.863636,0.952292
Gradient Boosting,0.999403,0.984848,0.663265,0.792683,0.928369
LSTM,0.999386,0.831579,0.806122,0.818653,0.97345
CNN,0.999245,0.898551,0.632653,0.742515,0.88265


**Note:** Model training and evaluation have been completed, with all processes tracked using MLflow. Please refer to the document for screenshots showcasing the tracked models across different versions.