# model training juypter notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys

In [2]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.20.2-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.20.2 (from mlflow)
  Downloading mlflow_skinny-2.20.2-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.20.2->mlflow)
  Downloading databricks_sdk-0.44.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Colle

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv2D, Flatten

class ModelTrainer:
    def __init__(self, df, target_column, test_size=0.2, random_state=42):
        """
        Initialize the ModelTrainer with dataset and parameters.

        Parameters:
        - df: pd.DataFrame - The input dataset containing features and the target column.
        - target_column: str - The name of the target column.
        - test_size: float - Proportion of the dataset to include in the test split.
        - random_state: int - Seed for random number generator.
        """
        self.df = df
        self.target_column = target_column
        self.test_size = test_size
        self.random_state = random_state
        self.X_train, self.X_test, self.y_train, self.y_test = self.prepare_data()

    def prepare_data(self):
        """Prepare the data by splitting it into training and testing sets."""
        X = self.df.drop(columns=[self.target_column])
        y = self.df[self.target_column]
        return train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)

    def train_logistic_regression(self):
        """Train and evaluate Logistic Regression model."""
        model = LogisticRegression()
        return self.train_and_evaluate(model, "Logistic Regression")

    def train_decision_tree(self):
        """Train and evaluate Decision Tree model."""
        model = DecisionTreeClassifier()
        return self.train_and_evaluate(model, "Decision Tree")

    def train_random_forest(self):
        """Train and evaluate Random Forest model."""
        model = RandomForestClassifier()
        return self.train_and_evaluate(model, "Random Forest")

    def train_gradient_boosting(self):
        """Train and evaluate Gradient Boosting model."""
        model = GradientBoostingClassifier()
        return self.train_and_evaluate(model, "Gradient Boosting")

    def train_mlp(self):
        """Train and evaluate Multi-Layer Perceptron (MLP) model."""
        mlp_model = Sequential([
            Dense(64, activation='relu', input_shape=(self.X_train.shape[1],)),
            Dropout(0.5),
            Dense(32, activation='relu'),
            Dense(1, activation='sigmoid')  # Change activation for multi-class
        ])

        mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        with mlflow.start_run():
            mlp_model.fit(self.X_train, self.y_train, epochs=10, batch_size=32, validation_split=0.1, verbose=1)
            mlp_loss, mlp_accuracy = mlp_model.evaluate(self.X_test, self.y_test, verbose=0)
            mlflow.log_param("model_name", "Multi-Layer Perceptron")
            mlflow.log_metric("accuracy", mlp_accuracy)
            print(f"Multi-Layer Perceptron Accuracy: {mlp_accuracy:.4f}")
            return mlp_accuracy

    def train_and_evaluate(self, model, model_name):
        """Train the model and evaluate its performance."""
        with mlflow.start_run():
            model.fit(self.X_train, self.y_train)
            y_pred = model.predict(self.X_test)
            accuracy = accuracy_score(self.y_test, y_pred)
            mlflow.log_param("model_name", model_name)
            mlflow.log_metric("accuracy", accuracy)
            print(f"{model_name} Accuracy: {accuracy:.4f}")
            return accuracy



In [6]:
df_fraud = pd.read_csv('/content/fraud_data_final.csv')

In [7]:
df_fraud.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,age,ip_address,class,ip_address_int,transaction_velocity_hours,hour_of_day,day_of_week,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,-0.160204,QVPSPJUOCKZAR,0.679914,43.173.1.96,0,-1.13688,-0.136057,-1.377455,0.99102,False,True,False,False,False,False,True
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,-1.142592,EOGFQPIZPYXFZ,2.304476,20.225.83.219,0,-1.443207,-1.571877,-1.522122,-1.501259,False,False,False,False,False,False,False
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,-1.197169,YSSKYOSJHPPLJ,2.304476,156.64.132.28,1,0.375916,-1.577617,0.937208,-0.005891,False,True,False,False,True,False,True
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,0.385567,ATGTXKYKUDUQN,0.911994,228.234.6.235,0,1.352348,-1.420213,0.213876,-1.501259,False,True,False,False,False,True,True
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,0.112681,NAUITBZFJKHWW,1.376155,24.197.75.141,0,-1.390927,-0.182509,0.937208,-0.504347,False,False,False,False,False,True,True


In [8]:
df_merged = pd.read_csv('/content/fraud_ip_to_country_merged_.csv')

In [9]:
df_merged.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_address_int,lower_bound_ip_address,upper_bound_ip_address,country,lower_bound_ip_address_int,upper_bound_ip_address_int
0,247547,2015-06-28 03:00:34,2015-08-09 03:57:29,47,KIXYSVCHIPQBR,SEO,Safari,F,30,1.0.6.112,0,16778864,1.0.4.0,1.0.7.255,Australia,16778240.0,16779263.0
1,220737,2015-01-28 14:21:11,2015-02-11 20:28:28,15,PKYOWQKWGJNJI,SEO,Chrome,F,34,1.0.253.61,0,16842045,1.0.128.0,1.0.255.255,Thailand,16809984.0,16842751.0
2,390400,2015-03-19 20:49:09,2015-04-11 23:41:23,44,LVCSXLISZHVUO,Ads,IE,M,29,1.1.3.136,0,16843656,1.1.2.0,1.1.3.255,China,16843264.0,16843775.0
3,69592,2015-02-24 06:11:57,2015-05-23 16:40:14,55,UHAUHNXXUADJE,Direct,Chrome,F,30,1.2.118.236,0,16938732,1.2.64.0,1.2.127.255,China,16924672.0,16941055.0
4,174987,2015-07-07 12:58:11,2015-11-03 04:04:30,51,XPGPMOHIDRMGE,SEO,Chrome,F,37,1.2.248.208,0,16971984,1.2.128.0,1.2.255.255,Thailand,16941056.0,16973823.0


In [10]:
df_for_model = df_fraud.merge(df_merged[['user_id', 'country']], on='user_id', how='left')
df_for_model.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,age,ip_address,class,ip_address_int,transaction_velocity_hours,hour_of_day,day_of_week,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M,country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,-0.160204,QVPSPJUOCKZAR,0.679914,43.173.1.96,0,-1.13688,-0.136057,-1.377455,0.99102,False,True,False,False,False,False,True,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,-1.142592,EOGFQPIZPYXFZ,2.304476,20.225.83.219,0,-1.443207,-1.571877,-1.522122,-1.501259,False,False,False,False,False,False,False,United States
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,-1.197169,YSSKYOSJHPPLJ,2.304476,156.64.132.28,1,0.375916,-1.577617,0.937208,-0.005891,False,True,False,False,True,False,True,United States
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,0.385567,ATGTXKYKUDUQN,0.911994,228.234.6.235,0,1.352348,-1.420213,0.213876,-1.501259,False,True,False,False,False,True,True,
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,0.112681,NAUITBZFJKHWW,1.376155,24.197.75.141,0,-1.390927,-0.182509,0.937208,-0.504347,False,False,False,False,False,True,True,United States


In [11]:
df_for_model.drop(['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address'], axis=1, inplace=True)

In [12]:
df_for_model['country'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_for_model['country'].fillna('Unknown', inplace=True)


In [13]:
# Calculate the frequency of each country
country_freq = df_for_model['country'].value_counts()

# Map the frequencies to the country column
df_for_model['country_freq'] = df_for_model['country'].map(country_freq)

df_for_model.head()

Unnamed: 0,purchase_value,age,class,ip_address_int,transaction_velocity_hours,hour_of_day,day_of_week,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M,country,country_freq
0,-0.160204,0.679914,0,-1.13688,-0.136057,-1.377455,0.99102,False,True,False,False,False,False,True,Japan,7306
1,-1.142592,2.304476,0,-1.443207,-1.571877,-1.522122,-1.501259,False,False,False,False,False,False,False,United States,58049
2,-1.197169,2.304476,1,0.375916,-1.577617,0.937208,-0.005891,False,True,False,False,True,False,True,United States,58049
3,0.385567,0.911994,0,1.352348,-1.420213,0.213876,-1.501259,False,True,False,False,False,True,True,Unknown,21966
4,0.112681,1.376155,0,-1.390927,-0.182509,0.937208,-0.504347,False,False,False,False,False,True,True,United States,58049


In [14]:
df_for_model.drop('country', axis=1, inplace=True)

In [15]:
trainer = ModelTrainer(df_for_model, target_column='class')

In [16]:
results = {
     "Logistic Regression": trainer.train_logistic_regression(),
     "Decision Tree": trainer.train_decision_tree(),
     "Random Forest": trainer.train_random_forest(),
     "Gradient Boosting": trainer.train_gradient_boosting(),
     "Multi-Layer Perceptron": trainer.train_mlp()
 }

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.9057
Decision Tree Accuracy: 0.9065
Random Forest Accuracy: 0.9564
Gradient Boosting Accuracy: 0.9564


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m3400/3400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - accuracy: 0.8280 - loss: 139.0663 - val_accuracy: 0.9062 - val_loss: 0.4590
Epoch 2/10
[1m3400/3400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3ms/step - accuracy: 0.8888 - loss: 0.6362 - val_accuracy: 0.9062 - val_loss: 0.3106
Epoch 3/10
[1m3400/3400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9070 - loss: 0.3122 - val_accuracy: 0.9062 - val_loss: 0.3111
Epoch 4/10
[1m3400/3400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.9060 - loss: 0.3162 - val_accuracy: 0.9062 - val_loss: 0.3106
Epoch 5/10
[1m3400/3400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.9075 - loss: 0.3104 - val_accuracy: 0.9062 - val_loss: 0.3104
Epoch 6/10
[1m3400/3400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9059 - loss: 0.3145 - val_accuracy: 0.9062 - val_loss: 0.3103
Epoch 7/10
