In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import mlflow

In [2]:
# Load the data from Fraud_Data.csv
fraud_data = pd.read_csv('../CleanedData/merged_Dataset.csv')

# Drop columns that are not important
fraud_data.drop(['user_id', 'signup_time', 'purchase_time', 'device_id', 'device_id_info', 'lower_bound_ip_address', 'upper_bound_ip_address', 'country'], axis=1, inplace=True)

# Encode categorical variables
fraud_data = pd.get_dummies(fraud_data, columns=['source', 'browser', 'sex'])


In [3]:
# Convert 'signup_purchase_duration' to numerical format
fraud_data['signup_purchase_duration'] = pd.to_timedelta(fraud_data['signup_purchase_duration']).dt.total_seconds()

# Separate features and target variable
X = fraud_data.drop('class', axis=1)
y = fraud_data['class']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
# Start MLflow experiment
mlflow.set_experiment("Fraud Detection")

2024/06/21 16:51:16 INFO mlflow.tracking.fluent: Experiment with name 'Fraud Detection' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/huawei/Desktop/Week8/Adey-Innovations-Inc/Task2/mlruns/772051588966488796', creation_time=1718977876647, experiment_id='772051588966488796', last_update_time=1718977876647, lifecycle_stage='active', name='Fraud Detection', tags={}>

In [5]:
# Decision Tree model training
with mlflow.start_run(run_name="Decision Tree"):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    # Predict on the test set
    y_pred = model.predict(X_test)
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    # Log parameters and metrics
    mlflow.log_params(model.get_params())
    mlflow.log_metric("accuracy", accuracy)
    # Log the trained model
    mlflow.sklearn.log_model(model, "DecisionTreeModel")
    accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9048252911813643




In [6]:
# EDA for class imbalance
class_counts = fraud_data['class'].value_counts()
fraud_percentage = class_counts[1] / (class_counts[0] + class_counts[1]) * 100
print("Percentage of fraud cases:", fraud_percentage)


Percentage of fraud cases: 9.367241941602828


In [7]:
# Feature importance plot
with mlflow.start_run(run_name="Feature Importance"):
    model = DecisionTreeClassifier()
    model.fit(X, y)
    feature_importance = model.feature_importances_
    feature_names = X.columns
    plt.figure(figsize=(10, 6))
    sns.barplot(x=feature_importance, y=feature_names)
    plt.title('Feature Importance')
    plt.xlabel('Importance')
    plt.ylabel('Features')
    plt.savefig("feature_importance.png")
    plt.close()
    # Log the feature importance plot
    mlflow.log_artifact("feature_importance.png")

In [8]:
# Random Forest model training
with mlflow.start_run(run_name="Random Forest"):
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    # Predict on the test set
    y_pred = model.predict(X_test)
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    # Log parameters and metrics
    mlflow.log_params(model.get_params())
    mlflow.log_metric("accuracy", accuracy)
    # Log the trained model
    mlflow.sklearn.log_model(model, "RandomForestModel")
    accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9567054908485857


In [9]:
# Gradient Boosting model training
with mlflow.start_run(run_name="Gradient Boosting"):
    model = GradientBoostingClassifier()
    model.fit(X_train, y_train)
    # Predict on the test set
    y_pred = model.predict(X_test)
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    # Log parameters and metrics
    mlflow.log_params(model.get_params())
    mlflow.log_metric("accuracy", accuracy)
    # Log the trained model
    mlflow.sklearn.log_model(model, "GradientBoostingModel")
    accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9566722129783694


In [10]:
# MLP model training
with mlflow.start_run(run_name="MLP"):
    model = MLPClassifier()
    model.fit(X_train, y_train)
    # Predict on the test set
    y_pred = model.predict(X_test)
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    # Log parameters and metrics
    mlflow.log_params(model.get_params())
    mlflow.log_metric("accuracy", accuracy)
    # Log the trained model
    mlflow.sklearn.log_model(model, "MLPModel")
   
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9265557404326124
