In [None]:
# Import the required libraries
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as pl
import joblib
import seaborn as se
from collections import Counter
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Put the parent of the current working directory on the import path so the
# project-local `src` package can be resolved.
sys.path.append(os.path.abspath('..'))
# Project-local data-loading helpers
from src import data_loading as dl

In [None]:
# Append the parent directory to the import path so the `scripts` package
# can be imported from this notebook.
sys.path.append(os.path.abspath(os.pardir))

# Project-local helpers: feature/target preparation, train/test splitting, logging
from scripts.model import prepare_data, split_data
from scripts.logger import logger

In [None]:
# Load the pre-processed fraud dataset through the project data loader.
fraud_df = dl.load_data('processed/processed_fraud_data.csv')


In [None]:
# Report each class as a percentage of all rows (label 0 -> 'No frauds',
# label 1 -> 'Frauds'), rounded to two decimals.
for _label, _title in ((0, 'No frauds'), (1, 'Frauds')):
    print(
        _title,
        round(fraud_df['class'].value_counts()[_label] / len(fraud_df) * 100, 2),
        '% of the dataset',
    )

In [None]:
# Separate features and target ("class") using the project helper.
X_fraud, y_fraud = prepare_data(fraud_df, "class")


In [None]:
# Train/test split of the full dataset, using split_data's default settings.
(
    X_train_fraud,
    X_test_fraud,
    y_train_fraud,
    y_test_fraud,
) = split_data(X_fraud, y_fraud)


In [None]:
# Sanity check: show the shape of every piece of the train/test split.
print("Fraud Data Shapes:")
for _name, _part in (
    ("X_train_fraud", X_train_fraud),
    ("X_test_fraud", X_test_fraud),
    ("y_train_fraud", y_train_fraud),
    ("y_test_fraud", y_test_fraud),
):
    print(f"{_name}:", _part.shape)

In [None]:
# Check the distribution of labels in the training and testing sets
train_unique_label, train_counts_label = np.unique(y_train_fraud, return_counts=True)
test_unique_label, test_counts_label = np.unique(y_test_fraud, return_counts=True)


print('Label Distributions: \n')
print('Training set label distribution:', train_counts_label/ len(y_train_fraud))
print('Testing set label distribution:', test_counts_label/ len(y_test_fraud))

In [None]:
# Count non-fraud rows straight from the label column.
# DataFrame.value_counts() (used previously) groups on every column, which is
# slow and, by default, leaves out any row that contains a missing value, so
# the total could under-count.
non_fraud_sum = (fraud_df['class'] == 0).sum()
print('non fraud data ', non_fraud_sum)

In [None]:
# Count fraud rows straight from the label column.
# DataFrame.value_counts() (used previously) groups on every column, which is
# slow and, by default, leaves out any row that contains a missing value, so
# the total could under-count.
fraud_sum = (fraud_df['class'] == 1).sum()
print(' fraud data ', fraud_sum)

In [None]:
# Bar chart of row counts per class for the full dataset.
ax = se.countplot(data=fraud_df, x='class', hue='class', palette='viridis')
# Replace the default hue legend entries with readable class names.
ax.legend(title='class', labels=['Non-Fraud', 'Fraud'], loc='best')
ax.set_title('Distribution of Fraud and Non-Fraud Transactions')
pl.show()

In [None]:
# Shuffle dataset
df = fraud_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Get counts of each class
num_fraud = df[df['class'] == 1].shape[0]
num_non_fraud = df[df['class'] == 0].shape[0]

# Determine the smaller class size to balance
min_samples = min(num_fraud, num_non_fraud)

# Undersample both classes to be exactly equal
fraud_sample = df[df['class'] == 1].sample(n=min_samples, random_state=42)
non_fraud_sample = df[df['class'] == 0].sample(n=min_samples, random_state=42)

# Combine balanced data
balanced_df = pd.concat([fraud_sample, non_fraud_sample])

# Shuffle again to mix fraud & non-fraud instances
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Drop unnecessary columns
balanced_df = balanced_df.drop(columns=['Unnamed: 0', 'signup_time', 'purchase_time', 'device_id', 'ip_address'])

# Convert boolean columns to integers
bool_cols = balanced_df.select_dtypes(include=['bool']).columns
balanced_df[bool_cols] = balanced_df[bool_cols].astype(int)

# Verify the class distribution
print(balanced_df['class'].value_counts())

# Display first few rows
balanced_df.head()

In [None]:
# Bar chart of row counts per class for the balanced dataset.
ax = se.countplot(data=balanced_df, x='class', hue='class', palette='viridis')
# Replace the default hue legend entries with readable class names.
ax.legend(title='class', labels=['Non-Fraud', 'Fraud'], loc='best')
ax.set_title('Distribution of Fraud and Non-Fraud Transactions')
pl.show()

In [None]:
# Re-derive features and target ("class") from the balanced dataset; this
# overwrites the X_fraud / y_fraud built from the full data earlier.
X_fraud, y_fraud = prepare_data(balanced_df, "class")


In [None]:
# Expose the current train/test split as plain NumPy arrays.
# NOTE(review): at this point *_fraud still refers to the split made before
# the balanced data is re-split in the following cell -- confirm this ordering
# is intended.
X_train, X_test = X_train_fraud.values, X_test_fraud.values
y_train, y_test = y_train_fraud.values, y_test_fraud.values

In [None]:
# Train/test split of the current X_fraud / y_fraud, using split_data's
# default settings; replaces the earlier *_fraud split variables.
(
    X_train_fraud,
    X_test_fraud,
    y_train_fraud,
    y_test_fraud,
) = split_data(X_fraud, y_fraud)


In [None]:
# Cast both feature sets to float64 in one step.
X_train_fraud, X_test_fraud = (
    X_train_fraud.astype('float64'),
    X_test_fraud.astype('float64'),
)

In [None]:
# Logistic Regression on the *_fraud split, tracked as a single MLflow run.
with mlflow.start_run(run_name="Logistic Regression - Fraud Data"):
    # Features as float64 (the original note: avoids MLflow warnings)
    X_train_fraud = X_train_fraud.astype('float64')
    X_test_fraud = X_test_fraud.astype('float64')

    # Raised iteration cap and balanced class weights
    logistic_model = LogisticRegression(
        max_iter=1000,
        solver='lbfgs',
        class_weight='balanced',
    )
    logistic_model.fit(X_train_fraud, y_train_fraud)
    y_pred_fraud = logistic_model.predict(X_test_fraud)

    # The dict-form report is used to read off overall accuracy
    report_fraud = classification_report(y_test_fraud, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Record the run: model name, accuracy, and the fitted estimator
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(
        logistic_model,
        "logistic_model_fraud",
        input_example=X_test_fraud[:5],
    )

    # Human-readable report
    print(
        "Logistic Regression - Fraud Data:\n",
        classification_report(y_test_fraud, y_pred_fraud),
    )

In [None]:
# Train and evaluate a Decision Tree on the *_fraud split, tracked as a single
# MLflow run.
with mlflow.start_run(run_name="Decision Tree - Fraud Data"):
    # Fixed seed so repeated runs build the same tree, matching the other
    # seeded models in this notebook (random_state=42).
    decision_tree_model = DecisionTreeClassifier(random_state=42)

    # Features as float64 (the original note: avoids MLflow schema warnings).
    # A single cast per frame is enough; the repeated test-set cast was removed.
    X_train_fraud = X_train_fraud.astype('float64')
    X_test_fraud = X_test_fraud.astype('float64')

    decision_tree_model.fit(X_train_fraud, y_train_fraud)
    y_pred_fraud = decision_tree_model.predict(X_test_fraud)

    # The dict-form report is used to read off overall accuracy
    report_fraud = classification_report(y_test_fraud, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Log parameters, metrics, and model
    mlflow.log_param("model", "Decision Tree")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(decision_tree_model, "decision_tree_model_fraud", input_example=X_test_fraud[:5])

    # Print classification report
    print("Decision Tree - Fraud Data:\n", classification_report(y_test_fraud, y_pred_fraud))

In [None]:
# Train and evaluate a Random Forest, tracked as a single MLflow run.
# Uses the same X_train_fraud / X_test_fraud split as the neighbouring
# Logistic Regression, Decision Tree, Gradient Boosting and MLP cells.
# Previously this cell used X_train / X_test, which are NumPy copies taken
# before the balanced data was re-split, so its scores were not comparable
# with the other models.
with mlflow.start_run(run_name="Random Forest - Fraud Data"):
    random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Features as float64 (the original note: avoids MLflow warnings)
    X_train_fraud = X_train_fraud.astype('float64')
    X_test_fraud = X_test_fraud.astype('float64')

    random_forest_model.fit(X_train_fraud, y_train_fraud)  # Train the model
    y_pred_fraud = random_forest_model.predict(X_test_fraud)  # Predict on test data

    # The dict-form report is used to read off overall accuracy
    report_fraud = classification_report(y_test_fraud, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Log parameters, metrics, and model
    mlflow.log_param("model", "Random Forest")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(random_forest_model, "random_forest_model_fraud", input_example=X_test_fraud[:5])

    # Print classification report
    print("Random Forest - Fraud Data:\n", classification_report(y_test_fraud, y_pred_fraud))


In [None]:
# Gradient Boosting on the *_fraud split, tracked as a single MLflow run.
with mlflow.start_run(run_name="Gradient Boosting - Fraud Data"):
    # Features as float64 (the original note: avoids MLflow warnings)
    X_train_fraud = X_train_fraud.astype('float64')
    X_test_fraud = X_test_fraud.astype('float64')

    gradient_boosting_model = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    )
    gradient_boosting_model.fit(X_train_fraud, y_train_fraud)
    y_pred_fraud = gradient_boosting_model.predict(X_test_fraud)

    # The dict-form report is used to read off overall accuracy
    report_fraud = classification_report(y_test_fraud, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Record the run: model name, accuracy, and the fitted estimator
    mlflow.log_param("model", "Gradient Boosting")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(
        gradient_boosting_model,
        "gradient_boosting_model_fraud",
        input_example=X_test_fraud[:5],
    )

    # Human-readable report
    print(
        "Gradient Boosting - Fraud Data:\n",
        classification_report(y_test_fraud, y_pred_fraud),
    )

In [None]:
# Multi-layer perceptron on the *_fraud split, tracked as a single MLflow run.
with mlflow.start_run(run_name="MLP - Fraud Data"):
    # Features as float64 (the original note: avoids MLflow warnings)
    X_train_fraud = X_train_fraud.astype('float64')
    X_test_fraud = X_test_fraud.astype('float64')

    # One hidden layer of 100 units, Adam solver, seeded
    mlp_model = MLPClassifier(
        hidden_layer_sizes=(100,),
        max_iter=300,
        alpha=0.0001,
        solver='adam',
        random_state=42,
    )
    mlp_model.fit(X_train_fraud, y_train_fraud)
    y_pred_fraud = mlp_model.predict(X_test_fraud)

    # The dict-form report is used to read off overall accuracy
    report_fraud = classification_report(y_test_fraud, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Record the run: model name, accuracy, and the fitted estimator
    mlflow.log_param("model", "MLP")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(
        mlp_model,
        "mlp_model_fraud",
        input_example=X_test_fraud[:5],
    )

    # Human-readable report
    print(
        "MLP - Fraud Data:\n",
        classification_report(y_test_fraud, y_pred_fraud),
    )

In [None]:
# Shuffle the full dataset (seed 42) and drop the index/timestamp/identifier
# columns that are not used as features.
# NOTE: fraud_df is overwritten here, so running this cell a second time fails
# because the dropped columns no longer exist.
fraud_df = (
    fraud_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0', 'signup_time', 'purchase_time', 'device_id', 'ip_address'])
)

# Boolean columns become 0/1 integers
bool_cols = fraud_df.select_dtypes(include=['bool']).columns
fraud_df[bool_cols] = fraud_df[bool_cols].astype(int)

In [None]:
# Split data into features (X) and target (y)
X_u_o = fraud_df.drop(columns=['class'])
y_u_o = fraud_df['class']

# Split the original data into training and test sets
X_train, X_test, y_train, y_test = split_data(X_u_o, y_u_o, test_size=0.2, random_state=42)

# Convert Date-Time Column to Numerical Features if necessary





# Convert the training and test sets to 'float64' data type
X_train = X_train.astype('float64')
X_test = X_test.astype('float64')

In [None]:
# Define the resampling strategies
over = SMOTE(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=0.2)

# Split data into features (X) and target (y)
# Split data into features (X) and target (y)
X_u_o = fraud_df.drop(columns=['class'])
y_u_o = fraud_df['class']

# Apply under-sampling first
X_under, y_under = under.fit_resample(X_u_o, y_u_o)

# Apply over-sampling on the under-sampled data
X_resampled, y_resampled = over.fit_resample(X_under, y_under)

# Now you can use the resampled data for further processing
print(Counter(y_resampled))

In [None]:
# Cast both *_fraud feature sets to float64 in one step.
X_train_fraud, X_test_fraud = (
    X_train_fraud.astype('float64'),
    X_test_fraud.astype('float64'),
)

In [None]:
# Logistic Regression fitted on the resampled data and scored on X_test,
# tracked as a single MLflow run.
with mlflow.start_run(run_name="Logistic Regression - Fraud Data"):
    logistic_model = LogisticRegression(
        max_iter=1000,
        solver='lbfgs',
        class_weight='balanced',
    )

    # Fit on the resampled set, predict on the held-out test features
    logistic_model.fit(X_resampled, y_resampled)
    y_pred_fraud = logistic_model.predict(X_test)

    # The dict-form report is used to read off overall accuracy
    report_fraud = classification_report(y_test, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Record the run: model name, accuracy, and the fitted estimator
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(
        logistic_model,
        "logistic_model_fraud",
        input_example=X_test[:5],
    )

    # Human-readable report
    print(
        "Logistic Regression - Fraud Data:\n",
        classification_report(y_test, y_pred_fraud),
    )

In [None]:
# Train and evaluate a Decision Tree on the X_train / X_test split, tracked as
# a single MLflow run.
with mlflow.start_run(run_name="Decision Tree - Fraud Data"):
    # Fixed seed so repeated runs build the same tree, matching the other
    # seeded models in this notebook (random_state=42).
    decision_tree_model = DecisionTreeClassifier(random_state=42)

    # Features as float64 (the original note: avoids MLflow schema warnings).
    # A single cast per frame is enough; the repeated test-set cast was removed.
    X_train = X_train.astype('float64')
    X_test = X_test.astype('float64')

    decision_tree_model.fit(X_train, y_train)  # Train the model
    y_pred_fraud = decision_tree_model.predict(X_test)  # Predict on test data

    # The dict-form report is used to read off overall accuracy
    report_fraud = classification_report(y_test, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Log parameters, metrics, and model
    mlflow.log_param("model", "Decision Tree")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(decision_tree_model, "decision_tree_model_fraud", input_example=X_test[:5])

    # Print classification report
    print("Decision Tree - Fraud Data:\n", classification_report(y_test, y_pred_fraud))

In [None]:
# Train and evaluate a Random Forest, tracked as a single MLflow run.
# Uses the same X_train / X_test split as the neighbouring Decision Tree,
# Gradient Boosting and MLP cells. Previously this cell trained on the
# balanced X_train_fraud split from the earlier section, so its scores were
# not comparable with the other models here.
# NOTE(review): random_forest_model is the estimator persisted with joblib
# further down -- confirm this is the variant that should be saved.
with mlflow.start_run(run_name="Random Forest - Fraud Data"):
    random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Features as float64 (the original note: avoids MLflow warnings)
    X_train = X_train.astype('float64')
    X_test = X_test.astype('float64')

    random_forest_model.fit(X_train, y_train)
    y_pred_fraud = random_forest_model.predict(X_test)

    # The dict-form report is used to read off overall accuracy
    report_fraud = classification_report(y_test, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Log parameters, metrics, and model
    mlflow.log_param("model", "Random Forest")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(random_forest_model, "random_forest_model_fraud", input_example=X_test[:5])

    # Print classification report
    print("Random Forest - Fraud Data:\n", classification_report(y_test, y_pred_fraud))

In [None]:
# Gradient Boosting on the X_train / X_test split, tracked as a single MLflow run.
with mlflow.start_run(run_name="Gradient Boosting - Fraud Data"):
    # Features as float64 (the original note: avoids MLflow warnings)
    X_train = X_train.astype('float64')
    X_test = X_test.astype('float64')

    gradient_boosting_model = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    )
    gradient_boosting_model.fit(X_train, y_train)
    y_pred_fraud = gradient_boosting_model.predict(X_test)

    # The dict-form report is used to read off overall accuracy
    report_fraud = classification_report(y_test, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Record the run: model name, accuracy, and the fitted estimator
    mlflow.log_param("model", "Gradient Boosting")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(
        gradient_boosting_model,
        "gradient_boosting_model_fraud",
        input_example=X_test[:5],
    )

    # Human-readable report
    print(
        "Gradient Boosting - Fraud Data:\n",
        classification_report(y_test, y_pred_fraud),
    )

In [None]:
# Multi-layer perceptron on the X_train / X_test split, tracked as a single MLflow run.
with mlflow.start_run(run_name="MLP - Fraud Data"):
    # Features as float64 (the original note: avoids MLflow warnings)
    X_train = X_train.astype('float64')
    X_test = X_test.astype('float64')

    # One hidden layer of 100 units, Adam solver, seeded
    mlp_model = MLPClassifier(
        hidden_layer_sizes=(100,),
        max_iter=300,
        alpha=0.0001,
        solver='adam',
        random_state=42,
    )
    mlp_model.fit(X_train, y_train)
    y_pred_fraud = mlp_model.predict(X_test)

    # The dict-form report is used to read off overall accuracy
    report_fraud = classification_report(y_test, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Record the run: model name, accuracy, and the fitted estimator
    mlflow.log_param("model", "MLP")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(
        mlp_model,
        "mlp_model_fraud",
        input_example=X_test[:5],
    )

    # Human-readable report
    print(
        "MLP - Fraud Data:\n",
        classification_report(y_test, y_pred_fraud),
    )

In [None]:
# Define the path to save the model
model_folder = "models"
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

model_path = os.path.join(model_folder, "random_forest_model_fraud.pkl")

# Save the model
joblib.dump(random_forest_model, model_path)

print(f"Model saved at: {model_path}")