In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as pl
import joblib
import seaborn as se
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import mlflow
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# Put the parent of the current working directory on the import path so the
# local `src` package can be imported below.
sys.path.append(os.path.abspath('..'))
from src import data_loading as dl

In [None]:
# Make the parent directory importable so the local `scripts` package resolves.
# The path is appended only when it is not already on sys.path: an earlier
# cell registers the same absolute path, and re-running this cell would
# otherwise keep piling up duplicate entries.
parent_dir = os.path.abspath("../")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from scripts.model import prepare_data, split_data
from scripts.logger import logger

In [None]:
# Load the transactions, discard exact duplicate rows and store the `Class`
# label as a boolean column.
credit = (
    dl.load_data("creditcard.csv")
    .drop_duplicates()
    .assign(Class=lambda frame: frame["Class"].astype(bool))
)

In [None]:
# Preview the first rows of the de-duplicated data as the cell output.
credit.head()


In [None]:
# Report the class balance of the de-duplicated data.
#
# Rows are counted per class label with explicit masks. Picking entries of
# value_counts() by position would tie the printed labels to frequency order
# rather than to the class itself, and DataFrame.value_counts() would group
# over every column just to sum the counts back up.
total_rows = len(credit)
non_fraud_sum = int((credit['Class'] == 0).sum())
fraud_sum = int((credit['Class'] == 1).sum())

# Share of each class, in percent of all rows
print('No frauds', round(non_fraud_sum / total_rows * 100, 2), '% of the dataset')
print('Frauds', round(fraud_sum / total_rows * 100, 2), '% of the dataset')

# Absolute row counts per class
print('Non fraud', non_fraud_sum)

print('Fraud', fraud_sum)

In [None]:
# Bar chart of the number of rows per class in the full data set, drawn
# through the Axes object returned by seaborn.
ax = se.countplot(data=credit, x='Class', hue='Class', palette='viridis')
ax.legend(title='Class', labels=['Non-Fraud', 'Fraud'], loc='best')
ax.set_title('Distribution of Fraud and Non-Fraud Transactions')
pl.show()

In [None]:
# Build a balanced subset by random under-sampling: keep every fraud row and
# draw an equally sized random sample of non-fraud rows.

# Shuffle the whole data set (fixed seed for reproducibility)
df = credit.sample(frac=1, random_state=42).reset_index(drop=True)

# Isolate the minority class (fraud)
minority_class = df.loc[df['Class'] == 1]

# Sample as many majority-class (non-fraud) rows as there are fraud rows.
# The sample size is derived from the data instead of being hard-coded, so
# the result stays balanced if the number of fraud rows changes.
majority_class_subset = df.loc[df['Class'] == 0].sample(n=len(minority_class), random_state=42)

# Combine both parts into one balanced frame
balanced_df = pd.concat([minority_class, majority_class_subset])

# Shuffle again so the two classes are interleaved
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Show the first few rows of the balanced data set
balanced_df.head()

In [None]:
# Same bar chart as for the full data, this time for the under-sampled
# (balanced) frame; drawn through the Axes object returned by seaborn.
balanced_ax = se.countplot(data=balanced_df, x='Class', hue='Class', palette='viridis')
balanced_ax.legend(title='Class', labels=['Non-Fraud', 'Fraud'], loc='best')
balanced_ax.set_title('Distribution of Fraud and Non-Fraud Transactions')
pl.show()

In [None]:
# Pass the balanced frame and the name of the label column to the project
# helper; presumably it returns the feature matrix and the target (verify in
# scripts/model.py).
# NOTE(review): X_credit / y_credit are not referenced by the later cells
# visible in this notebook, which train on the SMOTE-resampled arrays
# instead; confirm whether this cell is still needed.
X_credit, y_credit = prepare_data(balanced_df, 'Class')


In [None]:
# Rebalance the full data set: random under-sampling of the majority class
# followed by SMOTE over-sampling of the minority class.
# (SMOTE, RandomUnderSampler and Counter are imported in the first cell.)
#
# NOTE(review): the resampling runs before the train/test split in the next
# cell, so the test split is drawn from resampled (partly synthetic) rows.
# Consider splitting first and resampling only the training part.

# Convert the target variable to integer
credit['Class'] = credit['Class'].astype(int)

# Define the resampling strategies. Both samplers are seeded so the resampled
# data (and every metric computed from it) is reproducible across runs.
over = SMOTE(sampling_strategy=0.5, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.1, random_state=42)

# Split data into features (X) and target (y). The label column is removed by
# name, so the features stay correct even if 'Class' is not the last column.
X = credit.drop(columns='Class').values
y = credit['Class'].values

# Apply under-sampling first
X_under, y_under = under.fit_resample(X, y)

# Apply over-sampling on the under-sampled data
X_resampled, y_resampled = over.fit_resample(X_under, y_under)

# Class counts after resampling
print(Counter(y_resampled))

In [None]:
# Train/test split of the resampled arrays via the project helper
# (test_size=0.2, fixed random_state).
X_train, X_test, y_train, y_test = split_data(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Logistic Regression on the creditcard.csv data, tracked as one MLflow run
with mlflow.start_run(run_name="Logistic Regression - Fraud Data"):
    # Cast the features to float64 up front to avoid MLflow warnings
    X_train, X_test = X_train.astype('float64'), X_test.astype('float64')

    # lbfgs solver with a raised iteration cap and balanced class weights
    logistic_model = LogisticRegression(
        max_iter=1000,
        solver='lbfgs',
        class_weight='balanced',
    )

    # Fit on the training split, then predict the test split
    y_pred_fraud = logistic_model.fit(X_train, y_train).predict(X_test)

    # Dict-form report, used to pull out the overall accuracy
    report_fraud = classification_report(y_test, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Record the model label, its accuracy and the fitted estimator
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(logistic_model, "logistic_model_fraud", input_example=X_test[:5])

    # Text-form report for the cell output
    print("Logistic Regression - Fraud Data:\n", classification_report(y_test, y_pred_fraud))

In [None]:
# Train and evaluate a Decision Tree on the creditcard.csv data, tracked as
# one MLflow run
with mlflow.start_run(run_name="Decision Tree - Fraud Data"):
    # Seed the tree like the other seeded models in this notebook; without a
    # random_state the fitted tree and the logged accuracy can differ from
    # run to run.
    decision_tree_model = DecisionTreeClassifier(random_state=42)

    # Ensure feature data is in float64 to avoid MLflow warnings
    X_train = X_train.astype('float64')
    X_test = X_test.astype('float64')

    decision_tree_model.fit(X_train, y_train)  # Train the model
    y_pred_fraud = decision_tree_model.predict(X_test)  # Predict on test data

    # Dict-form classification report, used to extract the overall accuracy
    report_fraud = classification_report(y_test, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Log parameters, metrics, and model
    mlflow.log_param("model", "Decision Tree")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(decision_tree_model, "decision_tree_model_fraud", input_example=X_test[:5])

    # Print the text-form classification report
    print("Decision Tree - Fraud Data:\n", classification_report(y_test, y_pred_fraud))

In [None]:
# Random Forest on the creditcard.csv data, tracked as one MLflow run
with mlflow.start_run(run_name="Random Forest - Fraud Data"):
    # Cast the features to float64 up front to avoid MLflow warnings
    X_train, X_test = X_train.astype('float64'), X_test.astype('float64')

    # 100 trees, seeded
    random_forest_model = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
    )

    # Fit on the training split, then predict the test split
    y_pred_fraud = random_forest_model.fit(X_train, y_train).predict(X_test)

    # Dict-form report, used to pull out the overall accuracy
    report_fraud = classification_report(y_test, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Record the model label, its accuracy and the fitted estimator
    mlflow.log_param("model", "Random Forest")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(random_forest_model, "random_forest_model_fraud", input_example=X_test[:5])

    # Text-form report for the cell output
    print("Random Forest - Fraud Data:\n", classification_report(y_test, y_pred_fraud))

In [None]:
# Gradient Boosting on the creditcard.csv data, tracked as one MLflow run
with mlflow.start_run(run_name="Gradient Boosting - Fraud Data"):
    # Cast the features to float64 up front to avoid MLflow warnings
    X_train, X_test = X_train.astype('float64'), X_test.astype('float64')

    # 100 boosting stages of depth-3 trees, learning rate 0.1, seeded
    gradient_boosting_model = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    )

    # Fit on the training split, then predict the test split
    y_pred_fraud = gradient_boosting_model.fit(X_train, y_train).predict(X_test)

    # Dict-form report, used to pull out the overall accuracy
    report_fraud = classification_report(y_test, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Record the model label, its accuracy and the fitted estimator
    mlflow.log_param("model", "Gradient Boosting")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(gradient_boosting_model, "gradient_boosting_model_fraud", input_example=X_test[:5])

    # Text-form report for the cell output
    print("Gradient Boosting - Fraud Data:\n", classification_report(y_test, y_pred_fraud))

In [None]:
# Multi-layer perceptron on the creditcard.csv data, tracked as one MLflow run
with mlflow.start_run(run_name="MLP - Fraud Data"):
    # Cast the features to float64 up front to avoid MLflow warnings
    X_train, X_test = X_train.astype('float64'), X_test.astype('float64')

    # One hidden layer of 100 units, adam solver, at most 300 iterations, seeded
    mlp_model = MLPClassifier(
        hidden_layer_sizes=(100,),
        max_iter=300,
        alpha=0.0001,
        solver='adam',
        random_state=42,
    )

    # Fit on the training split, then predict the test split
    y_pred_fraud = mlp_model.fit(X_train, y_train).predict(X_test)

    # Dict-form report, used to pull out the overall accuracy
    report_fraud = classification_report(y_test, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']

    # Record the model label, its accuracy and the fitted estimator
    mlflow.log_param("model", "MLP")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(mlp_model, "mlp_model_fraud", input_example=X_test[:5])

    # Text-form report for the cell output
    print("MLP - Fraud Data:\n", classification_report(y_test, y_pred_fraud))

In [None]:
# Persist the trained Random Forest model to the 'models' folder (relative to
# the current working directory) as 'randomforestfor_credit_card_data.pkl'.
# (os and joblib are imported in the first cell.)

# exist_ok=True creates the directory when it is missing and does nothing
# when it already exists, replacing the separate check-then-create step.
os.makedirs('models', exist_ok=True)

joblib.dump(random_forest_model, 'models/randomforestfor_credit_card_data.pkl')