In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preprocessing

In [None]:
import pandas as pd
from warnings import filterwarnings
filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset, keeping every column except the one named 'id'
# (the `usecols` callable is evaluated against each column name).
data = pd.read_csv(
    "/kaggle/input/credit-card-fraud-detection-dataset-2023/creditcard_2023.csv",
    usecols=lambda col: col != 'id',
)
print(data.shape)
# Use the canonical option name; the abbreviated 'display.max_column' only worked
# because pandas pattern-matches option names.
pd.set_option('display.max_columns', None)
data.head()

In [None]:
# Work on a random subsample of at most 50,000 rows.
# A fixed seed makes the subsample (and every statistic, plot and model score derived
# from it) reproducible across "Restart & Run All"; capping the size at len(data)
# avoids a ValueError when the frame holds fewer rows than requested.
print("Actual Datasize : ",data.shape)
data = data.sample(n=min(50000, len(data)), random_state=42)
print("Reduced Datasize : ",data.shape)

# Independent copy of the subsample as it is at this point in the notebook.
data2 = data.copy()

In [None]:
# Count the missing values in each column.
data.isnull().sum()
# Data has no null values

In [None]:
# Split the columns by dtype: object-typed columns are reported as categorical,
# every remaining column as non-categorical.
cat_cols = data.select_dtypes(include=['object']).columns
print("Categorical Columns ; ", cat_cols)

int_cols = [name for name in data.columns if name not in cat_cols]
print("Non-categorical Columns ; ", int_cols)

The feature columns (V1–V28) have no descriptive names because they were produced by PCA.

In [None]:
# Number of rows for each value of the target column `Class`.
data['Class'].value_counts()
# It displays that values are balanced

The data is now prepared and ready for data visualization and ML model creation.

# Outliers

In [None]:
def iqr_limits(data, col):
    """Return the 1.5 * IQR outlier fences of one column.

    Parameters
    ----------
    data : table supporting ``data[col].quantile(q)`` (e.g. a pandas DataFrame)
    col : label of the column to inspect

    Returns
    -------
    tuple
        ``(upper, lower)`` -- note the order: the upper fence comes first.
    """
    first_quartile, third_quartile = (data[col].quantile(q) for q in (0.25, 0.75))
    whisker = 1.5 * (third_quartile - first_quartile)
    return third_quartile + whisker, first_quartile - whisker

In [None]:
# Classify every feature column by the skewness of its distribution:
#   -0.5 < skew < 0.5   -> symmetric          (collected in `not_skew`)
#   -1   < skew < 1     -> moderately skewed  (collected in `skew`)
#   anything else       -> skewed             (collected in `skew`)
skew, not_skew = [], []

X = data.drop('Class', axis=1)
Y = data['Class']

for col in X.columns:
    sk = data[col].skew()
    print(f"Column [{col}] is {sk:.2f}", end = '\t')
    if -0.5 < sk < 0.5:
        print(f"{col} is Symmetric")
        not_skew.append(col)
    elif -1 < sk < 1:
        # The previous test `sk > -1 and sk < -1` could never be true, so moderately
        # skewed columns were always reported as "Skewed".
        print(f"{col} is Moderately Skewed")
        skew.append(col)
    else:
        print(f"{col} is Skewed")
        skew.append(col)
        

In [None]:
def iqr_method(data, cols):
    """Report and cap IQR outliers in the given columns, modifying ``data`` in place.

    For every column the fences ``q1 - 1.5 * iqr`` and ``q3 + 1.5 * iqr`` are computed,
    the number of values outside them is printed, and those values are replaced by the
    nearest fence. The count is printed again afterwards as a sanity check.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are overwritten with the capped values.
    cols : iterable of column labels
        Columns to process.

    Returns
    -------
    dict
        Maps each processed column to the number of outliers found before capping.
        (The counts were previously collected but never returned.)
    """
    results = {}
    total = len(data)

    def _percent(count):
        # An empty frame has no outliers; avoid dividing by zero.
        return (count / total) * 100 if total else 0.0

    for col in cols:
        q1 = data[col].quantile(0.25)
        q3 = data[col].quantile(0.75)

        iqr = q3 - q1
        lower = q1 - (1.5 * iqr)
        upper = q3 + (1.5 * iqr)

        n_before = int(((data[col] < lower) | (data[col] > upper)).sum())
        results[col] = n_before
        print(f"The {col} : {n_before} outliers or {_percent(n_before)} % outliers")

        # Cap instead of dropping rows so the frame keeps its length.
        data[col] = np.where(data[col] > upper, upper, np.where(data[col] < lower, lower, data[col]))

        print("After Imputation: ")
        n_after = int(((data[col] < lower) | (data[col] > upper)).sum())
        print(f"The {col} : {n_after} outliers or {_percent(n_after)} % outliers")
        print("--------------------------------------------------------------------")
    return results

In [None]:
# Summarise the split: each group's size on one line, its column names on the next.
print(f"Columns that are not skew are {len(not_skew)}: ", not_skew, sep="\n")
print("-----------------------------------")
print(f"Columns that are skewed {len(skew)}: ", skew, sep="\n")

## Skewed Data

In [None]:
# Before outlier Removal: one box plot per skewed column.
# The grid is sized from the number of columns; the previous fixed 2x4 grid made
# plt.subplot raise as soon as more than 8 columns were classified as skewed.
n_cols = 4
n_rows = max(1, -(-len(skew) // n_cols))  # ceiling division
plt.figure(figsize=(14, 6))
for k, col in enumerate(skew, start=1):
    plt.subplot(n_rows, n_cols, k)
    sns.boxplot(data= data, x=col, orient='v')
    plt.xlabel(col)
    plt.grid()
    # No plt.legend() here: the box plot has no labelled artists, so the call only
    # produced a "no labelled artists" warning.
plt.tight_layout()
plt.show()
    

In [None]:
# Cap the outliers of the skewed columns at their IQR fences; `data` is modified in place.
iqr_method(data, skew)

In [None]:
# After Outlier Removal: the same box plots, drawn from the capped columns.
# The grid is sized from the number of columns; the previous fixed 3x3 grid made
# plt.subplot raise as soon as more than 9 columns were classified as skewed.
n_cols = 3
n_rows = max(1, -(-len(skew) // n_cols))  # ceiling division
plt.figure(figsize=(14, 6))
for k, col in enumerate(skew, start=1):
    plt.subplot(n_rows, n_cols, k)
    sns.boxplot(data= data, x=col, orient='v')
    plt.xlabel(col)
    plt.grid()
    # No plt.legend() here: the box plot has no labelled artists, so the call only
    # produced a "no labelled artists" warning.
plt.tight_layout()
plt.show()
    

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# For every skewed column, overlay the density of the capped values (`data`) on the
# density of the uncapped copy (`data2`) and mark the IQR fences.
# The grid is sized from the number of columns; the previous fixed 4x2 grid made
# plt.subplot raise as soon as more than 8 columns were classified as skewed.
n_cols = 2
n_rows = max(1, -(-len(skew) // n_cols))  # ceiling division
plt.figure(figsize=(18, 8))
for k, col in enumerate(skew, start=1):
    upper, lower = iqr_limits(data, col)  # iqr_limits returns (upper, lower)
    plt.subplot(n_rows, n_cols, k)
    sns.kdeplot(data[col], fill = True, label=f"Imputed {col}")
    sns.kdeplot(data2[col], fill = True, label=col)
    plt.axvline(x=lower, linestyle='--', color='r', label='Lower Limit')
    plt.axvline(x=upper, linestyle='--', color='g', label='Upper Limit')
    plt.xlabel(col)
    plt.grid()
    plt.legend()

plt.tight_layout()
plt.show()
    

Using Box Plot to Visualize the Outliers. 

## Non-skewed Data

In [None]:
# Before outlier Removal: one box plot per non-skewed column.
# The grid is sized from the number of columns; the previous fixed 7x4 grid (28 cells)
# made plt.subplot raise if more than 28 columns were classified as symmetric.
n_cols = 4
n_rows = max(1, -(-len(not_skew) // n_cols))  # ceiling division
plt.figure(figsize=(26, 18))
for k, col in enumerate(not_skew, start=1):
    plt.subplot(n_rows, n_cols, k)
    sns.boxplot(data= data, x=col, orient='v')
    plt.xlabel(col)
    plt.grid()
    # No plt.legend() here: the box plot has no labelled artists, so the call only
    # produced a "no labelled artists" warning.
plt.tight_layout()
plt.show()
    

In [None]:
def detect_using_zscore(data, col, threshold=3):
    """Compute z-score outlier limits for one column.

    Parameters
    ----------
    data : table supporting ``data[col].mean()`` / ``.std()`` (e.g. a pandas DataFrame)
    col : label of the column to inspect
    threshold : number of standard deviations allowed on either side of the mean

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit)``, i.e. mean -/+ threshold * std. When the standard
        deviation is exactly zero a notice is printed and ``(0, 0)`` is returned.
    """
    column = data[col]
    center = column.mean()
    spread = column.std()

    if spread == 0:
        print(f"Column '{col}' has zero standard deviation. No outliers possible.")
        return 0,0

    half_width = threshold * spread
    return center - half_width, center + half_width

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# Density of every non-skewed column with its z-score limits (mean -/+ 3 * std, the
# default threshold of detect_using_zscore) overlaid. These columns are capped with
# z-score limits, so the plot now shows those limits rather than IQR fences, which
# are never applied to them.
# The grid is sized from the number of columns; the previous fixed 9x3 grid (27 cells)
# made plt.subplot raise if more than 27 columns were classified as symmetric.
n_cols = 3
n_rows = max(1, -(-len(not_skew) // n_cols))  # ceiling division
plt.figure(figsize=(48, 30))
for k, col in enumerate(not_skew, start=1):
    lower, upper = detect_using_zscore(data, col)  # returns (lower, upper)
    plt.subplot(n_rows, n_cols, k)
    sns.kdeplot(data[col], fill = True, label=col)
    plt.axvline(x=lower, linestyle='--', color='r', label='Lower Limit')
    plt.axvline(x=upper, linestyle='--', color='g', label='Upper Limit')
    plt.xlabel(col)
    plt.grid()
    plt.legend()

plt.tight_layout()
plt.show()
    

In [None]:
import pandas as pd
import numpy as np

def detect_and_impute_outliers(data, columns, threshold=3):
    """Report and cap z-score outliers in the given columns, modifying ``data`` in place.

    For each column the limits ``mean -/+ threshold * std`` are computed, values outside
    them are counted and replaced by the nearest limit, and the count is printed again
    as a check. Columns missing from ``data`` and columns whose standard deviation is
    exactly zero are reported and skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are overwritten with the capped values.
    columns : iterable of column labels (a set is converted to a list first)
    threshold : number of standard deviations allowed on either side of the mean

    Returns
    -------
    None
    """
    column_names = list(columns) if isinstance(columns, set) else columns

    for col in column_names:
        if col not in data.columns:
            print(f"Column '{col}' not found in data. Skipping.")
            continue

        series = data[col]
        center, spread = series.mean(), series.std()

        if spread == 0:
            print(f"Column '{col}' has zero standard deviation. No outliers possible.")
            continue

        lower_limit = center - threshold * spread
        upper_limit = center + threshold * spread

        # Boolean masks of the values lying outside each limit.
        too_low = series < lower_limit
        too_high = series > upper_limit

        print(f"For column '{col}':")
        print(f"  Lower limit (outlier if below): {lower_limit:.4f}")
        print(f"  Upper limit (outlier if above): {upper_limit:.4f}")
        print(f"  Number of outliers before imputation: {(too_low | too_high).sum()}")

        # Replace out-of-range values with the limit they crossed.
        data[col] = np.where(too_high, upper_limit, np.where(too_low, lower_limit, series))

        # Count again on the capped column to confirm nothing is left outside.
        capped = data[col]
        remaining = ((capped < lower_limit) | (capped > upper_limit)).sum()
        print(f"  Number of outliers after imputation: {remaining}")
        print("---"*15)

In [None]:
# Cap the non-skewed columns at mean -/+ 3 * std (the function's default threshold);
# `data` is modified in place.
detect_and_impute_outliers(data, not_skew)

# Data Visualization

In [None]:
import seaborn as sns

# Column names V1 ... V28.
v = ["V" + str(number) for number in range(1, 29)]

# One density plot per V-column on a 4 x 7 grid.
fig = plt.figure(figsize=(20, 12))
for j, val in enumerate(v, start=1):
    plt.subplot(4, 7, j)
    sns.kdeplot(data[val], fill = True, label=val)
    plt.xlabel(val)
    plt.grid()
    plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Density estimate of the `Amount` column.
sns.kdeplot(data=data, x='Amount', fill=True)
plt.grid()

In [None]:
# Bucket `Amount` into labelled ranges. Bins are right-closed, e.g. (1000, 2500].
# include_lowest=True also closes the first bin on the left, so an amount of exactly 0
# is labelled 'Under 1k' instead of silently becoming NaN.
binn = [0, 1000, 2500, 5000, 10000, 15000, 20000, 25000]
labels = ['Under 1k', '1k to 2.5k', '2.5k to 5k', '5k to 10k', '10k to 15k', '15k to 20k', '20k to 25k']
data['Amount_in_bins'] = pd.cut(data['Amount'], labels = labels, bins=binn, include_lowest=True)

In [None]:
# Number of rows falling in each amount bin.
sns.histplot(data=data, x='Amount_in_bins')
plt.xlabel("Amount in Categories")
plt.ylabel("No. of Transactions")
plt.xticks(rotation=45)

In [None]:
# Rows per amount bin, split by class.
# The counts are laid out as a bin x class table so each class always gets the same
# colour and a legend entry. Plotting the flat groupby().value_counts() series with an
# alternating colour list did not guarantee that: within each bin the counts are
# ordered by frequency, so the class behind a given colour could change between bins.
color = ['purple', 'orange']
class_counts = pd.crosstab(data['Amount_in_bins'], data['Class'])

fig, ax = plt.subplots(figsize=(10,6))
class_counts.plot(kind='bar', color=color, ax=ax)
ax.set_ylabel("No. of Transactions")
plt.xticks(rotation=90)
plt.grid()
plt.tight_layout()
plt.show()

In [None]:
# Class counts within each amount bin.
data.groupby('Amount_in_bins')['Class'].value_counts()


In [None]:
# Share of Class == 1 (fraud) and Class == 0 (not fraud) rows within each amount bin.
# The classes are selected by value. The previous positional lookup into
# groupby().value_counts() assumed every bin contributes exactly two rows in
# (class 0, class 1) order, but those rows are ordered by frequency, so the two
# percentages could be swapped.
for val in labels:
    classes_in_bin = data.loc[data['Amount_in_bins'] == val, 'Class']
    total = len(classes_in_bin)
    if total == 0:
        # Nothing to divide by: report the empty bin instead of failing.
        print(f"No transactions of {val} in the data")
    else:
        fraud = (classes_in_bin == 1).sum() / total
        non_fraud = (classes_in_bin == 0).sum() / total
        print(f"Chances of Fraud when transaction of {val} is made is : {fraud*100:.2f}%")
        print(f"Chances of not_Fraud when transaction of {val} is made is : {non_fraud*100:.2f}%")
    print("----"*20)
    

In [None]:
# Correlation heat map over every column except the categorical `Amount_in_bins`.
all_num = [name for name in data.columns if name != 'Amount_in_bins']
correlations = data[all_num].corr()
plt.figure(figsize=(20, 12), dpi=500)
sns.heatmap(correlations, annot=True, fmt=".2f")


In [None]:
# Pie chart of the class shares. The slice labels are derived from the class values
# themselves: value_counts() orders by frequency, so a hard-coded
# ['Not Fraud', 'Fraud'] list mislabels both slices whenever class 1 is the more
# frequent one.
class_share = data['Class'].value_counts().rename(index={0: 'Not Fraud', 1: 'Fraud'})
class_share.plot(kind='pie')


# Model Building

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# NOTE(review): features come from `data2`, the copy taken before the outlier capping
# was applied to `data`, so the capping does not reach the model -- confirm intended.
X = data2.drop(columns='Class')
Y = data2['Class']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training split only and reuse them for the
# test split.
st = StandardScaler().fit(X_train)
X_train = st.transform(X_train)
X_test = st.transform(X_test)

In [None]:
# Section 3: Machine Learning Model Training

# Import necessary libraries for modeling
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


# Candidate classifiers keyed by display name. The key is also turned into a file-name
# stem when predictions are saved. Every model receives random_state=42.
models = {
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss'),  # eval_metric for binary classification
    'LightGBM': LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(random_state=42, verbose=0)  # verbose=0 to suppress training output
}


In [None]:
# Train each model and save its predictions and probabilities.
# The output folders are created once up front rather than on every loop iteration.
os.makedirs("y_Predictions", exist_ok=True)
os.makedirs("y_Probabilities", exist_ok=True)

for name, model in models.items():
    print(f'\n--- Training {name} ---')

    model.fit(X_train, y_train)

    # Hard predictions and positive-class probabilities on the test set
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]  # Probability of the positive class

    # File-name stem: lower case, spaces replaced by underscores
    model_name_cleaned = name.replace(' ', '_').lower()

    # Persist both arrays as single-column CSV files in the working directory
    pd.DataFrame(y_pred).to_csv(f'y_Predictions/y_pred_{model_name_cleaned}.csv', index=False)
    pd.DataFrame(y_proba).to_csv(f'y_Probabilities/y_proba_{model_name_cleaned}.csv', index=False)

    print(f'{name} training complete. Predictions and probabilities saved.')

print('\nAll selected models have been trained and their predictions/probabilities saved for evaluation.')

# Model Evaluation

In [None]:
# Import necessary libraries for evaluation
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix,
    ConfusionMatrixDisplay, roc_curve
)

import matplotlib
matplotlib.use("Agg")  # Use a non-interactive backend
import matplotlib.pyplot as plt
import seaborn as sns


# File-name stems of the models to evaluate. Each stem must match a saved
# y_pred_<stem>.csv / y_proba_<stem>.csv pair, i.e. the lower-cased, underscore-joined
# model name used when the predictions were written.
models_to_evaluate = [
    "logistic_regression",
    "random_forest",
    "xgboost",
    "lightgbm",
    "catboost"
]

In [None]:

# Score every model from its saved predictions and store one confusion-matrix image each.
results = {}

print("\n--- Evaluating Model Performance ---")

# Created once up front rather than on every loop iteration.
os.makedirs("Confusion Matrix", exist_ok=True)

for model_name_cleaned in models_to_evaluate:
    model_display_name = model_name_cleaned.replace("_", " ").title()
    print(f"\nEvaluating {model_display_name}...")

    # Load predictions and probabilities from current working directory
    y_pred = pd.read_csv(f"y_Predictions/y_pred_{model_name_cleaned}.csv").values.ravel()
    y_proba = pd.read_csv(f"y_Probabilities/y_proba_{model_name_cleaned}.csv").values.ravel()

    # Threshold-based metrics use the hard predictions; ROC-AUC uses the probabilities
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)

    results[model_display_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "ROC-AUC": roc_auc
    }

    print(f" Accuracy:  {accuracy:.4f}")
    print(f" Precision: {precision:.4f}")
    print(f" Recall:    {recall:.4f}")
    print(f" F1-Score:  {f1:.4f}")
    print(f" ROC-AUC:   {roc_auc:.4f}")

    # Plot Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])

    fig, ax = plt.subplots(figsize=(6, 6))
    disp.plot(cmap=plt.cm.Blues, ax=ax)
    ax.set_title(f"Confusion Matrix for {model_display_name}")

    # Save through the explicit figure handle, then close it to free memory
    fig.savefig(f"Confusion Matrix/confusion_matrix_{model_name_cleaned}.png")
    plt.close(fig)

    print(f" Confusion matrix saved as confusion_matrix_{model_name_cleaned}.png")




In [None]:
# Display all results in a DataFrame for easy comparison (one row per model)
results_df = pd.DataFrame(results).T

print("\n--- Comprehensive Model Evaluation Results ---")
print(results_df.sort_values(by="F1-Score", ascending=False))

# Save the comprehensive results to a CSV file in current working directory
os.makedirs("Evaluation Result", exist_ok=True)
results_df.to_csv("Evaluation Result/model_evaluation_results.csv")
print("\nAll model evaluation results saved to model_evaluation_results.csv")

# ROC curves of all models on ONE shared figure.
# The decoration, save and close steps previously sat inside the loop: the figure was
# saved and closed after the first model, the remaining curves were drawn on fresh
# implicit figures, and each file was written under a confusion_matrix_* name.
fig, ax = plt.subplots(figsize=(10, 8))

for model_name_cleaned in models_to_evaluate:
    model_display_name = model_name_cleaned.replace("_", " ").title()
    y_proba = pd.read_csv(f"y_Probabilities/y_proba_{model_name_cleaned}.csv").values.ravel()

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    ax.plot(
        fpr, tpr,
        label=f"{model_display_name} (AUC = {roc_auc_score(y_test, y_proba):.2f})"
    )

# Chance-level diagonal and decoration are added once, after every curve is drawn
ax.plot([0, 1], [0, 1], "k--", label="Random (AUC = 0.50)")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve Comparison")
ax.legend(loc="lower right")
ax.grid(True)

os.makedirs("figures", exist_ok=True)
fig.savefig("figures/roc_curve_comparison.png")
plt.close(fig)  # Close the plot to free memory

print("ROC curve comparison plot saved as roc_curve_comparison.png")
