In [None]:
import warnings

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve, auc

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Load the transaction table from "creditcard.csv", resolved relative to the
# notebook's working directory.
credit_data = pd.read_csv("creditcard.csv")

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() would add a stray "None" line.
credit_data.info()

# Summary statistics of the numeric columns, shown as the cell's rich output.
credit_data.describe()

In [None]:
# Per-column skewness of the loaded table, printed as plain text.
feature_skewness = credit_data.skew()
print(feature_skewness)

In [None]:
# Separate the feature columns from the 'Class' target column.
X = credit_data.drop(columns='Class')
y = credit_data['Class']

# Hold out 20% of the rows for testing with a fixed seed. stratify=y keeps the
# class ratio identical in the train and test partitions, so a rare positive
# class cannot end up under- or over-represented in the test set by chance.
# NOTE(review): stratification requires every class in y to have at least two
# rows and will raise if 'Class' contains missing values - confirm the labels
# are complete.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:

# Histogram of the 'Class' column over the full (unsplit) table.
g= sns.histplot(credit_data["Class"])
# Show x-axis ticks only at the two label values, 0 and 1.
g.set_xticks([0,1])
# Tick labels are left at matplotlib's default formatting.

In [None]:
def multi_hist_plot(df_train,df_test, df_train_label, df_test_label, feature):
    """Show stacked histograms of one feature for the train and test splits.

    Two panels are drawn side by side (train on the left, test on the right).
    Within each panel the bars are stacked and coloured by the corresponding
    label vector.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Feature tables for the two splits; both must contain ``feature``.
    df_train_label, df_test_label : array-like
        Label values used as the ``hue`` of the respective panel, one per row
        of the matching frame.
    feature : str
        Name of the column to plot.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()`` and then closed.
    """
    base_colors = ["yellow", "green", 'red', 'blue', 'black', 'orange', 'purple']
    panels = (
        ("Train", df_train, df_train_label),
        ("Test", df_test, df_test_label),
    )

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 12))

    for ax, (split_name, frame, labels) in zip(axes, panels):
        # Hand seaborn exactly as many colours as there are hue levels; a
        # longer list makes it warn that the palette has more values than
        # needed. The same leading colours are used, so the picture is unchanged.
        n_levels = max(pd.Series(labels).nunique(), 1)
        sns.histplot(
            data=frame,
            ax=ax,
            x=feature,
            hue=labels,
            palette=sns.color_palette(base_colors[:n_levels]),
            multiple="stack",
        )
        ax.set_title(f"Histogram of {split_name} Data {feature}")

    plt.tight_layout()  # Adjust layout to prevent overlap

    plt.show()
    # This function is called once per feature; release the figure so repeated
    # calls do not accumulate open figures in memory.
    plt.close(fig)

In [None]:
# Draw the train/test histogram pair for every feature column.
cols = X_train.columns.tolist()
for feature_name in cols:
    multi_hist_plot(X_train, X_test, y_train, y_test, feature_name)

In [None]:
def compute_correlation(X, y):
    """Rank features by their Pearson correlation with the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; every column is correlated against ``y``.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    dict
        ``{feature_name: r}`` ordered from the largest to the smallest
        coefficient. Features whose coefficient is undefined (NaN, e.g. for a
        constant column) are listed last, because NaN cannot be ordered
        against real numbers and would otherwise leave the ranking arbitrary.
    """
    correlations = {feature: pearsonr(X[feature], y)[0] for feature in X.columns}

    def _rank(item):
        # Sort key: defined coefficients first (descending), undefined ones last.
        corr = item[1]
        undefined = bool(pd.isna(corr))
        return (undefined, 0.0 if undefined else -corr)

    # sorted() is stable, so tied features keep their column order.
    return dict(sorted(correlations.items(), key=_rank))

In [None]:
# Pearson correlation of every feature with the target on the training split.
compute_correlation(X_train, y_train)

In [None]:
# Same correlation ranking on the test split, for comparison with training.
compute_correlation(X_test, y_test)

In [None]:


# Fill missing feature values with per-column means. The means are learned
# from the training split only and then reused for the test split.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

In [None]:
# Standardize the features using statistics fitted on the training split only,
# then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
     

In [None]:
# Candidate classifiers, keyed by display name. The estimators that draw random
# numbers during fitting get a fixed seed (the same value used for the
# train/test split) so a fresh Restart & Run All reproduces the same metrics.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'Neural Network': MLPClassifier(random_state=42)
}

In [None]:
threshold = 0.5  # Labels above this value count as fraud (1); everything else is non-fraud (0)
y_train_binary = (y_train > threshold).astype(int)

# Model training and evaluation
results = {}
for name, model in models.items():
    # Fit on the scaled training features with the binarized labels.
    model.fit(X_train_scaled, y_train_binary)

    if isinstance(model, SVC) and not model.probability:
        # An SVC built without probability=True provides no probability
        # estimates, so rank the test samples by their decision-function
        # value instead. The raw scores are used as-is: the precision-recall
        # curve depends only on their ordering, and min-max rescaling them
        # would divide by zero whenever every score is identical.
        y_score = model.decision_function(X_test_scaled)
    else:
        # Predicted probability of the positive class (second column).
        y_score = model.predict_proba(X_test_scaled)[:, 1]

    y_pred = model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, y_pred)
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    # Area under the precision-recall curve (trapezoidal rule).
    pr_auc = auc(recall, precision)

    results[name] = {'confusion_matrix': cm, 'precision_recall_auc': pr_auc}

In [None]:
# Accuracy is the share of correctly classified test samples, i.e. the diagonal
# of the confusion matrix divided by its total. The matrices are already stored
# in `results`, so they are reused here instead of running predict() on the
# whole test set a second time for every model.
# NOTE(review): if the positive class is rare, accuracy is dominated by the
# majority class - read it together with the precision-recall AUC.
accuracies = {
    name: metrics['confusion_matrix'].trace() / metrics['confusion_matrix'].sum()
    for name, metrics in results.items()
}

print("Model Accuracies:")
for name, accuracy in accuracies.items():
    print(f"{name}: {accuracy:.4f}")

In [None]:
# Report the stored confusion matrix and precision-recall AUC of every model.
for model_name, metrics in results.items():
    print(f"Confusion matrix results for {model_name} are \n{metrics['confusion_matrix']}")
    print(f"AUC for {model_name} is {metrics['precision_recall_auc']}")