# import libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:


# Read creditcard.csv from the working directory into a DataFrame
data = pd.read_csv(filepath_or_buffer="creditcard.csv")


# Data Preprocessing

In [None]:
# Report, per column, how many entries are missing
print(data.isna().sum())


In [None]:
# Target is the "Class" column; every remaining column is a predictor
y = data["Class"]
X = data.drop(columns="Class")


# Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so it can be repeated.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: the scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with a seeded SMOTE; X_test / y_test are
# not touched here
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X=X_train, y=y_train)


# EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the number of rows in each class.
# The column is named via the `x` keyword: seaborn's first positional
# parameter is `data`, so a bare positional Series is not interpreted as the
# variable to count.
sns.countplot(x='Class', data=data)
plt.title('Class Distribution (0 = Non-Fraud, 1 = Fraud)')
plt.show()


In [None]:
# Heatmap of the pairwise correlations between all columns of `data`
corr_matrix = data.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(data=corr_matrix, cmap='coolwarm')
plt.title('Feature Correlation')
plt.show()


#  Model Selection and Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Candidate classifiers, keyed by the display name used when reporting.
# The stochastic models are seeded with the same value (42) used for the
# train/test split and SMOTE so that reported metrics are repeatable.
models = {
    # Raised iteration cap so the solver is not cut off at the default limit.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
    # releases and may only trigger a warning there; confirm against the
    # installed version before removing it.
    'XGBoost': XGBClassifier(
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42,
    ),
}


# Evaluation

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Fit each candidate on the training data, then print its test-set
# classification report and ROC AUC
for name in models:
    model = models[name]
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for ROC AUC;
    # the hard labels from predict feed the classification report
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    print(f"Model: {name}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_pred_proba))
    print("-" * 50)


# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Random-forest hyperparameters to search: 3 x 3 = 9 combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20]
}

# 5-fold cross-validated search scored by ROC AUC. The forest is seeded so
# that repeated runs compare the same candidates and give the same ranking.
# NOTE(review): X_train has already been SMOTE-resampled at this point, so
# the validation folds contain synthetic samples and the CV score is likely
# optimistic; consider resampling inside each fold instead.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='roc_auc',
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


In [None]:
import joblib

# Persist the tuned model together with the scaler fitted on the training data.
# The final model is the grid search's best estimator rather than a forest
# with hand-copied hyperparameters, so the saved model always matches the
# tuning result. GridSearchCV refits the best configuration on the full
# training data by default, so no extra fit call is needed here.
final_model = grid_search.best_estimator_
joblib.dump(final_model, "credit_fraud_model.pkl")
joblib.dump(scaler, "scaler.pkl")
