In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score, recall_score, f1_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [23]:
# Load dataset
df = pd.read_csv("C:\\Users\\Hp\\Desktop\\creditcard\\creditcard.csv")

In [24]:
sc = StandardScaler()
df['Amount'] = sc.fit_transform(pd.DataFrame(df['Amount']))

In [25]:
df = df.drop(['Time'], axis =1)

In [26]:
df = df.drop_duplicates()

In [27]:
# Split features and target
X = df.drop(columns=["Class"])  # Features
y = df["Class"]  # Target (0 = Legit, 1 = Fraud)


In [28]:

from sklearn.model_selection import cross_val_predict, cross_val_score



In [29]:
# Initialize Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)


In [13]:
# Perform 5-fold cross-validation
y_pred = cross_val_predict(rf_model, X, y, cv=5)


In [14]:
# Print evaluation metrics
print("Evaluation with 5-Fold Cross-Validation:")
print(classification_report(y, y_pred))

Evaluation with 5-Fold Cross-Validation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    275190
           1       0.87      0.76      0.81       473

    accuracy                           1.00    275663
   macro avg       0.93      0.88      0.90    275663
weighted avg       1.00      1.00      1.00    275663



In [15]:
# Compute F1 scores for each fold
cv_f1_scores = cross_val_score(rf_model, X, y, cv=5, scoring='f1')
print("\nCross-Validation F1 Scores:", cv_f1_scores)
print("Mean F1 Score:", round(cv_f1_scores.mean(), 4))


Cross-Validation F1 Scores: [0.7627907  0.88235294 0.72189349 0.89142857 0.79746835]
Mean F1 Score: 0.8112


In [30]:
#  Define hyperparameter grid
param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}



In [31]:
#  Stratified 5-Fold Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [32]:
# Perform Randomized Search (Reduced iterations & jobs)
random_search = RandomizedSearchCV(
    rf_model, param_distributions=param_grid, 
    n_iter=5, cv=cv, scoring='f1', n_jobs=-1, random_state=42
)

In [33]:
# Fit with smaller data
random_search.fit(X, y)

In [34]:
# ✅ Best model and parameters
best_rf = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# ✅ Evaluate using cross-validation
y_pred = cross_val_predict(best_rf, X, y, cv=cv)

# ✅ Print classification report
print("\nClassification Report:")
print(classification_report(y, y_pred))

Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 20}

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    275190
           1       0.94      0.77      0.85       473

    accuracy                           1.00    275663
   macro avg       0.97      0.89      0.92    275663
weighted avg       1.00      1.00      1.00    275663



In [35]:
from imblearn.under_sampling import RandomUnderSampler
# ✅ Apply Undersampling to Balance Classes
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

In [36]:
from imblearn.pipeline import Pipeline as ImbPipeline
# Define classifier and hyperparameter search space
rf = RandomForestClassifier(random_state=42)
# ✅ Define the pipeline (Undersampling + RF)
pipeline = ImbPipeline([
    ("undersample", undersampler),
    ("rf", rf)
])

In [37]:
# ✅ Define hyperparameter search space
param_grid = {
    "rf__n_estimators": [50, 100, 200],
    "rf__max_depth": [10, 20, 30],
    "rf__min_samples_split": [2, 5, 10],
    "rf__min_samples_leaf": [1, 2, 4]
}


In [38]:
# ✅ RandomizedSearchCV with Pipeline
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=10,  # Limits number of parameter combinations
    scoring="f1",
    n_jobs=-1,
    cv=cv,
    verbose=1,
    random_state=42
)

In [39]:
# ✅ Fit RandomizedSearchCV
random_search.fit(X_resampled, y_resampled)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [41]:
# ✅ Best model evaluation
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_resampled)  # Predictions on resampled dataset

# ✅ Print results
print("Best Parameters:", random_search.best_params_)
print("\nClassification Report:")
print(classification_report(y_resampled, y_pred))


Best Parameters: {'rf__n_estimators': 100, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2, 'rf__max_depth': 10}

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       473
           1       1.00      0.97      0.98       473

    accuracy                           0.99       946
   macro avg       0.99      0.99      0.99       946
weighted avg       0.99      0.99      0.99       946



In [45]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))  # or XGBClassifier
])

In [46]:
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions={
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [10, 20],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2],
    },
    n_iter=10,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    random_state=42
)

In [47]:
random_search.fit(X, y)

In [48]:
# Predict with cross-validation
y_pred = cross_val_predict(random_search.best_estimator_, X, y, cv=cv)

# Output results
print("Best Parameters:", random_search.best_params_)
print("\nClassification Report:")
print(classification_report(y, y_pred))

Best Parameters: {'classifier__n_estimators': 100, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 20}

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    275190
           1       0.80      0.82      0.81       473

    accuracy                           1.00    275663
   macro avg       0.90      0.91      0.91    275663
weighted avg       1.00      1.00      1.00    275663

