### Importing Libraries and Dataset

In [2]:
import pandas as pd 
from imbalanced_ensemble.sampler.under_sampling import RandomUnderSampler
from imbalanced_ensemble.sampler.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
from sklearn.preprocessing import StandardScaler 
from xgboost import XGBClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier  
from math import ceil

In [3]:
import pandas as pd

In [4]:
# Read the credit-card CSV from the working directory and preview the
# first five rows.
data = pd.read_csv(filepath_or_buffer='Creditcard_data.csv')
data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Separate the target from the predictors: "Class" is the label, every
# other column is kept as a feature.
y = data["Class"]
x = data.drop(columns="Class")

### Handling the Imbalanced Dataset & Computing the Sample Size

In [5]:
# Resample the full feature/label set with a seeded SMOTE sampler.
# NOTE(review): this runs before any train/test split, so the hold-out sets
# created later are drawn from the resampled data — confirm this is intended.
smote = SMOTE(random_state=123)
resampled = smote.fit_resample(x, y)
x_smote, y_smote = resampled

In [6]:
# Sample size for estimating a proportion: n = z^2 * p * (1 - p) / e^2,
# rounded up to the next whole observation.
z = 1.96  # z-score for a 95% confidence interval
p = 0.5   # proportion plugged into the formula
e = 0.05  # error
numerator = z**2 * p * (1 - p)
denominator = e**2
sample_size = ceil(numerator / denominator)
print(f"Calculated sample size: {sample_size}")

Calculated sample size: 385


### Creating Samples

In [7]:
# Draw five subsets of `sample_size` rows from the SMOTE-resampled data.
# The loop index doubles as the random seed, so each subset is different
# but reproducible.
samples = []
for i in range(5):
    # train_test_split returns (x_train, x_test, y_train, y_test); only the
    # "train" parts, which hold `sample_size` rows, are kept.
    parts = train_test_split(
        x_smote, y_smote, train_size=sample_size, random_state=i
    )
    x_sample, y_sample = parts[0], parts[2]
    samples.append((x_sample, y_sample))

In [8]:
# Display the list of (features, labels) pairs built for the five samples.
# NOTE(review): this renders every sample in full; a summary (e.g. shapes)
# may be easier to read — confirm whether the full dump is needed.
samples

[(      Time        V1        V2        V3        V4        V5        V6  \
  1213   333 -0.032868  0.378548  1.151163  0.391998  0.417750 -0.970412   
  851    528 -2.013309 -2.503569  2.450306  1.154085  2.448545  0.573994   
  694    524 -0.292211  0.838605  1.360847 -0.001346  0.350836 -0.894645   
  137     84 -0.481376  1.003407  0.906184 -0.423864  0.760671  0.377627   
  1009   341  0.584951  0.470574  0.389280  0.363592  0.312997 -0.318167   
  ...    ...       ...       ...       ...       ...       ...       ...   
  763    574 -0.402057  0.584300  2.474227  0.929684  0.014314  0.297490   
  835     89  0.580353  0.421920  0.322539  0.265970  0.477177  0.100179   
  1216   156 -0.158409  0.915774 -0.518012  1.816018 -0.164330 -0.600330   
  559    417 -2.680348  1.872052  1.144712 -0.693664  0.155172  0.601325   
  684    517  1.314713 -0.328688  0.002645 -0.805044 -0.467260 -0.522747   
  
              V7        V8        V9  ...       V20       V21       V22  \
  1213  0.

In [9]:
# Sampling techniques: each entry is a callable taking (X, y) and returning
# a (possibly resampled) (X, y) pair. Every call builds a fresh sampler
# seeded with 123.
def _no_resampling(X, y):
    """Return the data unchanged."""
    return X, y


def _random_under_sampling(X, y):
    """Resample with RandomUnderSampler."""
    return RandomUnderSampler(random_state=123).fit_resample(X, y)


def _smote_sampling(X, y):
    """Resample with SMOTE."""
    return SMOTE(random_state=123).fit_resample(X, y)


def _smoteenn_sampling(X, y):
    """Resample with SMOTEENN."""
    return SMOTEENN(random_state=123).fit_resample(X, y)


sampling_techniques = [
    _no_resampling,
    _random_under_sampling,
    _smote_sampling,
    _smoteenn_sampling,
]

### Defining Models

In [10]:
# Candidate classifiers, in the order that produces the "Model1".."Model5"
# labels in the results table.
models = [
    # The saga solver shuffles the data, so it is seeded like the other
    # estimators to make repeated runs reproducible.
    LogisticRegression(
        max_iter=1000, solver="saga", class_weight="balanced", random_state=123
    ),
    RandomForestClassifier(class_weight="balanced", random_state=123),
    EasyEnsembleClassifier(random_state=123),
    # scale_pos_weight is the negative/positive ratio of the original labels `y`.
    # NOTE(review): the models are trained on samples drawn from the
    # SMOTE-resampled data, so this ratio may not match the class balance
    # seen at fit time — confirm.
    XGBClassifier(scale_pos_weight=len(y[y == 0]) / len(y[y == 1]), use_label_encoder=False),
    CatBoostClassifier(verbose=0, auto_class_weights="Balanced"),
]

### Initializing the Scaler and Results

In [11]:
# Accumulator for (sampling label, model label, accuracy) rows, and the
# scaler reused by the evaluation loop.
results = list()
scaler = StandardScaler()

In [12]:
# Evaluate every model on every sample and record the test accuracy.
# Start from an empty list so re-running this cell does not append
# duplicate rows to results from an earlier run.
results = []

for i, (X_sample, y_sample) in enumerate(samples):
    # The split, scaling and resampling depend only on the sample, not on the
    # model, so they are done once per sample rather than once per model.
    # All steps are seeded, so the data each model sees is unchanged.
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample, test_size=0.3, random_state=123
    )

    # Fit the scaler on the training part only, then apply it to the test part.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Sample i is paired with sampling technique i (entry 0 leaves the data
    # untouched); a sample with no matching entry is used as-is.
    # NOTE(review): each technique is therefore evaluated on a different
    # sample — confirm this pairing is intended.
    if i < len(sampling_techniques):
        X_res, y_res = sampling_techniques[i](X_train_scaled, y_train)
    else:
        X_res, y_res = X_train_scaled, y_train

    for j, model in enumerate(models):
        # Fit on the (resampled) training data, score on the untouched test part.
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_scaled)
        acc = accuracy_score(y_test, y_pred)
        results.append((f"Sampling{i+1}", f"Model{j+1}", acc))


### Evaluating Parameters

In [16]:
# Turn the collected (sampling, model, accuracy) rows into a table and save
# it without the positional index.
result_columns = ["Sampling Technique", "Model", "Accuracy"]
results_df = pd.DataFrame(data=results, columns=result_columns)
results_df.to_csv("results_102203103.csv", index=False)

In [17]:
# Display the table of accuracies per sampling label and model.
results_df

Unnamed: 0,Sampling Technique,Model,Accuracy
0,Sampling1,Model1,0.931034
1,Sampling1,Model2,0.982759
2,Sampling1,Model3,0.982759
3,Sampling1,Model4,0.974138
4,Sampling1,Model5,0.991379
5,Sampling2,Model1,0.939655
6,Sampling2,Model2,0.982759
7,Sampling2,Model3,0.982759
8,Sampling2,Model4,0.956897
9,Sampling2,Model5,0.982759


In [18]:
# Highest accuracy reached by each model across all sampling labels; the
# result is a one-column frame indexed by "Model".
accuracy_by_model = results_df.groupby("Model")[["Accuracy"]]
best_results = accuracy_by_model.max()

In [19]:
# Save the per-model best accuracy. The model names live in the frame's
# index (it comes from a groupby on "Model"), so the index must be written;
# with index=False the CSV would hold only an unlabeled Accuracy column.
best_results.to_csv("best_results_102203103.csv", index=True)
print(best_results)

        Accuracy
Model           
Model1  0.939655
Model2  0.991379
Model3  0.982759
Model4  0.974138
Model5  0.991379
