# Sampling


In [4]:


import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.cluster import KMeans

# MODELS USED
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from imblearn.combine import SMOTEENN

# 1. LOAD DATASET
# Read the CSV and split it into predictors (X) and the "Class" target (y).
df = pd.read_csv("Creditcard_data.csv")

X = df.drop(columns="Class")
y = df.loc[:, "Class"]

# 2. FEATURE ENGINEERING

# Standardise the predictors before projecting them.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project the standardised predictors onto 15 principal components.
pca = PCA(n_components=15, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# 3. BALANCE DATASET
# Resample the PCA features with SMOTEENN so the classes are closer in size.
smote_enn = SMOTEENN(random_state=42)
X_bal, y_bal = smote_enn.fit_resample(X_pca, y)

# Combine the resampled features and labels into the frame used by later cells.
data = pd.DataFrame(X_bal).assign(Class=y_bal.values)

class_counts = data["Class"].value_counts()
print("Balanced Dataset Distribution:")
print(class_counts)

Balanced Dataset Distribution:
Class
1    762
0    695
Name: count, dtype: int64


In [5]:
# 4. SAMPLING TECHNIQUES


def simple_random_sampling(df, frac=0.8, random_state=42):
    """Draw a simple random sample of rows without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    frac : float, default 0.8
        Fraction of rows to keep.
    random_state : int or None, default 42
        Seed forwarded to ``DataFrame.sample`` for reproducibility.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, carrying their original index labels.
    """
    return df.sample(frac=frac, random_state=random_state)

def systematic_sampling(df, step=3, start=0):
    """Take every ``step``-th row by position, beginning at ``start``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from; rows are selected by position, not by label.
    step : int, default 3
        Distance between selected rows. Must be at least 1.
    start : int, default 0
        Position of the first selected row. Must be non-negative.

    Returns
    -------
    pandas.DataFrame
        Rows at positions ``start, start + step, start + 2 * step, ...``.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1 or ``start`` is negative.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")
    if start < 0:
        raise ValueError("start must be non-negative")
    return df.iloc[start::step]

def stratified_sampling(df, frac=0.8, random_state=42):
    """Sample the same fraction of rows from every ``Class`` group.

    Each group is sampled on its own and the pieces are concatenated in
    group-key order. Iterating over the groups (rather than using
    ``GroupBy.apply``) keeps the ``Class`` column in the result and avoids the
    pandas deprecation warning about ``apply`` operating on grouping columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Class`` column to stratify on.
    frac : float, default 0.8
        Fraction of rows to keep from each class.
    random_state : int or None, default 42
        Seed used for every per-class draw.

    Returns
    -------
    pandas.DataFrame
        Sampled rows with all original columns, including ``Class``.
    """
    per_class_samples = [
        group.sample(frac=frac, random_state=random_state)
        for _, group in df.groupby("Class")
    ]
    if not per_class_samples:
        # pd.concat refuses an empty list; hand back an empty frame instead.
        return df.iloc[0:0]
    return pd.concat(per_class_samples)

def cluster_sampling(df):
    """Return the rows that KMeans assigns to cluster 0.

    The rows are clustered into four groups on every column except ``Class``.
    The input frame is left untouched and the temporary ``cluster`` column is
    removed from the result.
    """
    features = df.drop(columns="Class")
    clusterer = KMeans(n_clusters=4, random_state=42)
    # assign() builds a new frame, so the caller's frame never gains a column.
    tagged = df.assign(cluster=clusterer.fit_predict(features))
    in_first_cluster = tagged["cluster"] == 0
    return tagged.loc[in_first_cluster].drop(columns="cluster")

def bootstrap_sampling(df):
    """Resample the frame with replacement to its original number of rows.

    The draw is seeded with 42, so repeated calls on the same frame return the
    same rows. Individual rows may appear more than once in the result.
    """
    n_rows = df.shape[0]
    return df.sample(n=n_rows, replace=True, random_state=42)


In [6]:
# Apply each sampling technique to the balanced frame, keyed by display name.
sampling_methods = {
    label: sampler(data)
    for label, sampler in (
        ("SimpleRandom", simple_random_sampling),
        ("Systematic", systematic_sampling),
        ("Stratified", stratified_sampling),
        ("Cluster", cluster_sampling),
        ("Bootstrap", bootstrap_sampling),
    )
}

  return df.groupby("Class", group_keys=False).apply(


In [7]:
# 5. MODELS


# Classifiers to compare, keyed by the name shown in the results table.
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000),
    # Seeded like the other stochastic estimators so reruns give the same scores.
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "NaiveBayes": GaussianNB(),
    "RandomForest": RandomForestClassifier(
        n_estimators=200, max_depth=12, random_state=42
    ),
    "KNN": KNeighborsClassifier(n_neighbors=7)
}

In [8]:
# 6. EVALUATION


# Mean 5-fold accuracy (in percent) for every model / sampling-technique pair.
# The frame is created as float so it can be compared and formatted directly,
# without converting from object dtype afterwards.
results = pd.DataFrame(
    index=models.keys(), columns=sampling_methods.keys(), dtype=float
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): every sample is drawn from `data`, which was scaled,
# PCA-projected and SMOTEENN-resampled before cross-validation, and the
# bootstrap sample contains repeated rows. Folds therefore share information,
# so these accuracies are probably optimistic -- confirm before reporting them.
for samp_name, samp_data in sampling_methods.items():
    X_s = samp_data.drop("Class", axis=1)
    y_s = samp_data["Class"]

    for model_name, model in models.items():
        scores = cross_val_score(
            model,
            X_s,
            y_s,
            cv=skf,
            scoring="accuracy"
        )
        results.loc[model_name, samp_name] = round(scores.mean() * 100, 2)

# 7. RESULTS

print("\n================ FINAL ACCURACY TABLE ================\n")
print(results)

print("\n======= BEST SAMPLING TECHNIQUE PER MODEL =======\n")
for model in results.index:
    row = results.loc[model]
    best_sampling = row.idxmax()
    best_score = row.max()
    print(f"{model} → {best_sampling} ({best_score:.2f}%)")



                   SimpleRandom Systematic Stratified Cluster Bootstrap
LogisticRegression        81.39      86.63      83.97   88.89     88.33
GradientBoosting          98.63      96.91      98.46   99.69     98.83
NaiveBayes                82.51      80.66      81.82   85.41      83.6
RandomForest              97.69      98.15      97.43   99.39     98.28
KNN                       94.77      88.06      93.74   95.44     96.57


LogisticRegression → Cluster (88.89%)
GradientBoosting → Cluster (99.69%)
NaiveBayes → Cluster (85.41%)
RandomForest → Cluster (99.39%)
KNN → Bootstrap (96.57%)
