In [28]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans


In [29]:
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
df = pd.read_csv(url)

X = df.drop("Class", axis=1)
y = df["Class"]

print("Original class distribution:")
print(y.value_counts())
print()


Original class distribution:
Class
0    763
1      9
Name: count, dtype: int64



In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)


In [31]:
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

train_df = pd.concat([X_train_bal, y_train_bal], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)


In [32]:
def simple_random(df, frac=0.7):
    return df.sample(frac=frac, random_state=42)

def stratified(df, frac=0.7):
    return df.groupby("Class", group_keys=False).sample(frac=frac, random_state=42)

def systematic(df, step=2):
    return df.iloc[::step]

def cluster(df, k=6, choose=3):
    X_feat = df.drop("Class", axis=1)
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    temp = df.copy()
    temp["cluster"] = km.fit_predict(X_feat)
    selected = np.random.choice(temp["cluster"].unique(), choose, replace=False)
    return temp[temp["cluster"].isin(selected)].drop("cluster", axis=1)

def full_smote(df):
    return df


In [33]:
samples = {
    "Sampling1": simple_random(train_df),
    "Sampling2": stratified(train_df),
    "Sampling3": systematic(train_df),
    "Sampling4": cluster(train_df),
    "Sampling5": full_smote(train_df)
}


In [34]:
models = {
    "M1": LogisticRegression(max_iter=2000),
    "M2": DecisionTreeClassifier(max_depth=5),
    "M3": RandomForestClassifier(n_estimators=80, max_depth=6),
    "M4": KNeighborsClassifier(n_neighbors=7),
    "M5": GaussianNB()
}


In [35]:
results = pd.DataFrame(index=models.keys(), columns=samples.keys())

scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(test_df.drop("Class", axis=1))
y_test = test_df["Class"]

for samp_name, samp_df in samples.items():
    X_train = samp_df.drop("Class", axis=1)
    y_train = samp_df["Class"]

    X_train_scaled = scaler.fit_transform(X_train)

    for model_name, model in models.items():
        if model_name in ["M1", "M4"]:
            model.fit(X_train_scaled, y_train)
            preds = model.predict(X_test_scaled)
        else:
            model.fit(X_train, y_train)
            preds = model.predict(X_test)

        acc = accuracy_score(y_test, preds) * 100
        results.loc[model_name, samp_name] = round(acc, 2)


In [36]:
print("ACCURACY RESULTS TABLE (%)")
print(results)

print("\nBEST SAMPLING TECHNIQUE FOR EACH MODEL")
best = results.astype(float).idxmax(axis=1)

for m in results.index:
    print(f"{m}: {best[m]} ({results.loc[m, best[m]]}%)")



ACCURACY RESULTS TABLE (%)
   Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1     56.47     59.48     48.28     60.34     54.74
M2     93.53     90.52     90.52      93.1     90.95
M3     98.71     98.71     98.28     98.71     98.28
M4     77.59     73.28     72.84     83.62     81.47
M5     96.12     96.55     96.98      94.4     95.26

BEST SAMPLING TECHNIQUE FOR EACH MODEL
M1: Sampling4 (60.34%)
M2: Sampling1 (93.53%)
M3: Sampling1 (98.71%)
M4: Sampling4 (83.62%)
M5: Sampling3 (96.98%)
