In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler
from sklearn.cluster import KMeans

data = pd.read_csv(
    "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
)

X = data.drop("Class", axis=1)
y = data["Class"]
print("Original Class Distribution:")
print(data['Class'].value_counts())
print()

Original Class Distribution:
Class
0    763
1      9
Name: count, dtype: int64



In [2]:
ros = RandomOverSampler(random_state=42)
X_bal, y_bal=ros.fit_resample(X, y)
balanced_df=pd.concat([X_bal, y_bal], axis=1)

print(f"Balanced dataset size: {len(balanced_df)}")
print(f"Balanced class distribution:\n{balanced_df['Class'].value_counts()}")
print()

Balanced dataset size: 1526
Balanced class distribution:
Class
0    763
1    763
Name: count, dtype: int64



In [3]:
train_df, test_df = train_test_split(
    balanced_df, test_size=0.3, random_state=42, stratify=balanced_df['Class']
)

print(f"Training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")

Training set size: 1068
Test set size: 458


In [4]:
def simple_random(df, frac=0.7):
    return df.sample(frac=frac, random_state=42)

def stratified(df, frac=0.7):
    return df.groupby("Class", group_keys=False).apply(
        lambda x: x.sample(frac=frac, random_state=42)
    )

def systematic(df, step=2):
    return df.iloc[::step].reset_index(drop=True)

def cluster_sampling(df, n_clusters=10, pick=5):
    features = df.drop("Class", axis=1)
    kmeans=KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df_copy=df.copy()
    df_copy["cluster"] = kmeans.fit_predict(features)
    np.random.seed(42)
    chosen=np.random.choice(n_clusters, pick, replace=False)
    return df_copy[df_copy["cluster"].isin(chosen)].drop("cluster", axis=1)

def bootstrap(df, frac=0.7):
    n = int(frac * len(df))
    return df.sample(n=n, replace=True, random_state=42)

In [5]:
sample1 = simple_random(train_df, frac=0.7)
print(f"Sampling1: {len(sample1)} samples")
sample2 = stratified(train_df, frac=0.7)
print(f"Sampling2: {len(sample2)} samples")
sample3 = systematic(train_df, step=2)
print(f"Sampling3: {len(sample3)} samples")
sample4 = cluster_sampling(train_df, n_clusters=10, pick=5)
print(f"Sampling4: {len(sample4)} samples")
sample5 = bootstrap(train_df, frac=0.7)
print(f"Sampling5: {len(sample5)} samples")
samples = {
    "Sampling1": sample1,
    "Sampling2": sample2,
    "Sampling3": sample3,
    "Sampling4": sample4,
    "Sampling5": sample5
}
print()

Sampling1: 748 samples
Sampling2: 748 samples
Sampling3: 534 samples
Sampling4: 840 samples
Sampling5: 747 samples



  return df.groupby("Class", group_keys=False).apply(


In [6]:
models = {
    "M1": LogisticRegression(max_iter=3000, C=0.5, random_state=42),
    "M2": DecisionTreeClassifier(max_depth=5, random_state=42),
    "M3": RandomForestClassifier(
        n_estimators=100,
        max_depth=6,
        min_samples_split=10,
        random_state=42
    ),
    "M4": GaussianNB(var_smoothing=1e-8),
    "M5": SVC(C=0.5, kernel="rbf", random_state=42)
}

In [9]:
results=pd.DataFrame(index=models.keys(), columns=samples.keys())
X_test=test_df.drop("Class", axis=1)
y_test=test_df["Class"]
for samp_name, train_samp_df in samples.items():
    X_train=train_samp_df.drop("Class", axis=1)
    y_train=train_samp_df["Class"]

    for model_name, model in models.items():
        if model_name in ["M1", "M5"]:
            scaler=StandardScaler()
            X_train_scaled=scaler.fit_transform(X_train)
            X_test_scaled=scaler.transform(X_test)
            model.fit(X_train_scaled, y_train)
            y_pred=model.predict(X_test_scaled)
        else:
            model.fit(X_train, y_train)
            y_pred=model.predict(X_test)

        accuracy=accuracy_score(y_test, y_pred) * 100
        results.loc[model_name, samp_name] = round(accuracy, 2)
        print(f"  {model_name}: {accuracy:.2f}%")

    print()


  M1: 90.39%
  M2: 96.94%
  M3: 99.56%
  M4: 75.76%
  M5: 96.94%

  M1: 91.48%
  M2: 96.94%
  M3: 99.34%
  M4: 77.95%
  M5: 96.94%

  M1: 89.96%
  M2: 97.60%
  M3: 99.34%
  M4: 81.88%
  M5: 96.29%

  M1: 87.77%
  M2: 91.27%
  M3: 92.79%
  M4: 86.24%
  M5: 90.61%

  M1: 89.96%
  M2: 97.82%
  M3: 99.56%
  M4: 67.69%
  M5: 96.51%



In [10]:
print("ACCURACY RESULTS TABLE (%)")
print(results)
print("BEST SAMPLING TECHNIQUE FOR EACH MODEL")
best_per_model=results.astype(float).idxmax(axis=1)
for model, best_samp in best_per_model.items():
    best_acc=results.loc[model, best_samp]
    print(f"{model}: {best_samp} (Accuracy: {best_acc}%)")
print()


ACCURACY RESULTS TABLE (%)
   Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1     90.39     91.48     89.96     87.77     89.96
M2     96.94     96.94      97.6     91.27     97.82
M3     99.56     99.34     99.34     92.79     99.56
M4     75.76     77.95     81.88     86.24     67.69
M5     96.94     96.94     96.29     90.61     96.51
BEST SAMPLING TECHNIQUE FOR EACH MODEL
M1: Sampling2 (Accuracy: 91.48%)
M2: Sampling5 (Accuracy: 97.82%)
M3: Sampling1 (Accuracy: 99.56%)
M4: Sampling4 (Accuracy: 86.24%)
M5: Sampling1 (Accuracy: 96.94%)

