In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import RandomOverSampler
data_url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
df = pd.read_csv(data_url)

print("Original class distribution:")
print(df["Class"].value_counts())


Original class distribution:
Class
0    763
1      9
Name: count, dtype: int64


In [2]:
X = df.drop(columns=["Class"])
y = df["Class"]

oversampler = RandomOverSampler(random_state=1)
X_bal, y_bal = oversampler.fit_resample(X, y)

balanced_data = pd.concat([X_bal, y_bal], axis=1)

print("\nBalanced class distribution:")
print(balanced_data["Class"].value_counts())



Balanced class distribution:
Class
0    763
1    763
Name: count, dtype: int64


In [3]:
N = len(balanced_data)
error_rate = 0.05
sample_size = int(N / (1 + N * (error_rate ** 2)))

print("Calculated sample size:", sample_size)


Calculated sample size: 316


In [4]:
#simple random sampling
def random_sample(df, size):
    return df.sample(n=size, random_state=42)


In [5]:
#systematic sampling
def systematic_sample(df, size):
    interval = len(df) // size
    idx = np.arange(0, len(df), interval)
    return df.iloc[idx[:size]]


In [6]:
#stratified sampling
def stratified_sample(df, size):
    per_class = size // df["Class"].nunique()
    return df.groupby("Class", group_keys=False).apply(
        lambda x: x.sample(min(len(x), per_class), random_state=42)
    )


In [7]:
#cluster sampling
def cluster_sample(df):
    df_temp = df.copy()
    df_temp["cluster"] = pd.cut(df_temp["V1"], bins=10, labels=False)

    while True:
        chosen = np.random.choice(df_temp["cluster"].unique())
        subset = df_temp[df_temp["cluster"] == chosen]
        if subset["Class"].nunique() > 1:
            return subset.drop(columns=["cluster"])


In [8]:
#bootstrap sampling
def bootstrap_sample(df, size):
    return df.sample(n=size, replace=True, random_state=42)


In [9]:
#creating 5 samples
sample_sets = {
    "Sampling1_Random": random_sample(balanced_data, sample_size),
    "Sampling2_Systematic": systematic_sample(balanced_data, sample_size),
    "Sampling3_Stratified": stratified_sample(balanced_data, sample_size),
    "Sampling4_Cluster": cluster_sample(balanced_data),
    "Sampling5_Bootstrap": bootstrap_sample(balanced_data, sample_size)
}


  return df.groupby("Class", group_keys=False).apply(


In [11]:
#defining 5 ML models
models = {
    "M1_Logistic": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(),
    "M3_RandomForest": RandomForestClassifier(),
    "M4_SVM": SVC(),
    "M5_KNN": KNeighborsClassifier()
}


In [12]:
final_results = {}

for sample_name, sample_df in sample_sets.items():
    X_sample = sample_df.drop(columns=["Class"])
    y_sample = sample_df["Class"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    final_results[sample_name] = {}

    for model_name, model in models.items():
        if model_name in ["M1_Logistic", "M4_SVM", "M5_KNN"]:
            model.fit(X_train_scaled, y_train)
            predictions = model.predict(X_test_scaled)
        else:
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)

        acc = accuracy_score(y_test, predictions)
        final_results[sample_name][model_name] = round(acc * 100, 2)
accuracy_df = pd.DataFrame(final_results)
print("\nAccuracy Table:")
print(accuracy_df)
print("\nBest Sampling Technique for each Model:")
print(accuracy_df.idxmax(axis=1))



Accuracy Table:
                 Sampling1_Random  Sampling2_Systematic  Sampling3_Stratified  \
M1_Logistic                 84.38                 85.94                 90.62   
M2_DecisionTree             92.19                 96.88                 96.88   
M3_RandomForest            100.00                100.00                100.00   
M4_SVM                      90.62                 93.75                 98.44   
M5_KNN                      90.62                 85.94                 92.19   

                 Sampling4_Cluster  Sampling5_Bootstrap  
M1_Logistic                  100.0                92.19  
M2_DecisionTree              100.0               100.00  
M3_RandomForest              100.0               100.00  
M4_SVM                       100.0               100.00  
M5_KNN                       100.0                98.44  

Best Sampling Technique for each Model:
M1_Logistic        Sampling4_Cluster
M2_DecisionTree    Sampling4_Cluster
M3_RandomForest     Sampling1_Ran