In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# 1. Load the dataset
# Ensure the path matches your folder structure
df = pd.read_csv('../data/Creditcard_data.csv')

# 2. Check initial class distribution
print(f"Original dataset shape: {Counter(df['Class'])}")

# 3. Separate features and target
X = df.drop('Class', axis=1)
y = df['Class']

# 4. Balance the dataset using RandomOverSampler
# This will increase the minority class to match the majority class
ros = RandomOverSampler(random_state=42)
X_balanced, y_balanced = ros.fit_resample(X, y)

# 5. Create the final balanced dataframe
balanced_df = pd.concat([pd.DataFrame(X_balanced), pd.Series(y_balanced, name='Class')], axis=1)

print(f"Balanced dataset shape: {Counter(balanced_df['Class'])}")
balanced_df.head()

Original dataset shape: Counter({0: 763, 1: 9})
Balanced dataset shape: Counter({0: 763, 1: 763})


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [2]:
import numpy as np

# Set sample size
n = 350 

# 1. Simple Random Sampling
s1 = balanced_df.sample(n=n, random_state=1)

# 2. Systematic Sampling
# Calculate interval k
k = len(balanced_df) // n
s2 = balanced_df.iloc[::k][:n]

# 3. Stratified Sampling 
# Ensures equal representation of Class 0 and 1 in the sample
s3 = balanced_df.groupby('Class', group_keys=False).apply(lambda x: x.sample(n // 2, random_state=3))

# 4. Cluster Sampling
# We'll create 10 clusters and pick 2-3 randomly
num_clusters = 10
balanced_df['cluster'] = np.repeat(np.arange(num_clusters), len(balanced_df) // num_clusters + 1)[:len(balanced_df)]
selected_clusters = np.random.choice(np.arange(num_clusters), size=3, replace=False)
s4 = balanced_df[balanced_df['cluster'].isin(selected_clusters)].sample(n=n, random_state=4)
# Drop the helper column after sampling
s4 = s4.drop(columns=['cluster'])
balanced_df = balanced_df.drop(columns=['cluster'])

# 5. Bootstrap Sampling
# Sampling with replacement
s5 = balanced_df.sample(n=n, replace=True, random_state=5)

print(f"Sample Sizes: S1={len(s1)}, S2={len(s2)}, S3={len(s3)}, S4={len(s4)}, S5={len(s5)}")

Sample Sizes: S1=350, S2=350, S3=350, S4=350, S5=350


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# 1. Prepare global test set (ensure we use the original balanced_df features)
X_prep = balanced_df.drop('Class', axis=1)
y_prep = balanced_df['Class']
X_train_full, X_test_global, y_train_full, y_test_global = train_test_split(
    X_prep, y_prep, test_size=0.2, random_state=42
)

# 2. Define the 5 Models
models = {
    "M1": LogisticRegression(max_iter=2000),
    "M2": DecisionTreeClassifier(random_state=42),
    "M3": RandomForestClassifier(random_state=42),
    "M4": SVC(random_state=42),
    "M5": KNeighborsClassifier()
}

samples = [s1, s2, s3, s4, s5]
sample_names = ['Sampling1', 'Sampling2', 'Sampling3', 'Sampling4', 'Sampling5']
results_table = pd.DataFrame(index=models.keys(), columns=sample_names)

# 3. Loop through models and samples with a safety check
for model_key, model in models.items():
    for i, sample in enumerate(samples):
        # Safety Check: Ensure 'Class' is actually in this specific sample
        if 'Class' not in sample.columns:
            # If it's missing, we grab the labels from the balanced_df using the sample's index
            y_sample = balanced_df.loc[sample.index, 'Class']
            X_sample = sample.drop(columns=[c for c in sample.columns if c not in X_prep.columns])
        else:
            X_sample = sample.drop('Class', axis=1)
            y_sample = sample['Class']
        
        # Train and Predict
        model.fit(X_sample, y_sample)
        predictions = model.predict(X_test_global)
        accuracy = accuracy_score(y_test_global, predictions) * 100
        results_table.loc[model_key, sample_names[i]] = f"{accuracy:.2f}"

print("Final Accuracy Table:")
print(results_table)

Final Accuracy Table:
   Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1     91.83     92.81     92.48     86.27     93.46
M2     97.06     99.35     96.73     92.81     97.06
M3     99.35    100.00    100.00     98.69     99.02
M4     63.73     66.01     64.05     70.26     73.86
M5     93.46     94.12     93.46     74.84     93.79
