In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler

df = pd.read_csv('/content/Creditcard_data.csv')

print("Original Class Distribution:")
print(df['Class'].value_counts())

Original Class Distribution:
Class
0    763
1      9
Name: count, dtype: int64


In [None]:
X = df.drop('Class', axis=1)
y = df['Class']

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

df_balanced = pd.concat([X_resampled, y_resampled], axis=1)

print("Balanced Class Distribution:")
print(df_balanced['Class'].value_counts())

Balanced Class Distribution:
Class
0    763
1    763
Name: count, dtype: int64


In [None]:
Z = 1.96
p = 0.5
E = 0.05

n = int((Z**2 * p * (1-p)) / (E**2))

print(f"Calculated Sample Size (n): {n}")

Calculated Sample Size (n): 384


In [None]:
def simple_random_sampling(df, n):
    return df.sample(n=n, random_state=42)

def systematic_sampling(df, n):
    step = len(df) // n
    indices = np.arange(0, len(df), step)[:n]
    return df.iloc[indices]

def stratified_sampling(df, n):
    return df.groupby('Class', group_keys=False).apply(lambda x: x.sample(n // 2)).reset_index(drop=True)

def cluster_sampling(df, n):
    num_clusters = 20
    df_copy = df.copy()
    df_copy['cluster'] = np.random.randint(0, num_clusters, size=len(df))

    selected_clusters = np.random.choice(num_clusters, size=5, replace=False)
    sample = df_copy[df_copy['cluster'].isin(selected_clusters)]

    if len(sample) > n:
        return sample.sample(n=n, random_state=42).drop('cluster', axis=1)
    return sample.drop('cluster', axis=1)

def bootstrap_sampling(df, n):
    return df.sample(n=n, replace=True, random_state=42)

sampling_techniques = {
    'Sampling1 (Simple Random)': simple_random_sampling,
    'Sampling2 (Systematic)': systematic_sampling,
    'Sampling3 (Stratified)': stratified_sampling,
    'Sampling4 (Cluster)': cluster_sampling,
    'Sampling5 (Bootstrap)': bootstrap_sampling
}

In [None]:
models = {
    'M1 (Logistic Regression)': LogisticRegression(max_iter=1000),
    'M2 (Decision Tree)': DecisionTreeClassifier(random_state=42),
    'M3 (Random Forest)': RandomForestClassifier(random_state=42),
    'M4 (SVM)': SVC(),
    'M5 (Naive Bayes)': GaussianNB()
}

X_train_full, X_test, y_train_full, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
df_train = pd.concat([X_train_full, y_train_full], axis=1)

results = {model_name: {} for model_name in models.keys()}

for sample_name, sample_func in sampling_techniques.items():
    sample_data = sample_func(df_train, n)
    X_sample = sample_data.drop('Class', axis=1)
    y_sample = sample_data['Class']

    for model_name, model in models.items():
        model.fit(X_sample, y_sample)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[model_name][sample_name] = round(accuracy * 100, 2)

print("Training completed.")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return df.groupby('Class', group_keys=False).apply(lambda x: x.sample(n // 2)).reset_index(drop=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn

Training completed.


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
result_df = pd.DataFrame(results).T

print("Model Accuracy (%) per Sampling Technique:")
print(result_df)

print("\n--------------------------------------------------")
print("Which sampling technique gives higher accuracy on which model?")
print("--------------------------------------------------")

for model in result_df.index:
    best_sample = result_df.loc[model].idxmax()
    best_acc = result_df.loc[model].max()
    print(f"For {model}, the best technique is {best_sample} with Accuracy: {best_acc}%")

Model Accuracy (%) per Sampling Technique:
                          Sampling1 (Simple Random)  Sampling2 (Systematic)  \
M1 (Logistic Regression)                      92.16                   92.16   
M2 (Decision Tree)                            97.71                   97.71   
M3 (Random Forest)                            99.67                   99.67   
M4 (SVM)                                      64.71                   63.40   
M5 (Naive Bayes)                              76.80                   68.30   

                          Sampling3 (Stratified)  Sampling4 (Cluster)  \
M1 (Logistic Regression)                   89.87                92.48   
M2 (Decision Tree)                         96.08                97.71   
M3 (Random Forest)                         99.35                98.69   
M4 (SVM)                                   65.69                63.73   
M5 (Naive Bayes)                           81.05                67.65   

                          Sampling5 (Bootst