<a href="https://colab.research.google.com/github/Aksh444/Assignment-Sampling/blob/main/Untitled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('Creditcard_data.csv')

# Check class distribution
print(df['Class'].value_counts())

# Balance dataset using SMOTE
X = df.drop(columns=['Class'])
y = df['Class']
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Scale features
scaler = StandardScaler()
X_balanced = scaler.fit_transform(X_balanced)

# Define sample size (Assume a fixed proportion for now)
sample_size = int(0.2 * len(X_balanced))

# Create five different samples
samples = [pd.DataFrame(X_balanced).sample(n=sample_size, random_state=i) for i in range(5)]
labels = [y_balanced.loc[sample.index] for sample in samples]

# Define five sampling techniques (examples: random, stratified, systematic, etc.)
def sampling_technique1(X, y): return X, y  # Example: No modification
def sampling_technique2(X, y): return X.sample(frac=0.8, random_state=1), y.sample(frac=0.8, random_state=1)
def sampling_technique3(X, y): return X.iloc[::2], y.iloc[::2]  # Example: Systematic Sampling
def sampling_technique4(X, y): return X.sample(n=int(0.5 * len(X)), random_state=4), y.sample(n=int(0.5 * len(y)), random_state=4)
def sampling_technique5(X, y): return X.sample(n=int(0.7 * len(X)), random_state=5), y.sample(n=int(0.7 * len(y)), random_state=5)

sampling_functions = [sampling_technique1, sampling_technique2, sampling_technique3, sampling_technique4, sampling_technique5]

# Define models
models = [
    LogisticRegression(max_iter=500),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    SVC(),
    KNeighborsClassifier()
]

# Train models and store accuracy results
results = np.zeros((5, 5))  # 5 models x 5 sampling techniques

for i, model in enumerate(models):
    for j, (sample_X, sample_y) in enumerate(zip(samples, labels)):
        X_train, X_test, y_train, y_test = train_test_split(sample_X, sample_y, test_size=0.2, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[i, j] = accuracy_score(y_test, y_pred)

# Convert results to DataFrame
accuracy_df = pd.DataFrame(results, columns=["Sampling1", "Sampling2", "Sampling3", "Sampling4", "Sampling5"],
                            index=["M1", "M2", "M3", "M4", "M5"])
print(accuracy_df)

accuracy_df.to_csv('./result.csv')


Class
0    763
1      9
Name: count, dtype: int64
    Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1   0.852459   0.967213   0.852459   0.885246   0.901639
M2   0.950820   1.000000   1.000000   1.000000   0.983607
M3   0.950820   0.918033   0.983607   0.983607   0.950820
M4   0.950820   0.983607   0.934426   0.967213   0.918033
M5   0.819672   0.901639   0.819672   0.901639   0.852459


In [4]:
# Determine best sampling technique for each model
best_sampling = accuracy_df.idxmax(axis=1)
print("\nBest sampling technique for each model:")
print(best_sampling)



Best sampling technique for each model:
M1    Sampling2
M2    Sampling3
M3    Sampling3
M4    Sampling2
M5    Sampling2
dtype: object
