<a href="https://colab.research.google.com/github/jyotisolanki03/Sampling/blob/main/Sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split


data = pd.read_csv('/content/Creditcard_data.csv')
print("Original Dataset Class Distribution:")
print(data['Class'].value_counts())


X = data.drop('Class', axis=1)
y = data['Class']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE (Oversampling Minority Class):")
print(pd.Series(y_smote).value_counts())


balanced_data = pd.concat([pd.DataFrame(X_smote), pd.DataFrame(y_smote, columns=['Class'])], axis=1)
balanced_data.to_csv('balanced_credit_card_dataset.csv',index=False)

Original Dataset Class Distribution:
Class
0    763
1      9
Name: count, dtype: int64

After SMOTE (Oversampling Minority Class):
Class
0    534
1    534
Name: count, dtype: int64


In [2]:
import pandas as pd
from sklearn.utils import resample


data = pd.read_csv('/content/balanced_credit_card_dataset.csv')
print("Dataset Shape:", data.shape)

# Define sample size detection formula
def calculate_sample_size(population_size, margin_of_error=0.05, confidence_level=0.95):
    from math import ceil
    import scipy.stats as stats
    # Z-score for the confidence level
    z_score = stats.norm.ppf((1 + confidence_level) / 2)
    p = 0.5
    numerator = (z_score ** 2) * p * (1 - p)
    denominator = (margin_of_error ** 2)
    sample_size = numerator / denominator
    corrected_sample_size = sample_size / (1 + ((sample_size - 1) / population_size))
    return ceil(corrected_sample_size)


population_size = len(data)
print("Population Size:", population_size)


sample_sizes = []
for i in range(1, 6):
    margin_of_error = 0.05 * i
    sample_size = calculate_sample_size(population_size, margin_of_error)
    sample_sizes.append(sample_size)

print("\nCalculated Sample Sizes:", sample_sizes)


samples = []
for i, size in enumerate(sample_sizes):
    sample = resample(data, n_samples=size, stratify=data['Class'], random_state=i)
    samples.append(sample)

    sample.to_csv(f'sample_{i+1}.csv', index=False)
    print(f"Sample {i+1} created with size {size} and saved to 'sample_{i+1}.csv'.")


for i, sample in enumerate(samples):
    print(f"\nSample {i+1} Class Distribution:")
    print(sample['Class'].value_counts())


Dataset Shape: (1068, 31)
Population Size: 1068

Calculated Sample Sizes: [283, 89, 42, 24, 16]
Sample 1 created with size 283 and saved to 'sample_1.csv'.
Sample 2 created with size 89 and saved to 'sample_2.csv'.
Sample 3 created with size 42 and saved to 'sample_3.csv'.
Sample 4 created with size 24 and saved to 'sample_4.csv'.
Sample 5 created with size 16 and saved to 'sample_5.csv'.

Sample 1 Class Distribution:
Class
1    142
0    141
Name: count, dtype: int64

Sample 2 Class Distribution:
Class
0    45
1    44
Name: count, dtype: int64

Sample 3 Class Distribution:
Class
0    21
1    21
Name: count, dtype: int64

Sample 4 Class Distribution:
Class
0    12
1    12
Name: count, dtype: int64

Sample 5 Class Distribution:
Class
0    8
1    8
Name: count, dtype: int64


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


samples = [pd.read_csv(f'/content/sample_{i+1}.csv') for i in range(5)]


models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}


results = {}


for sample_idx, sample in enumerate(samples):


    X = sample.drop(columns=['Class'])
    y = sample['Class']


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    sample_results = {}

    for model_name, model in models.items():

        model.fit(X_train, y_train)


        y_pred = model.predict(X_test)


        accuracy = accuracy_score(y_test, y_pred)
        sample_results[model_name] = accuracy

    results[f"Sample {sample_idx+1}"] = sample_results

# Display results for all samples and models
print("\n=== Summary of Results ===")
for sample_name, sample_results in results.items():
    print(f"\n{sample_name}:")
    for model_name, accuracy in sample_results.items():
        print(f"{model_name}: {accuracy:.2f}")



=== Summary of Results ===

Sample 1:
Logistic Regression: 0.99
Decision Tree: 0.96
Random Forest: 1.00
SVM: 0.64
KNN: 0.76

Sample 2:
Logistic Regression: 0.93
Decision Tree: 0.74
Random Forest: 0.93
SVM: 0.59
KNN: 0.74

Sample 3:
Logistic Regression: 0.92
Decision Tree: 0.85
Random Forest: 0.92
SVM: 0.46
KNN: 0.38

Sample 4:
Logistic Regression: 0.75
Decision Tree: 0.75
Random Forest: 0.62
SVM: 0.62
KNN: 0.62

Sample 5:
Logistic Regression: 0.80
Decision Tree: 0.80
Random Forest: 0.80
SVM: 0.60
KNN: 0.60
