## Assignment: 1 Sampling

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

Converting the imbalance Dataset into Balance Dataset with Oversampling method

In [41]:
import pandas as pd
from sklearn.utils import resample

url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
df = pd.read_csv(url)


majority_class = df[df['Class'] == 0]
minority_class = df[df['Class'] == 1]


balanced_minority = resample(minority_class,
                             replace=True,
                             n_samples=len(majority_class),
                             random_state=42)

balanced_df = pd.concat([majority_class, balanced_minority])

print(balanced_df['Class'].value_counts())


Class
0    763
1    763
Name: count, dtype: int64


## 1.  Simple Random Sampling

In [42]:
simple_random_sample = balanced_df.sample(n=500, random_state=42)


## 2. Stratified Sampling

In [43]:
from sklearn.model_selection import train_test_split

_, stratified_sample = train_test_split(balanced_df,
                                        test_size=500/len(balanced_df),
                                        stratify=balanced_df['Class'],
                                        random_state=42)


## 3. Systematic Sampling

In [44]:
import numpy as np

step = len(balanced_df) // 500
systematic_sample = balanced_df.iloc[::step, :]


## 4. Cluster Sampling

In [45]:
import numpy as np


bins = [0, 50, 100, 200, 500, np.inf]
labels = [0, 1, 2, 3, 4]
df['Cluster'] = pd.cut(df['Amount'], bins=bins, labels=labels)
print(df['Cluster'].value_counts())
np.random.seed(42)
df['Cluster'] = np.random.randint(0, 5, size=len(df))
print(df['Cluster'].value_counts())
selected_clusters = np.random.choice(df['Cluster'].unique(), size=2, replace=False)
cluster_sample = df[df['Cluster'].isin(selected_clusters)]

print(cluster_sample.head())




Cluster
0    560
1     99
2     51
3     34
4     23
Name: count, dtype: int64
Cluster
0    163
3    162
4    160
1    146
2    141
Name: count, dtype: int64
   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
6     4  1.229658  0.141004  0.045371  1.202613  0.191881  0.272708 -0.005159   
7     7 -0.644269  1.417964  1.074380 -0.492199  0.948934  0.428118  1.120631   
8     7 -0.894286  0.286157 -0.113192 -0.271526  2.669599  3.721818  0.370145   

         V8        V9  ...       V22       V23       V24       V25       V26  \
0  0.098698  0.363787  ...  0.277838 -0.110474  0.066928  0.128539 -0.189115   
2  0.247676 -1.514654  ...  0.771679  0.909412 -0.689281 -0.327642 -0.139097   
6  0.081213  0.464960  ... -0.270710 -0.154104 -0.780055  0.750137 -0.257237   
7 -3.807864  0.615375  ... -1.01545

## 5. Bootstrap Sampling

In [46]:
bootstrap_sample = balanced_df.sample(n=500, replace=True, random_state=42)


##  Evaluate Sampling Techniques on Diverse ML Models

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Support Vector Machine": SVC(kernel='rbf', random_state=42)
}

sampling_methods = {
    "Simple Random": simple_random_sample,
    "Stratified": stratified_sample,
    "Systematic": systematic_sample,
    "Cluster": cluster_sample,
    "Bootstrap": bootstrap_sample
}


results = {}


for sampling_name, sample in sampling_methods.items():
    X = sample.drop(columns=['Class'])
    y = sample['Class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results[(sampling_name, model_name)] = acc


for (sampling, model), accuracy in results.items():
    print(f"Sampling: {sampling}, Model: {model}, Accuracy: {accuracy:.2f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Sampling: Simple Random, Model: Logistic Regression, Accuracy: 0.50
Sampling: Simple Random, Model: Decision Tree, Accuracy: 0.50
Sampling: Simple Random, Model: Random Forest, Accuracy: 0.50
Sampling: Simple Random, Model: K-Nearest Neighbors, Accuracy: 0.50
Sampling: Simple Random, Model: Support Vector Machine, Accuracy: 0.75
Sampling: Stratified, Model: Logistic Regression, Accuracy: 0.93
Sampling: Stratified, Model: Decision Tree, Accuracy: 0.98
Sampling: Stratified, Model: Random Forest, Accuracy: 0.99
Sampling: Stratified, Model: K-Nearest Neighbors, Accuracy: 0.94
Sampling: Stratified, Model: Support Vector Machine, Accuracy: 0.70
Sampling: Systematic, Model: Logistic Regression, Accuracy: 0.95
Sampling: Systematic, Model: Decision Tree, Accuracy: 0.99
Sampling: Systematic, Model: Random Forest, Accuracy: 1.00
Sampling: Systematic, Model: K-Nearest Neighbors, Accuracy: 0.95
Sampling: Systematic, Model: Support Vector Machine, Accuracy: 0.64
Sampling: Cluster, Model: Logistic Re

In [55]:
!pip install tabulate
from tabulate import tabulate
table_data = [["Sampling Method"] + list(models.keys())]
for sampling in sampling_methods.keys():
    row = [sampling]
    for model in models.keys():
        row.append(results.get((sampling, model), "N/A"))
    table_data.append(row)


print(tabulate(table_data, headers="firstrow", tablefmt="grid"))


+-------------------+-----------------------+-----------------+-----------------+-----------------------+--------------------------+
| Sampling Method   |   Logistic Regression |   Decision Tree |   Random Forest |   K-Nearest Neighbors |   Support Vector Machine |
| Simple Random     |              0.5      |        0.5      |        0.5      |              0.5      |                 0.75     |
+-------------------+-----------------------+-----------------+-----------------+-----------------------+--------------------------+
| Stratified        |              0.930693 |        0.980198 |        0.990099 |              0.940594 |                 0.70297  |
+-------------------+-----------------------+-----------------+-----------------+-----------------------+--------------------------+
| Systematic        |              0.95098  |        0.990196 |        1        |              0.95098  |                 0.637255 |
+-------------------+-----------------------+-----------------+------

In [65]:
## Undersampling


majority_class = df[df['Class'] == 0]
minority_class = df[df['Class'] == 1]


from sklearn.utils import resample

undersampled_majority = resample(
    majority_class,
    replace=False,
    n_samples=len(minority_class),
    random_state=42
)


undersampled_df = pd.concat([undersampled_majority, minority_class])

print(undersampled_df['Class'].value_counts())


Class
0    9
1    9
Name: count, dtype: int64


In [56]:
print(len(undersampled_df))


18


In [66]:
# Simple Random Sampling

sample_size = min(100, len(undersampled_df))
simple_random_sample = undersampled_df.sample(n=sample_size, random_state=42)


# Stratified Sampling
from sklearn.model_selection import train_test_split

dataset_size = len(undersampled_df)

if dataset_size > 100:
    test_size = 100 / dataset_size
else:
    test_size = max(1, dataset_size - 1)


try:
    _, stratified_sample = train_test_split(
        undersampled_df,
        test_size=test_size,
        stratify=undersampled_df['Class'],
        random_state=42
    )
    print(f"Stratified Sample Size: {len(stratified_sample)}")
except ValueError as e:
    print(f"Error during stratified sampling: {e}")

print(f"Stratified Sample Size: {len(stratified_sample)}")
dataset_size = len(undersampled_df)
step = max(1, dataset_size // 100)


try:
    systematic_sample = undersampled_df.iloc[::step, :]
    print(f"Systematic Sample Size: {len(systematic_sample)}")
except Exception as e:
    print(f"Error during systematic sampling: {e}")


import numpy as np
np.random.seed(42)
undersampled_df['Cluster'] = np.random.randint(0, 5, size=len(undersampled_df))
selected_clusters = np.random.choice(undersampled_df['Cluster'].unique(), size=2, replace=False)
cluster_sample = undersampled_df[undersampled_df['Cluster'].isin(selected_clusters)]


bootstrap_sample = undersampled_df.sample(n=100, replace=True, random_state=42)


sampling_methods = {
    "Simple Random": simple_random_sample,
    "Stratified": stratified_sample,
    "Systematic": systematic_sample,
    "Cluster": cluster_sample,
    "Bootstrap": bootstrap_sample
}


Error during stratified sampling: The train_size = 1 should be greater or equal to the number of classes = 2
Stratified Sample Size: 501
Systematic Sample Size: 18


In [67]:


sampling_methods = {
    "Simple Random": simple_random_sample,
    "Stratified": stratified_sample,
    "Systematic": systematic_sample,
    "Cluster": cluster_sample,
    "Bootstrap": bootstrap_sample
}
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Support Vector Machine": SVC(kernel='rbf', random_state=42)
}


results = {}


for sampling_name, sample in sampling_methods.items():
    X = sample.drop(columns=['Class', 'Cluster'], errors='ignore')
    y = sample['Class']

    for model_name, model in models.items():
        model.fit(X, y)
        y_pred = model.predict(X)
        acc = accuracy_score(y, y_pred)
        results[(sampling_name, model_name)] = acc
import pandas as pd
from tabulate import tabulate

results_df = pd.DataFrame(
    [
        {"Sampling Method": sampling, "Model": model, "Accuracy": accuracy}
        for (sampling, model), accuracy in results.items()
    ]
)

pivot_results = results_df.pivot(
    index="Sampling Method", columns="Model", values="Accuracy"
)
print(tabulate(pivot_results, headers="keys", tablefmt="grid"))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


+-------------------+-----------------+-----------------------+-----------------------+-----------------+--------------------------+
| Sampling Method   |   Decision Tree |   K-Nearest Neighbors |   Logistic Regression |   Random Forest |   Support Vector Machine |
| Bootstrap         |               1 |              0.98     |               1       |               1 |                 0.65     |
+-------------------+-----------------+-----------------------+-----------------------+-----------------+--------------------------+
| Cluster           |               1 |              0.625    |               1       |               1 |                 0.75     |
+-------------------+-----------------+-----------------------+-----------------------+-----------------+--------------------------+
| Simple Random     |               1 |              0.611111 |               1       |               1 |                 0.666667 |
+-------------------+-----------------+-----------------------+------