In [15]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [16]:
from sklearn.svm import SVC

In [17]:
# Download the dataset
url = 'https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv'
df = pd.read_csv(url)

In [18]:
# Count the number of samples in each class
class_counts = df["Class"].value_counts()

In [19]:
# Determine the majority and minority class
majority_class = class_counts.idxmax()
minority_class = class_counts.idxmin()


In [20]:
# Split the dataset into features and target variable
X = df.drop("Class", axis=1)
y = df["Class"]

In [21]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Create a list of five different sampling techniques
samplings = [
    RandomOverSampler(random_state=42),
    SMOTE(random_state=42),
    ADASYN(random_state=42),
    RandomUnderSampler(random_state=42),
    TomekLinks()
]

In [23]:
# Create a list of five different machine learning models
models = [
    DecisionTreeClassifier(random_state=42),
    GaussianNB(),
    KNeighborsClassifier(n_neighbors=7),
    LogisticRegression(random_state=42),
    SVC(random_state=42)
]

In [24]:
# Loop through each sampling technique and each machine learning model
for i, sampling in enumerate(samplings):
    for j, model in enumerate(models):
        # Apply the sampling technique to the training set
        X_train_resampled, y_train_resampled = sampling.fit_resample(X_train, y_train)

        # Train the machine learning model on the resampled training set
        model.fit(X_train_resampled, y_train_resampled)

        # Make predictions on the testing set
        y_pred = model.predict(X_test)

        # Compute the evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Print the evaluation metrics
        print(f"Sampling{i+1} - Model{j+1}:")
        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1 Score: {f1:.4f}")

Sampling1 - Model1:
  Accuracy: 0.9871
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling1 - Model2:
  Accuracy: 0.9355
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling1 - Model3:
  Accuracy: 0.9677
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling1 - Model4:
  Accuracy: 0.8774
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Sampling1 - Model5:
  Accuracy: 0.6968
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling2 - Model1:
  Accuracy: 0.9419
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling2 - Model2:
  Accuracy: 0.8581
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling2 - Model3:
  Accuracy: 0.7032
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling2 - Model4:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Accuracy: 0.8774
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling2 - Model5:
  Accuracy: 0.6710
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling3 - Model1:
  Accuracy: 0.9548
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling3 - Model2:
  Accuracy: 0.8645
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling3 - Model3:
  Accuracy: 0.6968
  Precision: 0.0208
  Recall: 1.0000
  F1 Score: 0.0408
Sampling3 - Model4:
  Accuracy: 0.8710
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Sampling3 - Model5:
  Accuracy: 0.6968
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling4 - Model1:
  Accuracy: 0.5742
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling4 - Model2:
  Accuracy: 0.6968
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling4 - Model3:
  Accuracy: 0.6452
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling4 - Model4:
  Accuracy: 0.5935
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling4 - Model5:
  Accuracy: 0.6258
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling5 - Model1:
  Accuracy: 0.9742
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


Sampling5 - Model2:
  Accuracy: 0.9806
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling5 - Model3:
  Accuracy: 0.9935
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling5 - Model4:
  Accuracy: 0.9935
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
Sampling5 - Model5:
  Accuracy: 0.9935
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
samplings = [
    "RandomOverSampler",
    "SMOTE",
    "ADASYN",
    "RandomUnderSampler",
    "TomekLinks"
]

models = [
    "DecisionTreeClassifier",
    "GaussianNB",
    "KNeighborsClassifier",
    "LogisticRegression",
    "SVC"
]

# Create an empty dictionary to store the accuracy scores
accuracy_dict = {}

In [26]:
# Loop through each sampling technique and each machine learning model
for i, sampling in enumerate(samplings):
    accuracy_dict[sampling] = {}
    for j, model in enumerate(models):
        # Import the model class and create an instance of the model
        ModelClass = globals()[model]
        model_instance = ModelClass()

        # Import the sampling technique class and create an instance of the technique
        SamplingClass = globals()[sampling]
        sampling_instance = SamplingClass()

        # Apply the sampling technique to the training set
        X_train_resampled, y_train_resampled = sampling_instance.fit_resample(X_train, y_train)

        # Train the machine learning model on the resampled training set
        model_instance.fit(X_train_resampled, y_train_resampled)

        # Make predictions on the testing set
        y_pred = model_instance.predict(X_test)

        # Compute the accuracy score
        accuracy = accuracy_score(y_test, y_pred)

        # Add the accuracy score to the dictionary
        accuracy_dict[sampling][model] = accuracy

# Convert the accuracy dictionary to a DataFrame and print the table
accuracy_df = pd.DataFrame(accuracy_dict)
print(accuracy_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                        RandomOverSampler     SMOTE    ADASYN  \
DecisionTreeClassifier           0.987097  0.954839  0.974194   
GaussianNB                       0.948387  0.883871  0.851613   
KNeighborsClassifier             0.980645  0.761290  0.748387   
LogisticRegression               0.845161  0.864516  0.870968   
SVC                              0.703226  0.683871  0.703226   

                        RandomUnderSampler  TomekLinks  
DecisionTreeClassifier            0.741935    0.974194  
GaussianNB                        0.548387    0.980645  
KNeighborsClassifier              0.696774    0.993548  
LogisticRegression                0.503226    0.993548  
SVC                               0.651613    0.993548  


In [27]:
# Find the sampling technique with the highest accuracy score
best_sampling = max(accuracy_dict, key=lambda x: max(accuracy_dict[x].values()))

# Find the model with the highest accuracy score for the best sampling technique
best_model = max(accuracy_dict[best_sampling], key=accuracy_dict[best_sampling].get)

# Print the best sampling technique and best model
print("Best sampling technique is {} Over model {}".format(best_sampling,best_model))

Best sampling technique is TomekLinks Over model KNeighborsClassifier
