In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats


df = pd.read_csv('/content/Creditcard_data.csv')

## Sampling:
1.  Oversampling:
    -   Random Oversampling: balance the class distribution by randomly duplicating the instances of the minority class
    -   SMOTE: Generates synthetic samples by focusing on the minority samples
2.  Undersampling:
    -   Random Undersampling: balance the class distribution by randomly duplicating the instances of the majority class
    -   Tomek Links: Identifies and remove instances that are considered ambiguous or near the decision boundary between classes
    -   Near miss: Identifies instances which are close to instances from minority class, aiming to retain the instances that are more difficult to classify correctly
###
**We will use imbalanced-learn library**

In [2]:
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek, SMOTEENN

## Evaluating each sampling technique on different models:
1. **RandomForest Classifier**
2. **XGBoost Classifier**
3. **DecisionTree**
4. **Support Vector Classifiers (SVC)**
5. **LogisticRegression**

In [4]:
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

X = df.drop(columns = 'Class')
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [5]:
models = {
   'DecisionTree': DecisionTreeRegressor(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, verbosity=0),
    'LogisticRegression': LogisticRegression(random_state=42),
    'SVC': SVC(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42)
}

samplers = {
    'RandomOverSampler': RandomOverSampler(sampling_strategy='auto', random_state=42),
    'SMOTE': SMOTE(sampling_strategy='auto', random_state=42),
    'RandomUnderSampler': RandomUnderSampler(sampling_strategy='auto', random_state=42),
    'NearMiss': NearMiss(sampling_strategy='auto', version=1),
    'TomekLinks': TomekLinks(sampling_strategy='auto')
}





In [6]:
results_table = pd.DataFrame(columns = ['Model', 'Sampler', 'Accuracy'])

for model_name, model in models.items():
    for sampler_name, sampler in samplers.items():

        X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
        model.fit(X_resampled, y_resampled)

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        results_table = results_table.append({
            'Model':model_name,
            'Sampler':sampler_name,
            'Accuracy':accuracy,
            # 'Precision':precision,
            # 'Recall':recall,
            # 'F1 Score':f1
        }, ignore_index = True)

  results_table = results_table.append({
  results_table = results_table.append({
  results_table = results_table.append({
  results_table = results_table.append({
  results_table = results_table.append({
  _warn_prf(average, modifier, msg_start, len(result))
  results_table = results_table.append({
  results_table = results_table.append({
  results_table = results_table.append({
  results_table = results_table.append({
  _warn_prf(average, modifier, msg_start, len(result))
  results_table = results_table.append({
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  results_table = results_table.append({
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the numbe

In [7]:
print(results_table)

                 Model             Sampler  Accuracy
0         DecisionTree   RandomOverSampler  0.987097
1         DecisionTree               SMOTE  0.941935
2         DecisionTree  RandomUnderSampler  0.574194
3         DecisionTree            NearMiss  0.077419
4         DecisionTree          TomekLinks  0.974194
5              XGBoost   RandomOverSampler  0.993548
6              XGBoost               SMOTE  0.980645
7              XGBoost  RandomUnderSampler  0.600000
8              XGBoost            NearMiss  0.096774
9              XGBoost          TomekLinks  0.993548
10  LogisticRegression   RandomOverSampler  0.877419
11  LogisticRegression               SMOTE  0.877419
12  LogisticRegression  RandomUnderSampler  0.593548
13  LogisticRegression            NearMiss  0.432258
14  LogisticRegression          TomekLinks  0.993548
15                 SVC   RandomOverSampler  0.696774
16                 SVC               SMOTE  0.670968
17                 SVC  RandomUnderSampler  0.

In [8]:
pivoted_results = results_table.pivot(index='Model', columns='Sampler', values='Accuracy')


In [9]:
print(pivoted_results)

Sampler             NearMiss  RandomOverSampler  RandomUnderSampler     SMOTE  \
Model                                                                           
DecisionTree        0.077419           0.987097            0.574194  0.941935   
LogisticRegression  0.432258           0.877419            0.593548  0.877419   
RandomForest        0.393548           0.993548            0.709677  0.993548   
SVC                 0.348387           0.696774            0.625806  0.670968   
XGBoost             0.096774           0.993548            0.600000  0.980645   

Sampler             TomekLinks  
Model                           
DecisionTree          0.974194  
LogisticRegression    0.993548  
RandomForest          0.993548  
SVC                   0.993548  
XGBoost               0.993548  
