# SAMPLING METHODS

In [1]:
import numpy as np

# Data, Spliting and Metrics
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.metrics import accuracy_score

# Classification, Pipeline, and Sampling
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


# Data

In [2]:
data = load_breast_cancer()
x = data.data
y = data.target
values, counts = np.unique(y, return_counts=True)
print("Data Distribution")
print("Class -  Counts  -    Percentage")
for i in range(len(values)):
    print(f"{values[i]}     -   {counts[i]}    -    {round(counts[i]/sum(counts)*100, 2)}%")

Data Distribution
Class -  Counts  -    Percentage
0     -   212    -    37.26%
1     -   357    -    62.74%


# Splitting

In [3]:
# cross validation, preserves ration of dataset,see distribution above
cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state=1)
for train_index, test_index in cv.split(x,y):    
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
print(f"train:{round(len(x_train)/len(x),2)*100}%, test:{round(len(x_test)/len(x),2)*100}%")       

train:80.0%, test:20.0%


# Sampling and Classification Model

In [4]:
def hyper_param_tuning(model, pipeline):
    param_grid = {'C': [0.1, 1, 10, 100, 1000], # regularization
                'gamma': [1, 0.1, 0.01, 0.001, 0.0001], # kernel coefficient
                'kernel': ['rbf']} # radial basis function
    pipe_param = {'svc__' + key: param_grid[key] for key in param_grid}
    grid = GridSearchCV(pipeline, param_grid=pipe_param, refit=True, cv=cv, scoring='accuracy', verbose=1)
    grid.fit(x_test, y_test)
    return grid
    
def sampling_pipeline(model, sampling_mode, param_tune=False, sampling_ratio=[0.5,0.5]):
    print(f"SAMPLING: {' '.join(str(sampling_mode).split('_')).upper()} \n")
    model_name = str(model)[:-2]
    # sampling mode
    pipelines = {
                "none": make_pipeline(model),
                "over_sampling":make_pipeline(
                            SMOTE(random_state=1, sampling_strategy='auto'), 
                            model), 
                "under_sampling": make_pipeline(
                            RandomUnderSampler(random_state=1, sampling_strategy='auto'), 
                            model),
                "mixed_sampling" : make_pipeline(
                            SMOTE(random_state=1, sampling_strategy=sampling_ratio[0]), 
                            RandomUnderSampler(random_state=1, sampling_strategy=sampling_ratio[1]), 
                            model)
                }

    if param_tune == False: # Vanilla Classification Model 
        # cross validation
        scores = cross_val_score(pipelines[sampling_mode], x_train, y_train, scoring='accuracy', cv=cv)
        print(f'Cross Validation Accuracy: {round(np.mean(scores),4)}, ({ round(np.std(scores),3)})')
        # model
        model = pipelines[sampling_mode]
        model.fit(x_test, y_test)     
    else: # Model Hyperparameter Tuning
        print('Fitting Hyperparameter Tuning...')
        model = hyper_param_tuning(model, pipelines[sampling_mode])
        print(f"\tBest {model_name} parameter is: {model.best_params_}")
        # Mean cross-validated score of the best_estimator
        print(f"\nCross Validation Score : {model.best_score_}")

    # prediction and metrics
    hist = model.predict(x_test)
    print("Test Metrics:")
    print(classification_report(y_test, hist))


# Sampling Methods

In [5]:
sampling_pipeline(model=SVC(), sampling_mode="none")

SAMPLING: NONE 

Cross Validation Accuracy: 0.9102, (0.018)
Test Metrics:
              precision    recall  f1-score   support

           0       0.97      0.79      0.87        42
           1       0.89      0.99      0.93        71

    accuracy                           0.91       113
   macro avg       0.93      0.89      0.90       113
weighted avg       0.92      0.91      0.91       113



In [6]:
sampling_pipeline(model=SVC(), sampling_mode="over_sampling")

SAMPLING: OVER SAMPLING 

Cross Validation Accuracy: 0.9013, (0.012)
Test Metrics:
              precision    recall  f1-score   support

           0       0.97      0.81      0.88        42
           1       0.90      0.99      0.94        71

    accuracy                           0.92       113
   macro avg       0.93      0.90      0.91       113
weighted avg       0.92      0.92      0.92       113



In [7]:
sampling_pipeline(model=SVC(), sampling_mode="under_sampling")

SAMPLING: UNDER SAMPLING 

Cross Validation Accuracy: 0.9036, (0.012)
Test Metrics:
              precision    recall  f1-score   support

           0       0.97      0.81      0.88        42
           1       0.90      0.99      0.94        71

    accuracy                           0.92       113
   macro avg       0.93      0.90      0.91       113
weighted avg       0.92      0.92      0.92       113



In [8]:
# for mixed sampling ratio
d_diff = ((counts[1] - counts[0])/2) + counts[0]
min_ratio = round(d_diff/counts[1],2)
maj_ratio = round(2-(d_diff/counts[0]),2)
print("Under Sampling Ratio for Majority : ", min_ratio)
print("Over Sampling Ratio for Minority  : ", maj_ratio)

Under Sampling Ratio for Majority :  0.8
Over Sampling Ratio for Minority  :  0.66


In [9]:
sampling_pipeline(model=SVC(), sampling_mode="mixed_sampling", sampling_ratio=[maj_ratio, min_ratio])

SAMPLING: MIXED SAMPLING 

Cross Validation Accuracy: 0.9211, (0.021)
Test Metrics:
              precision    recall  f1-score   support

           0       0.97      0.81      0.88        42
           1       0.90      0.99      0.94        71

    accuracy                           0.92       113
   macro avg       0.93      0.90      0.91       113
weighted avg       0.92      0.92      0.92       113



# Conclusion

Above, we see that the accuracy of:
- No Sampling = 91%
- Over Sampling = 92%
- Under Sampling = 92%
- Mixed Sampling = 92%

We can see that we will have a better classification accuracy, if any **sampling methods is used**, improving accuracy by **1%**.


When we compare the sampling methods' metrics, we see that **all** method have an **identical metric scores**.




# Sampling + Hyperparameter Tuning

In [10]:
sampling_pipeline(model=SVC(), sampling_mode="none", param_tune=True)

SAMPLING: NONE 

Fitting Hyperparameter Tuning...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
	Best SVC parameter is: {'svc__C': 1, 'svc__gamma': 0.0001, 'svc__kernel': 'rbf'}

Cross Validation Score : 0.9557312252964427
Test Metrics:
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        42
           1       0.97      1.00      0.99        71

    accuracy                           0.98       113
   macro avg       0.99      0.98      0.98       113
weighted avg       0.98      0.98      0.98       113



In [11]:
sampling_pipeline(model=SVC(), sampling_mode="over_sampling", param_tune=True)

SAMPLING: OVER SAMPLING 

Fitting Hyperparameter Tuning...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
	Best SVC parameter is: {'svc__C': 1, 'svc__gamma': 0.0001, 'svc__kernel': 'rbf'}

Cross Validation Score : 0.9466403162055336
Test Metrics:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        71

    accuracy                           0.98       113
   macro avg       0.98      0.98      0.98       113
weighted avg       0.98      0.98      0.98       113



In [12]:
sampling_pipeline(model=SVC(), sampling_mode="under_sampling", param_tune=True)

SAMPLING: UNDER SAMPLING 

Fitting Hyperparameter Tuning...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
	Best SVC parameter is: {'svc__C': 1, 'svc__gamma': 0.0001, 'svc__kernel': 'rbf'}

Cross Validation Score : 0.9466403162055336
Test Metrics:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96        42
           1       0.99      0.97      0.98        71

    accuracy                           0.97       113
   macro avg       0.97      0.97      0.97       113
weighted avg       0.97      0.97      0.97       113



In [13]:
sampling_pipeline(model=SVC(), sampling_mode="mixed_sampling", sampling_ratio=[maj_ratio, min_ratio],param_tune=True)

SAMPLING: MIXED SAMPLING 

Fitting Hyperparameter Tuning...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
	Best SVC parameter is: {'svc__C': 1, 'svc__gamma': 0.0001, 'svc__kernel': 'rbf'}

Cross Validation Score : 0.9557312252964427
Test Metrics:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        71

    accuracy                           0.98       113
   macro avg       0.98      0.98      0.98       113
weighted avg       0.98      0.98      0.98       113



# Conclusion

Above, we see that the accuracy of:
- No Sampling = 98%
- Over Sampling = 98%
- Under Sampling = 97%
- Mixed Sampling = 98%

We can see that **undersampling** degraded the accuracy rate by **-1%**



