# Imbalance Problem - Breast Cancer Data (Binary)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Breast cancer data (binary classification dataset shipped with scikit-learn)
from sklearn.datasets import load_breast_cancer

# Fetch the dataset and pull out the feature matrix and the target vector.
dataObj = load_breast_cancer()
X, y = dataObj.data, dataObj.target

# Put the features into a DataFrame and prepend the label as a 'class' column.
df = pd.DataFrame(X, columns=dataObj.feature_names)
df.insert(0, 'class', y)

# Bar chart of the number of samples in each class.
df['class'].value_counts().plot.bar()

In [None]:
# Build an imbalanced dataset: keep only a small random sample (10%) of
# class 0 (having cancer) and every row of class 1.
df0 = df.loc[df['class'].eq(0)].sample(frac=0.10, random_state=1)
df1 = df.loc[df['class'].eq(1)]
dfSample = pd.concat([df0, df1], ignore_index=True)

# Bar chart of the class counts after down-sampling class 0.
dfSample['class'].value_counts().plot.bar()

In [None]:
# Separate the feature columns from the 'class' label column.
colsX = list(dfSample.columns.drop('class'))
X = dfSample[colsX].to_numpy()
y = dfSample['class'].to_numpy()

# Hold out 20% of the rows for testing; stratify on the label so both
# splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=1
)

# Standardize with statistics learned from the training split only, then
# apply the same transform to the test split.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# Baseline classifier: an SVC with default hyperparameters (no pipeline is
# used here; scaling was applied manually above).
svc = SVC(random_state=1).fit(X_train_std, y_train)

# Predict the held-out test samples.
y_pred = svc.predict(X_test_std)

## Visualize confusion matrix

In [None]:
# Plot the confusion matrix of the fitted SVC on the standardized test set,
# labeling the axes with the dataset's class names.
ConfusionMatrixDisplay.from_estimator(
    svc,
    X_test_std,
    y_test,
    display_labels=dataObj.target_names,
)
plt.show()

## Calculate metrics

In [None]:
# Per-class precision, recall and F1 of the baseline predictions, shown to
# four decimal places.
print(classification_report(y_true=y_test, y_pred=y_pred, digits=4))

## Perform grid search on `recall`

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score


# Candidate values, used for both C and gamma (powers of ten, 1e-4 to 1e3).
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Two sub-grids: a linear kernel tuned over C only, and an RBF kernel tuned
# over every (C, gamma) combination.
set1 = {'C': param_range, 'kernel': ['linear']}
set2 = {'C': param_range, 'gamma': param_range, 'kernel': ['rbf']}
param_grid = [set1, set2]

# Wrap recall_score in a scorer so the desired argument can be passed:
# pos_label=0 makes class 0 the positive class for recall.
scorer = make_scorer(recall_score, pos_label=0)

# Grid search scored by class-0 recall, with 10-fold cross-validation on all
# available cores.
# NOTE(review): X_train_std was standardized before cross-validation, so each
# validation fold has influenced the scaling statistics. Wrapping the scaler
# and the SVC in a Pipeline would remove this leak.
gs = GridSearchCV(estimator=svc,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=10,
                  n_jobs=-1)

gs.fit(X_train_std, y_train)

# Best mean cross-validated score and the parameters that achieved it.
print(gs.best_score_)
print(gs.best_params_)

## Re-evaluate the metrics

In [None]:
# Predict the test set through the fitted search object; with GridSearchCV's
# default refit behavior this uses the best estimator found.
y_pred = gs.predict(X_test_std)

# Same report as for the baseline, now for the tuned model.
print(classification_report(y_true=y_test, y_pred=y_pred, digits=4))

## Use `class_weight` option

In [None]:
# SVC with class_weight='balanced', trained on the standardized training
# split (a plain estimator, not a pipeline).
svc_balanced = SVC(class_weight='balanced', random_state=1).fit(X_train_std, y_train)

# Predict the held-out test samples with the class-weighted model.
y_pred = svc_balanced.predict(X_test_std)

In [None]:
# Classification report for the class-weighted SVC predictions.
print(classification_report(y_true=y_test, y_pred=y_pred, digits=4))

## Use SMOTE 

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
# A fixed seed makes the synthetic samples reproducible, in line with the
# random_state=1 used elsewhere in this notebook.
oversample = SMOTE(random_state=1)
X_train_os, y_train_os = oversample.fit_resample(X_train_std, y_train)

# Show the class counts after resampling. This is printed explicitly because
# a bare expression in the middle of a cell produces no output.
print(pd.Series(y_train_os).value_counts())

# Train a fresh classifier instead of refitting `svc`, so the baseline model
# used by the earlier cells is not silently overwritten.
svc_smote = SVC(random_state=1)
svc_smote.fit(X_train_os, y_train_os)

# Prediction from test data
y_pred = svc_smote.predict(X_test_std)

In [None]:
# Classification report for the model trained on the SMOTE-resampled data.
print(classification_report(y_true=y_test, y_pred=y_pred, digits=4))