# Imbalance Problem - Glass Data (Multiclass)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Load the glass data set and draw a horizontal bar chart of the number of
# samples per class, ordered by descending class label.
df = pd.read_csv('glass.csv')
class_counts = df['class'].value_counts().sort_index(ascending=False)
class_counts.plot.barh()

In [None]:
# Replace the raw class labels with integer codes, then re-plot the
# per-class sample counts using the encoded labels.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df['class'])
df['class'] = le.transform(df['class'])
encoded_counts = df['class'].value_counts().sort_index(ascending=False)
encoded_counts.plot.barh()

In [None]:
# Feature matrix = every column except the target; target = encoded class.
colsX = [col for col in df.columns if col != 'class']
X = df[colsX].to_numpy()
y = df['class'].to_numpy()

# Hold out 20% for testing. Stratifying on y keeps the (imbalanced) class
# proportions the same in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=1
)

# Standardize: the scaler statistics come from the training partition only
# and are then applied unchanged to the test partition.
sc = StandardScaler().fit(X_train_raw)
X_train_std = sc.transform(X_train_raw)
X_test_std = sc.transform(X_test_raw)

# Baseline classifier: a plain SVC with default hyper-parameters
# (no Pipeline is involved; scaling was applied manually above).
svc = SVC(random_state=1).fit(X_train_std, y_train)

# Baseline predictions on the held-out test partition.
y_pred = svc.predict(X_test_std)

## Visualize confusion matrices

In [None]:
# Confusion matrix of the baseline SVC on the held-out test partition.
ConfusionMatrixDisplay.from_estimator(svc, X_test_std, y_test)
plt.show()

In [None]:
import seaborn as sns
from sklearn.metrics import multilabel_confusion_matrix

# One 2x2 (one-vs-rest) confusion matrix per class.
matrices = multilabel_confusion_matrix(y_true=y_test, y_pred=y_pred)

# One heatmap per class, stacked vertically in a single figure.
nc = np.unique(y).size
fig, axs = plt.subplots(nrows=nc, ncols=1, figsize=(5, nc * 4))
for idx, m in enumerate(matrices):
    ax = axs[idx]
    # np.flip reverses both axes of the 2x2 matrix before plotting.
    sns.heatmap(np.flip(m), annot=True, cmap='Blues', ax=ax)
    ax.set_title(f'Class {idx}')

# Calculate metrics

In [None]:
# Per-class precision / recall / F1 for the current test-set predictions.
report = classification_report(y_test, y_pred, digits=4)
print(report)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Compare precision / recall / F1 under three averaging strategies.
# precision_recall_fscore_support returns (precision, recall, fscore, support);
# zipping against three names keeps only the first three values.
metric_names = ('precision', 'recall', 'f1')
arr = [
    {
        'average': avg,
        **dict(zip(
            metric_names,
            precision_recall_fscore_support(y_test, y_pred, average=avg),
        )),
    }
    for avg in ('macro', 'weighted', 'micro')
]

# One row per averaging strategy.
dft = pd.DataFrame.from_records(arr, index='average')
display(dft)

## Perform grid search on the selected metric

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score

# Candidate hyper-parameter values.
c_gamma_range = [0.01, 0.1, 1.0, 10.0]  # defined but not used by the grid below
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Two sub-grids: linear kernel (C only) and RBF kernel (C x gamma).
set1 = dict(C=param_range, kernel=['linear'])
set2 = dict(C=param_range, gamma=param_range, kernel=['rbf'])
param_grid = [set1, set2]

# Wrap recall_score so the `average` argument can be passed through to it.
scorer = make_scorer(recall_score, average='micro')

# 5-fold cross-validated search on the training partition, using all cores.
gs = GridSearchCV(svc, param_grid, scoring=scorer, cv=5, n_jobs=-1)
gs = gs.fit(X_train_std, y_train)

print(gs.best_score_)
print(gs.best_params_)

## Re-evaluate metrics

In [None]:
# Re-evaluate with the model selected by the grid search. Without this
# prediction step `y_pred` would still hold the untuned baseline's output and
# the report would be unchanged. `gs.predict` delegates to the best estimator,
# which GridSearchCV refits on the full training data by default.
y_pred = gs.predict(X_test_std)
print(classification_report(y_test, y_pred, digits=4))

## Use the `class_weight` setting

In [None]:
# SVC with class_weight='balanced', trained on the same standardized
# training partition as the baseline.
svc_balanced = SVC(class_weight='balanced', random_state=1).fit(X_train_std, y_train)

# Test-set predictions of the class-weighted model.
y_pred = svc_balanced.predict(X_test_std)

In [None]:
# Confusion matrix of the class-weighted model. This must be `svc_balanced`,
# not the baseline `svc`, so the plot matches the predictions stored in
# `y_pred` for this section.
ConfusionMatrixDisplay.from_estimator(
    estimator=svc_balanced, X=X_test_std, y=y_test
)
plt.show()

In [None]:
# Per-class metrics for the predictions currently held in `y_pred`.
balanced_report = classification_report(y_test, y_pred, digits=4)
print(balanced_report)

## Use SMOTE 

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training partition only; the test partition is left untouched.
# A fixed seed keeps the synthetic samples (and every metric below)
# reproducible, consistent with random_state=1 used elsewhere in this notebook.
oversample = SMOTE(random_state=1)
X_train_os, y_train_os = oversample.fit_resample(X_train_std, y_train)

# Show the class counts after resampling. A bare expression in the middle of
# a cell is evaluated but never displayed, so print it explicitly.
print(pd.Series(y_train_os).value_counts())

# Training (re-fits the existing `svc` object on the oversampled data)
svc.fit(X_train_os, y_train_os)
# Prediction from test data
y_pred = svc.predict(X_test_std)

In [None]:
# Confusion matrix on the test partition for `svc` in its current fitted state.
ConfusionMatrixDisplay.from_estimator(svc, X_test_std, y_test)
plt.show()

In [None]:
# Per-class metrics for the predictions currently held in `y_pred`.
smote_report = classification_report(y_test, y_pred, digits=4)
print(smote_report)