<a href="https://colab.research.google.com/github/harnalashok/classification/blob/main/performance_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Performance metrics

In [None]:
"""
conda create -n sklearn -c anaconda -c conda-forge python=3.7.6 scikit-learn=0.23.2 pandas ipython numpy spyder imbalanced-learn matplotlib

Last amended: 27th August, 2020
Myfolder: /home/ashok/Documents/10.higgsBoson
Objective:
            i)   Quick modeling with multiple models
            ii)  Learn performance measures: ROC, AUC, confusion_matrix
            iii) ROC curve
            iv)  To display uniformity of coding in sklearn

"""

# Import libraries
# 1.0
%reset -f
# 1.1
import numpy as np

# 1.2 For modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE, ADASYN,BorderlineSMOTE,SVMSMOTE
from sklearn.decomposition import PCA

# 1.3 For generating dataset
from sklearn.datasets import make_classification

# 1.4 For performance measures
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix

# 1.5 Plotting metrics related graphs
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve

# 1.6 For data splitting
from sklearn.model_selection import train_test_split

# 2.0 Build a synthetic, heavily imbalanced two-class dataset.
#     weights=[0.99] requests roughly 99% of the rows in class 0; the
#     remainder goes to class 1. No random_state is passed, so the exact
#     counts quoted in the comments below change from run to run.
X, y = make_classification(n_samples=10000, n_features=20,
                           n_redundant=0, weights=[0.99])

# 2.0.1 Quick inspection of what was generated
X.shape        # (10000, 20)
y
np.sum(y)      # number of class-1 rows: 147 on the author's run,
               #   i.e. class-0 share = (10000-147)/10000 = 0.985

# 2.1 PCA keeping as many components as are needed to explain 95% of
#     the variance (a fractional n_components requires svd_solver='full').
# NOTE(review): X_new is not referenced again in this file; the split in
#     step 2.2 is done on the original X. Confirm whether that is intended.
pca = PCA(svd_solver='full', n_components=0.95)
X_new = pca.fit_transform(X)
X_new.shape  # (10000, 19) on the author's run

# 2.2 Hold out 30% of the rows for testing. Rows are shuffled and the
#     split is stratified on y, so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    shuffle=True,
)

# 2.2.1 Sanity checks on the training part
X_train.shape         # (7000, 20)
y_train.shape         # (7000,)
np.sum(y_train)       # 103 class-1 rows on the author's run

# 3.0 Balance the classes by oversampling. Only the training split is
#     resampled; X_test / y_test are left untouched.
X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
X_resampled.shape      # (13794, 20) on the author's run

# 3.0.1 Other imported samplers can be swapped in the same way:
#X_resampled, y_resampled = ADASYN().fit_resample(X_train, y_train)
#X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X_train, y_train)
#X_resampled, y_resampled = SVMSMOTE().fit_resample(X_train,y_train)

# 3.1 Gradient Boosting classifier, trained on the resampled (balanced)
#     training data. fit() returns the estimator itself, so construction
#     and training are chained.
clf_gbm = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.05,
).fit(X_resampled, y_resampled)

# 3.2 Make predictions on the untouched test split
y_pred_gbm = clf_gbm.predict(X_test)

# 3.3 Performance measures for the positive class (label 1).
#     Numbers in comments are from the author's run.
recall_score(y_test, y_pred_gbm, pos_label=1)      # 0.6590909090909091
precision_score(y_test, y_pred_gbm, pos_label=1)   # 0.29
f1_score(y_test, y_pred_gbm, pos_label=1)          # 0.40277777777777773

# 3.4 Confusion matrix of actual vs. predicted labels
confusion_matrix(y_test, y_pred_gbm)

# 4.0 Train gbm classifier on original (imbalanced) data, for comparison
#     with the model trained on resampled data:
clf1_gbm = GradientBoostingClassifier(
                                       learning_rate = 0.05,
                                       n_estimators = 500
                                      )
# 4.1 Fit on the un-resampled training split
clf1_gbm.fit(X_train,y_train)
# 4.2 Predict with the model fitted in 4.1.
#     (An earlier version called clf_gbm.predict here, i.e. the model
#     trained on resampled data, so this section merely repeated the
#     scores of that model. The previously recorded numbers were therefore
#     not those of clf1_gbm and have been removed; re-run to obtain them.)
y1_pred_gbm= clf1_gbm.predict(X_test)
# 4.3 Performance measures for the positive class (label 1)
f1_score(y_test,y1_pred_gbm, pos_label =1)
recall_score(y_test,y1_pred_gbm,pos_label = 1)
precision_score(y_test,y1_pred_gbm,pos_label = 1)
confusion_matrix(y_test,y1_pred_gbm)


# 5.0 Random forest (300 trees) trained on the resampled training data.
#     fit() returns the estimator, so it is chained onto the constructor.
clf_rf = RandomForestClassifier(n_estimators=300).fit(X_resampled, y_resampled)

# 5.0.1 Predict on the test split and score the positive class (label 1).
#       Numbers in comments are from the author's run.
y_pred_rf = clf_rf.predict(X_test)
f1_score(y_test, y_pred_rf, pos_label=1)          # 0.5977011494252873
recall_score(y_test, y_pred_rf, pos_label=1)      # 0.5909090909090909
precision_score(y_test, y_pred_rf, pos_label=1)   # 0.6046511627906976
confusion_matrix(y_test, y_pred_rf)

# 5.1 Random forest (300 trees) trained on the original, un-resampled
#     training data, for comparison with 5.0.
clf1_rf = RandomForestClassifier(n_estimators=300).fit(X_train, y_train)

# 5.1.1 Predict on the test split and score the positive class (label 1).
#       Numbers in comments are from the author's run.
y1_pred_rf = clf1_rf.predict(X_test)
f1_score(y_test, y1_pred_rf, pos_label=1)          # 0.5846153846153846
recall_score(y_test, y1_pred_rf, pos_label=1)      # 0.4318181818181818
precision_score(y_test, y1_pred_rf, pos_label=1)   # 0.9047619047619048
confusion_matrix(y_test, y1_pred_rf)

# 6.1 Draw the test-set confusion matrix for the two models that were
#     trained on resampled data (random forest first, then gbm).
# NOTE(review): plot_confusion_matrix matches the scikit-learn 0.23.2
#     pinned at the top of this file; newer releases may no longer
#     export it -- confirm before upgrading.
plot_confusion_matrix(clf_rf, X_test, y_test)
plot_confusion_matrix(clf_gbm, X_test, y_test)


# 6.2 ROC curves of both resampled-data models, overlaid on one Axes.
#       Positive label is always 1 in these plots
fig, ax = plt.subplots()

# gbm: response_method='auto' is the default target response, which
#      tries predict_proba first
plot_roc_curve(clf_gbm, X_test, y_test, response_method='auto', ax=ax)

# random forest, drawn on the same Axes
plot_roc_curve(clf_rf, X_test, y_test, ax=ax)

plt.show()


# 6.3 Precision-recall curves of both resampled-data models on one Axes
#       Positive label is always 1 in these plots
fig, ax = plt.subplots()
plot_precision_recall_curve(clf_rf, X_test, y_test, ax=ax)    # random forest
plot_precision_recall_curve(clf_gbm, X_test, y_test, ax=ax)   # gbm
plt.show()

##### I am done ############
