# Quantitative exercise
By Group 1: Ya Ting Hu & Zhen Tian

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import random
import collections
import io
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import *
from imblearn.metrics import *

In [2]:
random.seed(1234)
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/haberman.csv'
dataframe = pd.read_csv(url, header=None)
dataframe = dataframe.rename(columns={0: "age", 1: "year", 2:"nodes",3:"survival"})
dataframe["survival"].replace({2:0},inplace=True)
values = dataframe.values
X, y = values[:, :-1], values[:, -1]

report_total = pd.DataFrame()

In [3]:
def sample_class_dist(X, y, method=False, strategy=False, verbose=False):
    global report_total
    
    n_rows = X.shape[0]
    n_cols = X.shape[1]
    classes = np.unique(y)
    n_classes = len(classes)
    if method and strategy:
        print(str(" " + strategy + ": "+ method +" ").center(80, "="))
        
    
    if verbose == True:
        print('N Examples: %d' % n_rows)
        print('N Inputs: %d' % n_cols)
        print('N Classes: %d' % n_classes)
        print('Classes: %s' % classes)
    print('Class Breakdown:')

    breakdown = ''
    class_ = {}
    for c in classes:
        total = len(y[y == c])
        ratio = (total / float(len(y))) * 100
        class_[str(c)] = total
        print(' - Class %s: %d (%.5f%%)' % (str(c), total, ratio))

    # ratio of the number of instances from the negative class and the positive class
    try:
        print("Imbalanced ratio (IR): ", round(y.value_counts()[0]/y.value_counts()[1],2))
    except:
        print("Imbalanced ratio (IR): ", round(collections.Counter(y)[1]/collections.Counter(y)[0],2))

    if not verbose:
        df_sample = pd.DataFrame(
        {
            "strategy": [strategy]*3,
            "method":   [method]*3,
            "class":['0', '1', 'avg / total'],
            "count":[class_["0"], class_["1"], ""],
            }
        )

        df_sample.set_index(["strategy","class"], inplace=True)

        report_total = pd.concat([report_total, df_sample], axis=0)
#         display(report_total)

In [4]:
sample_class_dist(X,y, verbose=True)

N Examples: 306
N Inputs: 3
N Classes: 2
Classes: [0 1]
Class Breakdown:
 - Class 0: 81 (26.47059%)
 - Class 1: 225 (73.52941%)
Imbalanced ratio (IR):  2.78


# Split 80% 20%

In [5]:
dataframe.head()

Unnamed: 0,age,year,nodes,survival
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [6]:
from sklearn.model_selection import train_test_split

# defining the dependent and independent variables
Xtrain = dataframe.loc[:, dataframe.columns != "survival"]
ytrain = dataframe["survival"]

X_train, X_test, y_train, y_test = train_test_split(Xtrain,ytrain, train_size=0.8, random_state=0)

data_dict = {}
strategies = []

# Undersampling


In [7]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
strategy = "RepeatedEditedNearestNeighbours"
enn = RepeatedEditedNearestNeighbours()
X_ENN_UnderSampled, y_ENN_UnnderSampled = enn.fit_resample(X_train, y_train)
strategies.append(strategy)
data_dict[strategy] = [X_ENN_UnderSampled, y_ENN_UnnderSampled]
sample_class_dist(X_ENN_UnderSampled, y_ENN_UnnderSampled, method = "Undersampling", strategy = strategy)

from imblearn.under_sampling import NeighbourhoodCleaningRule
ncr = NeighbourhoodCleaningRule()
strategy = "NeighbourhoodCleaningRule"
strategies.append(strategy)
X_NCR_UnderSampled, y_NCR_UnnderSampled = ncr.fit_resample(X_train, y_train)
data_dict[strategy] = [X_NCR_UnderSampled, y_NCR_UnnderSampled]

sample_class_dist(X_NCR_UnderSampled, y_NCR_UnnderSampled, method = "Undersampling", strategy = strategy)

Class Breakdown:
 - Class 0: 56 (40.87591%)
 - Class 1: 81 (59.12409%)
Imbalanced ratio (IR):  0.69
Class Breakdown:
 - Class 0: 56 (28.57143%)
 - Class 1: 140 (71.42857%)
Imbalanced ratio (IR):  0.4


# Oversampling

In [8]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
strategy = "RandomOverSampler"
X_OverSampled, y_OverSampled = ros.fit_resample(X_train, y_train)
data_dict[strategy] = [X_OverSampled, y_OverSampled]
sample_class_dist(X_OverSampled, y_OverSampled, method = "Oversampling", strategy = strategy)

from imblearn.over_sampling import SMOTEN
sampler = SMOTEN(random_state=0)
strategy = "SMOTEN"
X_SMOTEN, y_SMOTEN = sampler.fit_resample(X_train, y_train)
data_dict[strategy] = [X_SMOTEN, y_SMOTEN]
sample_class_dist(X_SMOTEN, y_SMOTEN, method = "Oversampling", strategy = strategy)

Class Breakdown:
 - Class 0: 188 (50.00000%)
 - Class 1: 188 (50.00000%)
Imbalanced ratio (IR):  1.0
Class Breakdown:
 - Class 0: 188 (50.00000%)
 - Class 1: 188 (50.00000%)
Imbalanced ratio (IR):  1.0


# Hybrid

In [9]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
strategy = "SMOTEENN"
X_Hybrid_SMOTEENN, y_Hybrid_SMOTEENN = smote_enn.fit_resample(X_train, y_train)
data_dict[strategy] = [X_Hybrid_SMOTEENN, y_Hybrid_SMOTEENN]
sample_class_dist(X_Hybrid_SMOTEENN, y_Hybrid_SMOTEENN, method = "Hybrid", strategy = strategy)

Class Breakdown:
 - Class 0: 100 (51.81347%)
 - Class 1: 93 (48.18653%)
Imbalanced ratio (IR):  1.08


In [10]:
report_ = pd.DataFrame()
for sampling_strategy in data_dict.keys():
    clf = DecisionTreeClassifier(random_state=0)
    y_pred = clf.fit(data_dict[sampling_strategy][0], data_dict[sampling_strategy][1]).predict(X_test)
    
    report_str = classification_report_imbalanced(y_test, y_pred)
    df_report = pd.read_csv(io.StringIO(report_str),sep="      ")
    df_report["roc_score"] = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    df_report["sampling_strategy"] = sampling_strategy
    df_report.reset_index(inplace=True)
    df_report = df_report.rename({"index":"class"},axis=1)
    df_report.set_index(["sampling_strategy", "class"], inplace=True)
    report_ = pd.concat([df_report,report_],axis=0)
report_total = pd.concat([report_,report_total],axis=1)

In [11]:
#Text summary of the precision, recall, specificity, geometric mean, and index balanced accuracy    
#https://imbalanced-learn.org/stable/references/generated/imblearn.metrics.classification_report_imbalanced.html#imblearn.metrics.classification_report_imbalanced
report_total.sort_values("method")

Unnamed: 0_level_0,Unnamed: 1_level_0,pre,rec,spe,f1,geo,iba,sup,roc_score,method,count
Unnamed: 0_level_1,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SMOTEENN,0,0.54,0.52,0.7,0.53,0.6,0.36,25,0.611351,Hybrid,100.0
SMOTEENN,1,0.68,0.7,0.52,0.69,0.6,0.37,37,0.611351,Hybrid,93.0
SMOTEENN,avg / total,0.63,0.63,0.59,0.63,0.6,0.37,62,0.611351,Hybrid,
RandomOverSampler,0,0.42,0.2,0.81,0.27,0.4,0.15,25,0.505405,Oversampling,188.0
RandomOverSampler,1,0.6,0.81,0.2,0.69,0.4,0.17,37,0.505405,Oversampling,188.0
RandomOverSampler,avg / total,0.53,0.56,0.45,0.52,0.4,0.16,62,0.505405,Oversampling,
SMOTEN,0,0.25,0.16,0.68,0.2,0.33,0.1,25,0.417838,Oversampling,188.0
SMOTEN,1,0.54,0.68,0.16,0.6,0.33,0.11,37,0.417838,Oversampling,188.0
SMOTEN,avg / total,0.43,0.47,0.37,0.44,0.33,0.11,62,0.417838,Oversampling,
NeighbourhoodCleaningRule,0,0.54,0.52,0.7,0.53,0.6,0.36,25,0.611351,Undersampling,56.0


##### Logistic

In [12]:
# Xtest = test.loc[:, train.columns != "survival"]
# ytest = test["survival"]

# import statsmodels.api as sm
# import pandas as pd
 
# # defining the dependent and independent variables
# Xtrain = train.loc[:, train.columns != "survival"]
# ytrain = train["survival"]
  
# # building the model and fitting the data
# log_reg = sm.Logit(ytrain, Xtrain).fit();

# # performing predictions on the test datdaset
# yhat = log_reg.predict(Xtest)
# prediction = list(map(round, yhat))
 
# # comparing original and predicted values of y
# print('Actual values', list(ytest.values))
# print('Predictions :', prediction)

In [13]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score,roc_curve, auc

# clf_LogisticRegression = LogisticRegression(random_state=0).fit(Xtrain, ytrain)
# y_score = clf_LogisticRegression.decision_function(Xtest)
# pred_prob1 = clf_LogisticRegression.predict_proba(Xtest)



In [14]:
# y_score = clf.decision_function(Xtest)
# pred_prob1 = clf.predict_proba(Xtest)
# # roc curve for models
# fpr1, tpr1, thresh1 = roc_curve(ytest, pred_prob1[:,1], pos_label=1)
# # fpr2, tpr2, thresh2 = roc_curve(ytest, pred_prob2[:,1], pos_label=1)

# # roc curve for tpr = fpr 
# random_probs = [0 for i in range(len(ytest))]
# p_fpr, p_tpr, _ = roc_curve(ytest, random_probs, pos_label=1)

# # auc scores
# auc_score1 = roc_auc_score(ytest, pred_prob1[:,1])
# # auc_score2 = roc_auc_score(ytest, pred_prob2[:,1])

# print("Auc score Logistic Regression:", auc_score1)

In [15]:
# # matplotlib
# import matplotlib.pyplot as plt
# plt.style.use('seaborn')

# # plot roc curves
# plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='Logistic Regression')
# # plt.plot(fpr2, tpr2, linestyle='--',color='green', label='KNN')
# plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
# # title
# plt.title('ROC curve')
# # x label
# plt.xlabel('False Positive Rate')
# # y label
# plt.ylabel('True Positive rate')

# plt.legend(loc='best')
# # plt.savefig('ROC',dpi=300)
# plt.show();