![Title](https://i.imgur.com/TD4tCAo.png)

In [1]:
import pandas as pd
import numpy as np

In [2]:
iy = pd.read_csv('../dataset/iy12th.csv')

In [3]:
iy.columns

Index(['Unnamed: 0', 'Encounter', 'ID', 'Facility Name', 'Bus St Date',
       'Description', 'Inspect Dt', 'High', 'business_id', 'address', 'cuis',
       'ingr', 'sent', 'svat', 'unhy', 'stars_y', 'count'],
      dtype='object')

In [24]:
iy = iy[iy['count']>10]

In [25]:
iy.loc[iy['High']>0, 'isHigh'] = 1
iy['isHigh'] = iy['isHigh'].fillna(0)

In [26]:
# Features and target variable
features = ['cuis', 'ingr', 'sent', 'svat', 'unhy', 'stars_y', 'count']
target = 'isHigh'

In [27]:
# Preprocessing and Evaluation
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, auc

# Model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# split the dataset into predictors(X) and target (Y)
X = iy[features]
Y = iy[target].values.ravel()

## 6.A Data split for Cross-validation

In [32]:
# splitting into three sets
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X, Y, random_state=0)

# Feature scaling
scaler = MinMaxScaler().fit(X_trainval)
X_trainval_scaled = scaler.transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

X_trainval = X_trainval_scaled
X_test = X_test_scaled

# Generate report
def getReport(report):
    report = report.strip().split()
    report.remove('avg')
    report.remove('/')
    report.insert(0, 'class')
    report = np.array(report)
    report = np.reshape(report, [-1,5])
    colname = report[0,:]
    report = report[1:,:]
    return pd.DataFrame(report, columns=colname)

In [33]:
acc = [] # list to store all performance metric

## 6.B Logistic Regression
The parameter C, inverse of regularization strength.

Tuning range: [0.001, 0.1, 1, 10, 100]

In [34]:
# Dataset with imputation
best_score=0
kfolds=5 # set the number of folds

for c in [0.001, 0.1, 1, 10, 100]:
    logRegModel = LogisticRegression(C=c)
    # perform cross-validation
    scores = cross_val_score(logRegModel, X_trainval, Y_trainval, cv=kfolds, scoring='accuracy') # Get recall for each parameter setting
    
    # compute mean cross-validation accuracy
    score = np.mean(scores)
    
    # Find the best parameters and score
    if score > best_score:
        best_score = score
        best_parameters = c

# rebuild a model on the combined training and validation set
SelectedLogRegModel = LogisticRegression(C=best_parameters).fit(X_trainval_scaled, Y_trainval)

test_score = SelectedLogRegModel.score(X_test_scaled, Y_test)
PredictedOutput = SelectedLogRegModel.predict(X_test_scaled)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameter for regularization (C) is: ", best_parameters)
print("Test accuracy with best C parameter is", test_score)
print("Test recall with the best C parameter is", test_recall)
print("Test AUC with the best C parameter is", test_auc)
m = 'Logistic Regression'
acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

Best accuracy on validation set is: 0.577030540117
Best parameter for regularization (C) is:  100
Test accuracy with best C parameter is 0.585121602289
Test recall with the best C parameter is 0.5
Test AUC with the best C parameter is 0.587758112094


## 6.C SVM
C: Penalty parameter C of the error term.

gamma: kernel coefficient. 

kernel: kernel type. 

In [18]:
best_score = 0

for c_paramter in [0.001, 0.1, 10, 100]: #iterate over the values we need to try for the parameter C
    for gamma_paramter in [0.001, 0.1, 10, 100]: #iterate over the values we need to try for the parameter gamma
        for k_parameter in ['rbf']: # iterate over the values we need to try for the kernel parameter
            print(c_paramter, gamma_paramter, k_parameter)
            svmModel = SVC(kernel=k_parameter, C=c_paramter, gamma=gamma_paramter) #define the model
            # perform cross-validation
            scores = cross_val_score(svmModel, X_trainval_scaled, Y_trainval, cv=kfolds, scoring='accuracy')
            # the training set will be split internally into training and cross validation

            # compute mean cross-validation accuracy
            score = np.mean(scores)
            # if we got a better score, store the score and parameters
            if score > best_score:
                best_score = score #store the score 
                best_parameter_c = c_paramter #store the parameter c
                best_parameter_gamma = gamma_paramter #store the parameter gamma
                best_parameter_k = k_parameter
            

# rebuild a model with best parameters to get score 
SelectedSVMmodel = SVC(C=best_parameter_c, gamma=best_parameter_gamma, kernel=best_parameter_k).fit(X_trainval_scaled, Y_trainval)

test_score = SelectedSVMmodel.score(X_test_scaled, Y_test)
PredictedOutput = SelectedSVMmodel.predict(X_test_scaled)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on cross validation set is:", best_score)
print("Best parameter for c is: ", best_parameter_c)
print("Best parameter for gamma is: ", best_parameter_gamma)
print("Best parameter for kernel is: ", best_parameter_k)
print("Test accuracy with the best parameters is", test_score)
print("Test recall with the best parameters is", test_recall)
print("Test recall with the best parameter is", test_auc)

m = 'SVM'
acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

0.001 0.001 rbf
0.001 0.1 rbf
0.001 10 rbf
0.001 100 rbf
0.1 0.001 rbf
0.1 0.1 rbf
0.1 10 rbf
0.1 100 rbf
10 0.001 rbf
10 0.1 rbf
10 10 rbf
10 100 rbf
100 0.001 rbf
100 0.1 rbf
100 10 rbf
100 100 rbf
Best accuracy on cross validation set is: 0.584816606236
Best parameter for c is:  100
Best parameter for gamma is:  0.1
Best parameter for kernel is:  rbf
Test accuracy with the best parameters is 0.578313253012
Test recall with the best parameters is 0.484507042254
Test recall with the best parameter is 0.573886174188


## 6.D Decision Tree
Maximum depth. [1, 2, ..., 5]


In [35]:
best_score = 0

for md in range(1, 6): # iterate different maximum depth values
    # train the model
    treeModel = DecisionTreeClassifier(random_state=0, max_depth=md, criterion='gini')
    # perform cross-validation
    scores = cross_val_score(treeModel, X_trainval_scaled, Y_trainval, cv=kfolds, scoring='accuracy')
    
    # compute mean cross-validation accuracy
    score = np.mean(scores)
    
    # if we got a better score, store the score and parameters
    if score > best_score:
        best_score = score
        best_parameter = md

# Rebuild a model on the combined training and validation set        
SelectedDTModel = DecisionTreeClassifier(max_depth=best_parameter).fit(X_trainval_scaled, Y_trainval )

test_score = SelectedDTModel.score(X_test_scaled, Y_test)
PredictedOutput = SelectedDTModel.predict(X_test_scaled)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameter for the maximum depth is: ", best_parameter)
print("Test accuracy with best parameter is ", test_score)
print("Test recall with best parameters is ", test_recall)
print("Test AUC with the best parameter is ", test_auc)

m = 'Decision Tree'
acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

Best accuracy on validation set is: 0.576536127692
Best parameter for the maximum depth is:  4
Test accuracy with best parameter is  0.55078683834
Test recall with best parameters is  0.447222222222
Test AUC with the best parameter is  0.553994591937


## 6.E Random Forest Classifier
n_estimators(M): the number of trees in the forest

max_features(d): the number of features to consider when looking for the best split

max_depth(m): the maximum depth of the tree. 

In [36]:
best_score = 0

for M in range(2, 30, 2): # combines M trees
    for d in range(1, 5): # maximum number of features considered at each split
        for m in range(1, 5): # maximum depth of the tree
            # train the model
            # n_jobs(4) is the number of parallel computing
            forestModel = RandomForestClassifier(n_estimators=M, max_features=d, n_jobs=-1,
                                          max_depth=m, random_state=0)
        
            # perform cross-validation
            scores = cross_val_score(forestModel, X_trainval_scaled, Y_trainval, cv=kfolds, scoring='accuracy')

            # compute mean cross-validation accuracy
            score = np.mean(scores)

            # if we got a better score, store the score and parameters
            if score > best_score:
                best_score = score
                best_M = M
                best_d = d
                best_m = m

# Rebuild a model on the combined training and validation set        
SelectedRFModel = RandomForestClassifier(n_estimators=M, max_features=d,
                                          max_depth=m, random_state=0).fit(X_trainval_scaled, Y_trainval )

PredictedOutput = SelectedRFModel.predict(X_test_scaled)
test_score = SelectedRFModel.score(X_test_scaled, Y_test)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameters of M, d, m are: ", best_M, best_d, best_m)
print("Test accuracy with the best parameters is", test_score)
print("Test recall with the best parameters is:", test_recall)
print("Test AUC with the best parameters is:", test_auc)

m = 'Random Forest'
acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

Best accuracy on validation set is: 0.586560032105
Best parameters of M, d, m are:  28 3 2
Test accuracy with the best parameters is 0.565092989986
Test recall with the best parameters is: 0.447222222222
Test AUC with the best parameters is: 0.568743854474


## 6.F AdaBoost

In [37]:
best_score = 0

for M in range(2, 30, 2): # combines M trees
    for lr in [0.001, 0.01, 0.1, 1]:
        # train the model
        boostModel = AdaBoostClassifier(n_estimators=M, learning_rate=lr, random_state=0)

        # perform cross-validation
        scores = cross_val_score(boostModel, X_trainval_scaled, Y_trainval, cv=kfolds, scoring='accuracy')

        # compute mean cross-validation accuracy
        score = np.mean(scores)

        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_M = M
            best_lr = lr

# Rebuild a model on the combined training and validation set        
SelectedBoostModel = AdaBoostClassifier(n_estimators=M, learning_rate=lr, random_state=0).fit(X_trainval_scaled, Y_trainval )

PredictedOutput = SelectedBoostModel.predict(X_test_scaled)
test_score = SelectedRFModel.score(X_test_scaled, Y_test)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameter of M is: ", best_M)
print("best parameter of LR is: ", best_lr)
print("Test accuracy with the best parameter is", test_score)
print("Test recall with the best parameters is:", test_recall)
print("Test AUC with the best parameters is:", test_auc)

m = 'AdaBoost'
acc.append([m, test_score, test_recall, test_auc, fpr, tpr, thresholds])

Best accuracy on validation set is: 0.590844615335
Best parameter of M is:  28
best parameter of LR is:  0.1
Test accuracy with the best parameter is 0.565092989986
Test recall with the best parameters is: 0.483333333333
Test AUC with the best parameters is: 0.567625368732


## Results

In [38]:
result = pd.DataFrame(acc, columns=['Model', 'Accuracy', 'Recall', 'AUC', 'TPR', 'FPR', 'TH'])
result[['Model', 'Accuracy', 'Recall', 'AUC']]

Unnamed: 0,Model,Accuracy,Recall,AUC
0,Logistic Regression,0.585122,0.5,0.587758
1,Decision Tree,0.550787,0.447222,0.553995
2,Random Forest,0.565093,0.447222,0.568744
3,AdaBoost,0.565093,0.483333,0.567625


In [14]:
result = pd.DataFrame(acc, columns=['Model', 'Accuracy', 'Recall', 'AUC', 'TPR', 'FPR', 'TH'])
result[['Model', 'Accuracy', 'Recall', 'AUC']]

Unnamed: 0,Model,Accuracy,Recall,AUC
0,Logistic Regression,0.580991,0.552113,0.579628
1,SVM,0.578313,0.484507,0.573886
2,Decision Tree,0.572959,0.450704,0.567189
3,Random Forest,0.576975,0.478873,0.572345
4,AdaBoost,0.576975,0.538028,0.567483


In [2]:
## End