## Step 1: Library import

In [3]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn import svm
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier


import xray_data #y data: 1 = NORMAL, 0 = PNEUMONIA
import random

random.seed(207)

## Step: Load and preprocess data
Note: data is resized and preprocessed as read, as a memory optimization.

In [4]:
#Constants
scale = 200
subset = 'PROP'
label_filter = ['NORMAL','PNEUMONIA','COVID19','TURBERCULOSIS']

In [5]:
X_dev_orig, y_dev_orig = xray_data.load_val(scale,label_filter)
print(f'X_dev_orig, y_dev_orig shape: {X_dev_orig.shape, y_dev_orig.shape}')
print(f'y_dev_orig shape for NORMAL cases: {y_dev_orig[y_dev_orig ==1].shape}')
print('----')

100% (10 of 10) |########################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (8 of 8) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (8 of 8) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (12 of 12) |########################| Elapsed Time: 0:00:00 Time:  0:00:00


COVID19: 8
PNEUMONIA: 8
NORMAL: 8
TURBERCULOSIS: 8
Total: 32
X_dev_orig, y_dev_orig shape: ((32, 40000), (32,))
y_dev_orig shape for NORMAL cases: (8,)
----


In [6]:
# cut of for training samples of each class, only 230 normal rows

X_test, y_test = xray_data.load_test(scale,label_filter,subset = subset)
print(f'X_test, y_test shape: {X_test.shape, y_test.shape}')
print(f'y_test shape for NORMAL cases: {y_test[y_test ==1].shape}')
print('----')

100% (106 of 106) |######################| Elapsed Time: 0:00:10 Time:  0:00:10
100% (390 of 390) |######################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (234 of 234) |######################| Elapsed Time: 0:00:08 Time:  0:00:08
100% (41 of 41) |########################| Elapsed Time: 0:00:00 Time:  0:00:00


COVID19: 106
PNEUMONIA: 390
NORMAL: 234
TURBERCULOSIS: 41
Total: 771
X_test, y_test shape: ((771, 40000), (771,))
y_test shape for NORMAL cases: (390,)
----


In [7]:
# cut of for training samples of each class, only 1300 normal rows

X_train, y_train = xray_data.load_train(scale,label_filter,subset = subset)

100% (460 of 460) |######################| Elapsed Time: 0:00:37 Time:  0:00:37
100% (3875 of 3875) |####################| Elapsed Time: 0:01:13 Time:  0:01:13
100% (1341 of 1341) |####################| Elapsed Time: 0:00:56 Time:  0:00:56
100% (650 of 650) |######################| Elapsed Time: 0:00:10 Time:  0:00:10


COVID19: 460
PNEUMONIA: 3875
NORMAL: 1341
TURBERCULOSIS: 650
Total: 6326


In [8]:


X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size = .2, stratify = y_train, random_state = 207 )
print(f'X_train, y_train shape: {X_train.shape, y_train.shape}')
print(f'y_train shape for NORMAL cases: {y_train[y_train ==1].shape}')
print('----')
print(f'X_dev, y_dev shape: {X_dev.shape, y_dev.shape}')
print(f'y_dev shape for NORMAL cases: {y_dev[y_dev ==1].shape}')

X_train, y_train shape: ((5060, 40000), (5060,))
y_train shape for NORMAL cases: (3099,)
----
X_dev, y_dev shape: ((1266, 40000), (1266,))
y_dev shape for NORMAL cases: (776,)


## Baseline Ensembles

First, using a hard voting structure where we just take the mode of the classifications from n models.

Second, using a soft voting structure in which the probabilities are weighed evenly. One issue with this is that the logistic regression model performs best when the data is scaled. On the flip side, the Naive Bayes model can't take in scaled data (i.e. negative values). So, we try with unscaled data to start. 

We simulate the creation of another ensemble model below for which we first scale the data passed into the logistic regression model, then find the optimal weight distribution across all models which maximizes the score.

In [17]:
from sklearn.ensemble import VotingClassifier

***HARD VOTING BASELINE ENSEMBLE***

In [18]:
#create an ensemble model that takes the mode of the classifications of the underlying single models
clf1 = KNeighborsClassifier(n_neighbors = 3, metric = 'euclidean')
clf2 = svm.SVC(C = 10,gamma = .0001,kernel='rbf', probability = True)
clf3= LogisticRegression(max_iter = 1000)
clf4 = MultinomialNB(alpha = 71)


ensemble_hard = VotingClassifier(estimators=[('KNN', clf1),('SVM',clf2),('LR', clf3),('MNB',clf4)], voting='hard')
ensemble_hard = ensemble_hard.fit(X_train, y_train)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
#print hard voting mechanism score

ensemble_hard.score(X_dev, y_dev)

0.9454976303317536

***SOFT VOTING BASELINE ENSEMBLE***

In [20]:
#create an ensemble model that averages the predict_proba_ values for each individual values...
#...then makes a classification based off the average probability

ensemble_soft = VotingClassifier(estimators=[('KNN', clf1),('SVM',clf2),('LR', clf3),('MNB',clf4)], 
                             voting='soft'
                            )
ensemble_soft = ensemble_soft.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
#print soft voting mechanism score
ensemble_soft.score(X_dev, y_dev)

0.9447077409162717

In [23]:
pred = ensemble_soft.predict(X_dev)

In [19]:
from sklearn.metrics import plot_confusion_matrix

grid = confusion_matrix(y_dev, pred)
plt.imshow(grid, cmap = 'summer')

for (j,i),label in np.ndenumerate(grid):
    plt.text(i,j,label,ha='center',va='center')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.xticks([0,1,2,3])
plt.yticks([0,1,2,3])

In [20]:
probs = ensemble_soft._collect_probas(X_dev)

## Weighted Average of Prediction Probabilities

Since we want to feed in a scaled version of the data for the logistic regression model to function best (and we can't feed standard scaled data to the multinomial NB model), we manually build an ensemble model using a soft voting system. 

We iterate (inefficiently) through various weights to see which yeilds the best performance for the ensemble model.

In [9]:
clf1 = KNeighborsClassifier(n_neighbors = 3, metric = 'euclidean')
clf2 = svm.SVC(C = 10,gamma = .0001,kernel='rbf', probability = True)
clf3= LogisticRegression(max_iter = 1000)
clf4 = MultinomialNB(alpha = 71)

In [10]:
sc = StandardScaler()
sc.fit(X_train)
X_train_sc = sc.transform(X_train)
X_dev_sc = sc.transform(X_dev)

clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train_sc, y_train)
clf4.fit(X_train, y_train)

prob1 = clf1.predict_proba(X_dev) #knn
prob2 = clf2.predict_proba(X_dev) #svm
prob3 = clf3.predict_proba(X_dev_sc) #log
prob4 = clf4.predict_proba(X_dev) #NB

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
prob1_weights = np.arange(0,31) #knn
prob2_weights = np.arange(0,31) #svm
prob3_weights = np.arange(0,31) #log
prob4_weights = np.arange(0,31) #NB
weights = []
scores = []


for i in prob1_weights:
    for j in prob2_weights:
        for k in prob3_weights:
            for l in prob4_weights:
                probs = (prob1*i+prob2*j+prob3*k+prob4*l)/4
                pred= probs.argmax(axis = 1)
                score = (pred == y_dev).sum()/pred.shape[0]
                weights.append([i,j,k,l])
                scores.append(score)
                
weights = np.array(weights)
scores = np.array(scores)

In [12]:
optimal_weights = (weights[scores.argmax()]/weights[scores.argmax()].sum())*100

In [13]:
highest_score = (scores[scores.argmax()])

In [14]:
print(f'Highest Ensemble Score: {highest_score}.')
print(f'Optimal Weights: {optimal_weights}.')

Highest Ensemble Score: 0.9605055292259084.
Optimal Weights: [ 7.69230769 57.69230769 30.76923077  3.84615385].


In [15]:
#create a dataframe of the weights applied to each individual model within the ensemble as well as the percentage
weight_df = pd.concat([pd.DataFrame(weights),
           pd.DataFrame(weights/weights.sum(axis=1, keepdims=True)).fillna(0)], axis =1)

  pd.DataFrame(weights/weights.sum(axis=1, keepdims=True)).fillna(0)], axis =1)


In [16]:
weight_df.columns = ['knn','svm','log','naive','knn_weight','svm_weight','log_weight','naive_weight']

In [17]:

#then drop the duplicate percentages
weight_df['score'] = scores
weight_df.drop_duplicates(subset = ['knn_weight','svm_weight','log_weight','naive_weight'])

Unnamed: 0,knn,svm,log,naive,knn_weight,svm_weight,log_weight,naive_weight,score
0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.211690
1,0,0,0,1,0.000000,0.000000,0.000000,1.000000,0.751185
31,0,0,1,0,0.000000,0.000000,1.000000,0.000000,0.936809
32,0,0,1,1,0.000000,0.000000,0.500000,0.500000,0.770142
33,0,0,1,2,0.000000,0.000000,0.333333,0.666667,0.754344
...,...,...,...,...,...,...,...,...,...
923503,30,30,30,13,0.291262,0.291262,0.291262,0.126214,0.953397
923507,30,30,30,17,0.280374,0.280374,0.280374,0.158879,0.950237
923509,30,30,30,19,0.275229,0.275229,0.275229,0.174312,0.949447
923513,30,30,30,23,0.265487,0.265487,0.265487,0.203540,0.947867


In [18]:
weight_df[(weight_df.knn +weight_df.svm +weight_df.log + weight_df.naive == 1)]

Unnamed: 0,knn,svm,log,naive,knn_weight,svm_weight,log_weight,naive_weight,score
1,0,0,0,1,0.0,0.0,0.0,1.0,0.751185
31,0,0,1,0,0.0,0.0,1.0,0.0,0.936809
961,0,1,0,0,0.0,1.0,0.0,0.0,0.955766
29791,1,0,0,0,1.0,0.0,0.0,0.0,0.893365


# Bagging Ensemble

Model using parameters from our best guess model from our non-ensemble versions

In [None]:
MLPclf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,2), max_iter = 1000, random_state=1)
baggingclf = BaggingClassifier(MLPclf, max_samples=0.5, max_features=0.5)

baggingclf.fit(X_train, y_train)

In [None]:
bagging_pred_dev = baggingclf.predict(X_dev)
print(f'Bagging Multi Layer Perceptron Model Accuracy: {f1_score(bagging_pred_dev,y_dev,average="weighted")*100:9.5}')


# Adaboost Ensemble

AdaBoost uses low accuracy classifiers, with weighting.  Given the complexity of our feature space we would not expect simple classifiers to be very good predictors, even when combined.


In [None]:
AB_clf = AdaBoostClassifier(n_estimators=100)

AB_clf.fit(X_train, y_train)


In [None]:
AB_clf_pred_dev = AB_clf.predict(X_dev)
print(f'Adaboost Model Accuracy: {f1_score(AB_clf_pred_dev,y_dev,average="weighted")*100:9.5}')