## Step 1: Library import

In [19]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split

from sklearn import svm
from sklearn.model_selection import GridSearchCV

#import xray_data #y data: 1 = NORMAL, 0 = PNEUMONIA

In [2]:
import NIHCC_xray_data


86524 images in training set
25596 images in test set
valid tags for dataset:  ['Cardiomegaly', 'No Finding', 'Hernia', 'Infiltration', 'Nodule', 'Emphysema', 'Effusion', 'Atelectasis', 'Mass', 'Pneumothorax', 'Pleural_Thickening', 'Fibrosis', 'Consolidation', 'Edema', 'Pneumonia']


## Step 2: Load and preprocess data
Note: data is resized and preprocessed as it is read, as a memory optimization.

In [3]:
#Constants
scale = 200
# Kaggle data options
#label_filter = ['NORMAL','PNEUMONIA','COVID19','TURBERCULOSIS']
#NIHCC options
label_filter = ['Cardiomegaly', 'No Finding', 'Hernia', 'Infiltration', 
                'Nodule', 'Emphysema', 'Effusion', 'Atelectasis', 
                'Mass', 'Pneumothorax', 'Pleural_Thickening', 'Fibrosis',
                'Consolidation', 'Edema', 'Pneumonia']
subset = 'PROP' # either "EQL" or "PROP"

In [4]:
# cut of for training samples of each class, only 230 normal rows
test_cutoff =1000

X_test, y_test = NIHCC_xray_data.load_test(scale,label_filter,test_cutoff,subset=subset)
print(f'X_test, y_test shape: {X_test.shape, y_test.shape}')
print(f'y_test shape for NORMAL cases: {y_test[y_test ==1].shape}')
print('----')

100% (17853 of 17853) |##################| Elapsed Time: 0:00:47 Time:  0:00:47


Hernia: 2
Mass: 24
Pneumothorax: 53
No Finding: 552
Emphysema: 17
Cardiomegaly: 17
Infiltration: 124
Pleural_Thickening: 17
Effusion: 65
Consolidation: 26
Edema: 12
Atelectasis: 44
Fibrosis: 9
Pneumonia: 4
Nodule: 25
Total: 991
X_test, y_test shape: ((991, 40000), (991,))
y_test shape for NORMAL cases: (552,)
----


In [5]:
# cut of for training samples of each class, only 1300 normal rows
train_cutoff = 10000

X_train, y_train = NIHCC_xray_data.load_train(scale,label_filter,max_count=train_cutoff,subset=subset)


100% (73471 of 73471) |##################| Elapsed Time: 0:09:03 Time:  0:09:03


Cardiomegaly: 105
No Finding: 6873
Infiltration: 997
Nodule: 305
Emphysema: 79
Effusion: 379
Atelectasis: 464
Pleural_Thickening: 111
Fibrosis: 74
Mass: 230
Pneumonia: 31
Pneumothorax: 168
Hernia: 8
Consolidation: 112
Edema: 54
Total: 9990


#### Step2a: Adjust all image sets by mean/std of train

In [6]:
#(mean,std) = xray_data.find_mean_std(X_train)
#print(mean,std)
#print(X_train.shape)
#adjusted_X_train = xray_data.normalize_images(X_train,mean,std)
#print (f'adjusted train {adjusted_X_train.shape}')
#adjusted_X_dev_org = xray_data.normalize_images(X_dev_orig,mean,std)
#adjusted_X_test = xray_data.normalize_images(X_test,mean,std)
# uncomment to use rescaled data
# X_train = adjusted_X_train
# X_dev_org = adjusted_X_dev_orig
# X_test = adjusted_X_test

### Step 2b: Split data into dev and train
Original dev data set is too small for much validity


In [7]:
# I'm not sure what the final stratify parameter is doing.  We'll want to revist when we do full cross validation

X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size = .1, stratify = y_train )
print(f'X_train, y_train shape: {X_train.shape, y_train.shape}')
print(f'y_train shape for NORMAL cases: {y_train[y_train ==1].shape}')
print('----')
print(f'X_dev, y_dev shape: {X_dev.shape, y_dev.shape}')
print(f'y_dev shape for NORMAL cases: {y_dev[y_dev ==1].shape}')

X_train, y_train shape: ((8991, 40000), (8991,))
y_train shape for NORMAL cases: (6186,)
----
X_dev, y_dev shape: ((999, 40000), (999,))
y_dev shape for NORMAL cases: (687,)


## Step 3: Single Model training

Default values:  The default model for SVC is 
 - C=1, 
 - kernel='rbf', 
 - gamma=1/(n_features * X.var()).  For a 200x200 rescale this results in gamma = .000025/X.var().  We may want to go with smaller gamma values.



In [8]:

base_model = svm.SVC()

base_model.fit(X_train,y_train)



Single Model accuracy:    68.769


In [None]:
y_pred = base_model.predict(X_dev)

In [25]:
print(f'Single Model accuracy: {accuracy_score(y_pred,y_dev)*100:9.5}')
f1score = f1_score(y_pred,y_dev,average='weighted')

print(f'Single Model f1_score: {f1score*100:9.5}')

Single Model accuracy:    68.769
Single Model f1_score:    81.495


## Step 4  CrossValidated Gridsearch models

In [13]:
param_grid={'C':[10],'gamma':[0.0001],'kernel':['rbf']}

svc = svm.SVC()
CV_model = GridSearchCV(svc,param_grid,verbose=3)

CV_model.fit(X_train,y_train)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.687 total time=24.3min
[CV 2/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.688 total time=23.9min
[CV 3/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.688 total time=24.0min
[CV 4/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.688 total time=24.1min
[CV 5/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.688 total time=23.7min


GridSearchCV(estimator=SVC(),
             param_grid={'C': [10], 'gamma': [0.0001], 'kernel': ['rbf']},
             verbose=3)

In [14]:
print(f'Best score: {CV_model.best_score_*100:9.5}')
print(f'Best params: {CV_model.best_params_}')

Best score:     68.78
Best params: {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}


In [27]:
y_pred = CV_model.predict(X_dev)

In [28]:
print(f'Best Fit Model accuracy: {accuracy_score(y_pred,y_dev)*100:9.5}')

f1score = f1_score(y_pred,y_dev,average='weighted')

print(f'Best Fit Model f1_score: {f1score*100:9.5}')

Best Fit Model accuracy:    68.769
Best Fit Model f1_score:    81.495


In [29]:
confusion_matrix(y_dev,y_pred)


array([[  0,  11,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0, 687,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0, 100,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,  31,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   8,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,  38,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,  46,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,  17,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,  11,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
       