# Training machine learning models

## Loading training, validation, and test datasets

In [1]:
from pathlib import Path

import pandas as pd

ROOT = Path("__file__").resolve().parent
DATA = ROOT / 'data'
MODELS = ROOT / 'models'
TRAIN_AND_VAL_DS = DATA / 'US_2023_JUL_25_complete_cases_reviewd.csv'
TEST_DS = DATA / 'US_2023_AGO_01_all_cases_validationset.csv'

train_and_val_ds = pd.read_csv(TRAIN_AND_VAL_DS)
test_ds = pd.read_csv(TEST_DS)

retrospective = train_and_val_ds['study'] == 'retrospective'
prospective = train_and_val_ds['study'] == 'prospective'
birads_4a = train_and_val_ds['birads'] == '4a'
birads_4b = train_and_val_ds['birads'] == '4b'
prospective_4ab = prospective & (birads_4a | birads_4b)
prospective_other = prospective & (~birads_4a & ~birads_4b)

train_ds = train_and_val_ds[retrospective | prospective_other]
val_ds = train_and_val_ds[prospective_4ab]

print(train_ds.shape)
train_ds['birads'].value_counts()

(1236, 11)


birads
4a    357
5     308
4c    257
4b    196
3      63
6      55
Name: count, dtype: int64

In [2]:
print(val_ds.shape)
val_ds['birads'].value_counts()

(290, 11)


birads
4a    220
4b     70
Name: count, dtype: int64

In [3]:
print(test_ds.shape)
test_ds['birads'].value_counts()

(403, 13)


birads
4A    130
5      88
4C     71
6      55
4B     55
3       4
Name: count, dtype: int64

In [4]:
X_train = train_ds.drop(columns=['birads', 'result', 'study'])
y_train = train_ds['result']

X_val = val_ds.drop(columns=['birads', 'result', 'study'])
y_val = val_ds['result']

X_test = test_ds.drop(columns=['pt_id', 'img_id', 'birads', 'result', 'study'])
y_test = test_ds['result']

## Loading best parameters

In [5]:
import pickle

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

models = {
    'SVM': SVC,
    'RF' : RandomForestClassifier,
    'LR' : LogisticRegression,
    'DT' : DecisionTreeClassifier,
    'AB' : AdaBoostClassifier,
    'XB' : XGBClassifier,
    'KN' : KNeighborsClassifier,
    'MLP': MLPClassifier,
    'GB' : GaussianNB,
}

ct = ColumnTransformer([
        ('scaler', StandardScaler(), ['age', 'size']),
        ('encoder', OneHotEncoder(), ['margins']),
    ], remainder='passthrough')

best_params = None
with open(MODELS / 'grid_search_dictionary_all_2023_5_24.pkl', 'rb') as f:
    best_params = pickle.load(f)

pipelines = {name: Pipeline([
    ('ct', ct),
    (name, model(**best_params[name][0][0]))
]) for name, model in models.items()}

for name, model in pipelines.items():
    print('Fitting {}'.format(name))
    model.fit(X_train, y_train)

Fitting SVM
Fitting RF
Fitting LR
Fitting DT
Fitting AB
Fitting XB


  if is_sparse(data):


Fitting KN
Fitting MLP
Fitting GB




## Model comparison with bootstrap

In [6]:
from sklearn.metrics import classification_report
from sklearn.utils import resample

clf_df = pd.DataFrame(
    columns=['npv', 'sens', 'spec', 'ppv', 'acc', 'f1'],
    index=pipelines.keys()).fillna(0.0)

for i in range(100):
    re_X_val, re_y_val = resample(X_val, y_val, random_state=1)
    for name, model in pipelines.items():
        report = classification_report(re_y_val,
                                    model.predict(re_X_val),
                                    labels=[1, 0],
                                    output_dict=True)

        clf_df.loc[name] += pd.Series({
            'npv' : report['0']['precision'],
            'sens': report['1']['recall'],
            'spec': report['0']['recall'],
            'ppv' : report['1']['precision'],
            'acc' : report['accuracy'],
            'f1'  : report['1']['f1-score'],                
        })

# average 100 runs of bootstrap
clf_df *= 1.0/100
clf_df.sort_values('npv', ascending=False)

Unnamed: 0,npv,sens,spec,ppv,acc,f1
AB,0.973913,0.793103,0.858238,0.383333,0.851724,0.516854
DT,0.970443,0.793103,0.754789,0.264368,0.758621,0.396552
RF,0.970339,0.758621,0.877395,0.407407,0.865517,0.53012
XB,0.961207,0.689655,0.854406,0.344828,0.837931,0.45977
LR,0.960526,0.689655,0.83908,0.322581,0.824138,0.43956
MLP,0.959016,0.655172,0.896552,0.413043,0.872414,0.506667
KN,0.958333,0.655172,0.881226,0.38,0.858621,0.481013
GB,0.95279,0.62069,0.850575,0.315789,0.827586,0.418605
SVM,0.945378,0.551724,0.862069,0.307692,0.831034,0.395062


## Threshold tunning for improved NPV
    

In [7]:
FT = MODELS / 'finetune_2023_7_28.csv'

ft_df = pd.read_csv(FT, index_col=0)
ft_df

Unnamed: 0,thres
DT,0.17
LR,0.2
RF,0.28
SVM,0.16
MLP,0.28
KN,0.32
GB,-3.087808e-16
AB,0.48
XB,0.18


In [10]:
clf2_df = pd.DataFrame(
    columns=['npv', 'sens', 'spec', 'ppv', 'acc', 'f1'],
    index=pipelines.keys())

for name, model in pipelines.items():
    # keep scores for positive outcomes only
    y_scores = model.predict_proba(X_test)[:, 1]
    t = ft_df.loc[name, 'thres'] - 0.009
    y_pred_adj = [1 if y >= t else 0 for y in y_scores]
    report = classification_report(y_test,
                                   y_pred_adj,
                                   labels=[1, 0],
                                   output_dict=True)

    clf2_df.loc[name] = pd.Series({
        'npv' : report['0']['precision'],
        'sens': report['1']['recall'],
        'spec': report['0']['recall'],
        'ppv' : report['1']['precision'],
        'acc' : report['accuracy'],
        'f1'  : report['1']['f1-score'],                
    })

clf2_df.sort_values('npv', ascending=False)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,npv,sens,spec,ppv,acc,f1
AB,1.0,1.0,0.023121,0.576441,0.580645,0.73132
LR,0.978947,0.991304,0.537572,0.74026,0.796526,0.847584
SVM,0.978495,0.991304,0.526012,0.735484,0.791563,0.844444
XB,0.976744,0.991304,0.485549,0.719243,0.774194,0.833638
RF,0.963303,0.982609,0.606936,0.768707,0.82134,0.862595
KN,0.94,0.973913,0.543353,0.739274,0.789082,0.840525
DT,0.905882,0.965217,0.445087,0.698113,0.741935,0.810219
MLP,0.0,1.0,0.0,0.57072,0.57072,0.726698
GB,0.0,1.0,0.0,0.57072,0.57072,0.726698
