# Training machine learning models

## Loading training and validation datasets

In [7]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# Project paths. A notebook has no `__file__`, so everything is anchored at the
# kernel's current working directory (resolved to an absolute path).
ROOT = Path.cwd().resolve()
DATA = ROOT / 'data'
MODELS = ROOT / 'models'
TRAIN_AND_VAL_DS = DATA / 'US_2023_JUL_25_complete_cases_reviewd.csv'
TEST_DS = DATA / 'US_2023_AGO_01_all_cases_validationset.csv'

train_and_val_ds = pd.read_csv(TRAIN_AND_VAL_DS)
test_ds = pd.read_csv(TEST_DS)

# Boolean row masks over the combined training/validation table.
retrospective = train_and_val_ds['study'] == 'retrospective'
prospective = train_and_val_ds['study'] == 'prospective'
birads_4a = train_and_val_ds['birads'] == '4a'
birads_4b = train_and_val_ds['birads'] == '4b'
prospective_4ab = prospective & (birads_4a | birads_4b)

# Split the prospective 4a/4b rows: 20% are added to the training data
# (`extra4_ds`), the remaining 80% form the validation set. The fixed
# random_state keeps the split reproducible across runs.
extra4_ds, val_ds = train_test_split(
    train_and_val_ds[prospective_4ab],
    train_size=0.2,
    random_state=7,
)

# Training set = every retrospective row plus the 20% prospective 4a/4b sample.
train_ds = pd.concat([train_and_val_ds[retrospective], extra4_ds])

# Displayed as the cell output: number of training rows per `birads` value.
train_ds['birads'].value_counts()

birads
4a    407
5     225
4b    204
4c    184
3      44
6      18
Name: count, dtype: int64

In [8]:
# Number of validation rows per `birads` value (the validation split is drawn
# only from prospective rows labelled '4a' or '4b').
val_ds['birads'].value_counts()

birads
4a    170
4b     62
Name: count, dtype: int64

In [9]:
# Number of test rows per `birads` value.
# NOTE(review): the recorded output shows upper-case labels here ('4A', '4B',
# '4C') while the training/validation data uses lower-case ('4a', '4b');
# normalise the case before filtering or comparing `birads` across datasets.
test_ds['birads'].value_counts()

birads
4A    130
5      88
4C     71
6      55
4B     55
3       4
Name: count, dtype: int64

In [10]:
# Name of the label column and the columns that are not model inputs.
TARGET = 'result'
NON_FEATURES = ['birads', 'result', 'study']
# The test frame additionally has identifier columns that are dropped too.
# NOTE(review): 'pt_id' / 'img_id' are not dropped from the training and
# validation frames — confirm those columns are absent there.
TEST_NON_FEATURES = ['pt_id', 'img_id'] + NON_FEATURES

# Feature matrix / target vector for each dataset.
X_train, y_train = train_ds.drop(columns=NON_FEATURES), train_ds[TARGET]
X_val, y_val = val_ds.drop(columns=NON_FEATURES), val_ds[TARGET]
X_test, y_test = test_ds.drop(columns=TEST_NON_FEATURES), test_ds[TARGET]

## Hyperparameter tuning

In [None]:
from joblib import dump

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Absolute import: a notebook is not part of a package, so a relative
# `from .models import ...` raises ImportError.
from models import pipelines, param_grids

# Pipelines tuned with a randomized search (5 sampled candidates); every other
# pipeline gets an exhaustive grid search.
RANDOMIZED_SEARCH_MODELS = ('XB', 'RF')

# Maps pipeline name -> (best parameters, accuracy of the refit best estimator).
best_clfs = {}

for name, pipe in pipelines.items():
    if name in RANDOMIZED_SEARCH_MODELS:
        clf = RandomizedSearchCV(pipe,
                                 param_distributions=param_grids[name],
                                 n_iter=5,
                                 scoring='accuracy',
                                 n_jobs=4,
                                 cv=10,
                                 verbose=3,
                                 # Fixed seed so the sampled candidates are
                                 # the same on every run.
                                 random_state=7)
    else:
        clf = GridSearchCV(pipe,
                           param_grid=param_grids[name],
                           scoring='accuracy',
                           cv=10)
    clf.fit(X_train, y_train)
    # NOTE(review): clf.score(X_train, y_train) evaluates the refit best
    # estimator on the very data it was trained on, so it is optimistic;
    # clf.best_score_ holds the cross-validated accuracy.
    best_clfs[name] = (clf.best_params_, clf.score(X_train, y_train))

# Make sure the output directory exists before persisting the results.
MODELS.mkdir(parents=True, exist_ok=True)
dump(best_clfs, MODELS / 'res_hyperparameter_tuning.joblib')

## Selecting between the different models

In [11]:
from joblib import load

# best_clfs = load(MODELS / 'res_hyperparameter_tuning.joblib')

# temporary:
import pickle
from sklearn.pipeline import Pipeline
from models import ct, models

# Hyperparameters are read from a previously saved pickle file.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load artifacts produced by this project.
best_params = None  # placeholder so the name exists even if loading fails
params_file = MODELS / 'grid_search_dictionary_all_2023_5_24.pkl'
with open(params_file, 'rb') as fh:
    best_params = pickle.load(fh)


def _build_pipeline(step_name, estimator_cls):
    """Return a two-step pipeline: the shared transformer `ct`, then the estimator.

    The estimator is instantiated with the keyword arguments stored at
    ``best_params[step_name][0][0]`` (presumably the best parameter set of
    the saved search — TODO confirm the pickle layout).
    """
    tuned_kwargs = best_params[step_name][0][0]
    return Pipeline([
        ('ct', ct),
        (step_name, estimator_cls(**tuned_kwargs)),
    ])


pipelines = {
    step_name: _build_pipeline(step_name, estimator_cls)
    for step_name, estimator_cls in models.items()
}

# Fit every pipeline on the training data, logging progress by name.
for step_name, pipe in pipelines.items():
    print(f'Fitting {step_name}')
    pipe.fit(X_train, y_train)

Fitting SVM
Fitting RF
Fitting LR
Fitting DT
Fitting AB
Fitting XB


  if is_sparse(data):


Fitting KN
Fitting MLP
Fitting GB




In [12]:
from sklearn.metrics import classification_report

# Column order of the validation metrics table.
METRIC_COLUMNS = ['npv', 'sens', 'spec', 'ppv', 'acc', 'f1']


def _validation_metrics(pipe):
    """Return the validation metrics of one fitted pipeline as a dict.

    Label 1 is treated as the positive class and label 0 as the negative
    class: sensitivity / PPV / F1 are the recall / precision / F1 of label 1,
    specificity / NPV are the recall / precision of label 0.
    """
    report = classification_report(y_val,
                                   pipe.predict(X_val),
                                   labels=[1, 0],
                                   output_dict=True)
    positive, negative = report['1'], report['0']
    return {
        'npv' : negative['precision'],
        'sens': positive['recall'],
        'spec': negative['recall'],
        'ppv' : positive['precision'],
        'acc' : report['accuracy'],
        'f1'  : positive['f1-score'],
    }


# Build the table in one go from per-model records. Filling an empty frame
# row by row with .loc leaves every column with dtype `object`, which breaks
# numeric helpers such as .round() or .describe().
clf_df = pd.DataFrame(
    [_validation_metrics(pipe) for pipe in pipelines.values()],
    index=list(pipelines.keys()),
    columns=METRIC_COLUMNS,
)

# Displayed as the cell output: models ranked by negative predictive value.
clf_df.sort_values('npv', ascending=False)

Unnamed: 0,npv,sens,spec,ppv,acc,f1
DT,0.970149,0.666667,0.911215,0.387097,0.892241,0.489796
XB,0.969388,0.666667,0.88785,0.333333,0.87069,0.444444
RF,0.969231,0.666667,0.883178,0.324324,0.866379,0.436364
AB,0.968912,0.666667,0.873832,0.307692,0.857759,0.421053
SVM,0.968586,0.666667,0.864486,0.292683,0.849138,0.40678
KN,0.967742,0.666667,0.841121,0.26087,0.827586,0.375
MLP,0.965686,0.611111,0.920561,0.392857,0.896552,0.478261
GB,0.963918,0.611111,0.873832,0.289474,0.853448,0.392857
LR,0.963542,0.611111,0.864486,0.275,0.844828,0.37931
