# Model Training and Validation 

- Takes train and validation scan IDs (i.e. individuals) from the cross-validation fold generator.
- Uses the train generator to sample slices from case and control training scans, optionally applying data augmentation.
- Trains the model for a single hyper-parameter configuration and reports performance on validation slices.

### Imports

In [3]:
import os
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from tensorflow import keras
from tensorflow.keras import layers

sys.path.append('../')
from lib.training_utils import *

### Paths

In [4]:
# Input locations -- all left empty here; fill them in before running on real data.
root_data_dir = ""  # local or Dropbox location for dicoms

metadata_file = ""  # scan IDs, slice paths, lung partition, and demographic info
CV_dict_file = ""   # k-fold generator


### Configs

In [5]:
# --- Cross-validation ---
RANDOM_SEED = 153         # seed passed to the stratified k-fold shuffle
n_folds = 2
generate_CV_folds = True  # True: build folds in this notebook; False: load them from CV_dict_file

# --- Grouping / labels ---
group_col = 'DX'          # 'partition_id' or 'DX'
control_label, case_label = 0, 1

# --- Model input / output ---
slice_shape = (10, 10)
num_classes = 2

### Load data

In [6]:
# Dummy data
# Synthetic metadata: `n_scans` scans with `n_parts` slices each. The first half of
# the scans are controls (DX=0) and the remainder are cases (DX=1).
metadata_df = pd.DataFrame()
n_scans = 100
n_parts = 3
metadata_df['CT_id'] = np.repeat(range(n_scans), n_parts)
# `n_scans - n_scans // 2` keeps the column length correct when n_scans is odd.
dummy_dx = np.concatenate([np.zeros(n_scans // 2), np.ones(n_scans - n_scans // 2)])
metadata_df['DX'] = np.repeat(dummy_dx, n_parts)
metadata_df['slice_label'] = metadata_df['DX']  # ideal case: slice label == scan diagnosis
metadata_df['slice_id'] = range(n_scans * n_parts)

# Real data
# metadata_df  = pd.read_pickle(metadata_file)
# n_scans = len(metadata_df['CT_id'].unique())
# n_slices = len(np.hstack(metadata_df['slice_ids'].values))
# n_partitions = len(metadata_df['partition_id'].unique())
# n_cases = len(metadata_df[metadata_df[group_col]==case_label])
# n_controls = len(metadata_df[metadata_df[group_col]==control_label])

# print('Number of total scans: {}, slices: {}, partitions: {}\ncases: {}, controls: {}'.format(
#     n_scans, n_slices, n_partitions, n_cases, n_controls))

# One row per scan. X holds the scan IDs and y the group label used for stratification.
# Both are defined unconditionally because later cells index them with the fold
# indices regardless of whether the folds were generated here or loaded from disk.
CV_df = metadata_df[['CT_id', group_col]].drop_duplicates()
X = CV_df['CT_id'].values
y = CV_df[group_col].values

if not generate_CV_folds:
    # NOTE: pickle can execute arbitrary code on load -- only point CV_dict_file
    # at a file you created yourself.
    # NOTE(review): the stored indices are assumed to be positions into X as built
    # above -- confirm the pickle was generated from the same metadata.
    with open(CV_dict_file, "rb") as cv_file:
        CV_index_dict = pickle.load(cv_file)

else:
    print('Generating CV folds to divide scans IDs balanced by groups...')

    skf = StratifiedKFold(n_splits=n_folds, random_state=RANDOM_SEED, shuffle=True)

    # Maps fold number -> {'train': positions into X, 'test': positions into X}.
    CV_index_dict = {}
    for cv, (train_index, test_index) in enumerate(skf.split(X, y)):
        print("TRAIN:", train_index, "TEST:", test_index)
        # skf.split yields positions into X, not scan IDs, so translate them to
        # CT_ids before filtering the metadata (the two only coincide on dummy data).
        train_scan_ids = X[train_index]
        slice_labels = metadata_df[metadata_df['CT_id'].isin(train_scan_ids)]['slice_label'].value_counts()
        print(slice_labels)
        CV_index_dict[cv] = {'train': train_index, 'test': test_index}
        

Generating CV folds to divide scans IDs balanced by groups...
TRAIN: [ 0  2  5  7  8  9 11 12 13 18 22 24 25 31 32 33 34 37 38 39 41 43 44 48
 49 50 52 53 55 56 57 61 62 63 64 65 67 68 69 70 72 74 80 82 83 85 89 93
 97 99] TEST: [ 1  3  4  6 10 14 15 16 17 19 20 21 23 26 27 28 29 30 35 36 40 42 45 46
 47 51 54 58 59 60 66 71 73 75 76 77 78 79 81 84 86 87 88 90 91 92 94 95
 96 98]
1.0    75
0.0    75
Name: slice_label, dtype: int64
TRAIN: [ 1  3  4  6 10 14 15 16 17 19 20 21 23 26 27 28 29 30 35 36 40 42 45 46
 47 51 54 58 59 60 66 71 73 75 76 77 78 79 81 84 86 87 88 90 91 92 94 95
 96 98] TEST: [ 0  2  5  7  8  9 11 12 13 18 22 24 25 31 32 33 34 37 38 39 41 43 44 48
 49 50 52 53 55 56 57 61 62 63 64 65 67 68 69 70 72 74 80 82 83 85 89 93
 97 99]
1.0    75
0.0    75
Name: slice_label, dtype: int64


### Generate train, val, and test slice samples for a CV fold

In [7]:
cv_idx = 0
val_subset_frac = 0.2 #this is a subset within the train samples that is used to validate and select hyper-params

train_plus_val_index = CV_index_dict[cv_idx]['train']
test_index = CV_index_dict[cv_idx]['test']

# Carve the validation subset out of the fold's train indices with a seeded,
# stratified split. Simply taking the tail of the (sorted) index array is not
# label-balanced: on the dummy data, where controls precede cases, the tail
# contains cases only.
train_index, val_index = train_test_split(
    train_plus_val_index,
    test_size=val_subset_frac,
    stratify=y[train_plus_val_index],
    random_state=RANDOM_SEED,
)
# Restore ascending order so the index arrays look like the ones skf.split produces.
train_index, val_index = np.sort(train_index), np.sort(val_index)

print('Number of scans\ntrain: {}, val: {}, test: {}'.format(len(train_index),len(val_index),len(test_index)))

# CT IDs
X_train, X_val, X_test = X[train_index], X[val_index], X[test_index]
y_train, y_val, y_test = y[train_index], y[val_index], y[test_index]


def _make_slice_generator(scan_ids, data_subset):
    """Build a batch generator for `scan_ids` and report the size of its first batch.

    One batch is drawn from the generator (and discarded) so the number of slice
    samples per batch can be printed; the generator is returned after that draw.
    """
    gen = batch_generator(scan_ids, metadata_df, data_subset, slice_shape, num_classes)
    batch = next(gen)
    # batch[1] is measured with len(); presumably the label array -- TODO confirm
    # against lib.training_utils.batch_generator.
    print('number of slice samples in a {} batch: {}'.format(data_subset, len(batch[1])))
    return gen


## Train generator
train_gen = _make_slice_generator(X_train, 'train')

## Val generator
val_gen = _make_slice_generator(X_val, 'val')

## Test generator
test_gen = _make_slice_generator(X_test, 'test')

Number of scans
train: 40, val: 10, test: 50
number of slice samples in a train batch: 120
number of slice samples in a val batch: 30
number of slice samples in a test batch: 150


### Model config

In [8]:
# Model / data parameters
input_shape = (slice_shape[0], slice_shape[1], 1)

model = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(8, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(16, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
)

model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 8, 8, 8)           80        
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 4, 4, 8)           0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 2, 2, 16)          1168      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 1, 16)          0         
_________________________________________________________________
flatten (Flatten)            (None, 16)                0         
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
____

### Train model

In [9]:
# NOTE(review): batch_size is not passed to fit() or to the generators in this
# notebook, so it currently has no effect -- the generators decide the batch size.
batch_size = 10
epochs = 15
steps_per_epoch = 10   # number of batches drawn from train_gen per epoch
validation_steps = 1   # number of batches drawn from val_gen per validation pass

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Keep the returned History object: it holds the per-epoch metrics for later
# inspection, and assigning it stops the cell from ending on an object repr.
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fb5b9c399b0>

### Evaluate Model (at the moment on dummy data)

In [10]:
# Evaluate on 10 batches drawn from the test generator; with the single
# "accuracy" metric compiled above, `score` is [loss, accuracy].
score = model.evaluate(test_gen, steps=10, verbose=0)
for metric_label, metric_value in zip(("Test loss:", "Test accuracy:"), score):
    print(metric_label, metric_value)

Test loss: 0.7207905530929566
Test accuracy: 0.5
