## About

In this notebook, I trained EfficientNet-B3. The model trained here can be used to run the inference notebook.

# Setting

In [1]:
!pip install ../input/kerasapplications/keras-team-keras-applications-3b180cb -f ./ --no-index
!pip install ../input/efficientnet/efficientnet-1.1.0/ -f ./ --no-index

Looking in links: ./
Processing /kaggle/input/kerasapplications/keras-team-keras-applications-3b180cb
Building wheels for collected packages: Keras-Applications
  Building wheel for Keras-Applications (setup.py) ... [?25l- \ done
[?25h  Created wheel for Keras-Applications: filename=Keras_Applications-1.0.8-py3-none-any.whl size=50704 sha256=260f6469a8bc17a26efe860298399d7d31a3ce4e0b11489cac95d273f068d8a4
  Stored in directory: /root/.cache/pip/wheels/f4/96/13/eccdd9391bd8df958d78851b98ec4dc207ba05b67b011eb70a
Successfully built Keras-Applications
Installing collected packages: Keras-Applications
Successfully installed Keras-Applications-1.0.8
Looking in links: ./
Processing /kaggle/input/efficientnet/efficientnet-1.1.0
Building wheels for collected packages: efficientnet
  Building wheel for efficientnet (setup.py) ... [?25l- done
[?25h  Created wheel for efficientnet: filename=efficientnet-1.1.0-py3-none-any.whl size=14141 sha256=d892a7deae453c3c2bc89c1d769a902

In [2]:
# Import Libraries
import os
import cv2
import pydicom
import pandas as pd
import numpy as np 
import random


from tqdm.notebook import tqdm 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.cluster import KMeans
from plotnine import *
import matplotlib.pyplot as plt 

import tensorflow as tf 
from tensorflow.keras.layers import (
    Dense, Dropout, Activation, Flatten, Input, BatchNormalization, GlobalAveragePooling2D, Add, Conv2D, AveragePooling2D, 
    LeakyReLU, Concatenate 
)
from tensorflow.keras import Model
from tensorflow.keras.utils import Sequence
import tensorflow.keras.backend as K
import tensorflow.keras.applications as tfa
import efficientnet.tfkeras as efn


import seaborn as sns
import time
import json

In [3]:
# Setting Seeds
def seed_everything(seed=2020):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    
seed_everything(42)

In [4]:
# Setting TensorFlow
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

In [5]:
# Load Setting File
json_open = open('../input/settings/settings.json')
SETTINGS = json.load(json_open)

PATH = SETTINGS['PATH']
PARAMS = SETTINGS['PARAMS']
PARAMS_TRAIN_EFF = SETTINGS['PARAMS_TRAIN_EFF']

print(SETTINGS)

{'PATH': {'ROOT': '../input/', 'TRAIN_CSV_PATH': '../input/osic-pulmonary-fibrosis-progression/train.csv', 'TEST_CSV_PATH': '../input/osic-pulmonary-fibrosis-progression/test.csv', 'SAMPLESUB_CSV_PATH': '../input/osic-pulmonary-fibrosis-progression/sample_submission.csv', 'TRAIN_DATA_DIR': '../input/osic-pulmonary-fibrosis-progression/train/', 'TEST_DATA_DIR': '../input/osic-pulmonary-fibrosis-progression/test/', 'MASK_NOISE_DIR': '../input/osic-pulmonary-fibrosis-progression-lungs-mask/mask_noise/mask_noise/', 'TRAIN_MODEL_WEIGHTS_DIR': '../input/training-osic-2nd-place-code/'}, 'PARAMS': {'MODELS_EFF': [0], 'WEIGHT_EFF': 0.2, 'NFOLDS': 4, 'BATCH_SIZE': 128, 'EPOCHS': 300, 'EARLY_STOPPING': 150, 'NUM_LAST_FVC': 2, 'QS1': 0.2, 'QS2': 0.5, 'QS3': 0.8}, 'PARAMS_TRAIN_EFF': {'EPOCHS': 50, 'BATCH_SIZE': 8, 'NFOLDS': 5, 'LR': 0.003, 'MODEL_CLASS': 'b3'}, 'FEATURES': ['Sex_Female', 'Sex_Male', 'SmokingStatus_Currently smokes', 'SmokingStatus_Ex-smoker', 'SmokingStatus_Never smoked', 'age', '

# Preprocess

## Cluster Patients

In [6]:
# Load Train Data
train_df = pd.read_csv(PATH["TRAIN_CSV_PATH"]) 

# Linear Regression with the last few FVC
training_patient = train_df[~train_df.duplicated(subset=['Patient'])]

for patient in tqdm(train_df['Patient'].unique()): 
    idx = train_df['Patient'] == patient
    z = ((train_df.loc[idx, 'FVC'].values[-PARAMS['NUM_LAST_FVC']:] - train_df.loc[idx, 'FVC'].values[-PARAMS['NUM_LAST_FVC']:].mean()) /
         train_df.loc[idx, 'FVC'].values[-PARAMS['NUM_LAST_FVC']:].std())
    reg = LinearRegression(normalize=True,fit_intercept=True).fit(train_df.loc[idx, 'Weeks'].values[-PARAMS['NUM_LAST_FVC']:].reshape(-1,1),z)
    train_df.loc[idx, 'Intercept_2'] = reg.intercept_
    train_df.loc[idx, 'Coef_2'] = reg.coef_[0]
    
training_patient = train_df.drop_duplicates('Patient') 
(ggplot(training_patient) + aes(x='Intercept_2',y='Coef_2',fill='Sex',size='FVC') + geom_point(alpha=0.4) )

# Clustering Patients
cust_array = np.array([training_patient['Intercept_2'].tolist(),training_patient['Coef_2'].tolist()])
cust_array = cust_array.T

kmeans = KMeans(n_clusters=4, random_state=0)
clusters = kmeans.fit(cust_array)
training_patient['Group'] = clusters.labels_

(ggplot(training_patient) + aes(x='Intercept_2',y='Coef_2',fill='Group') + geom_point(alpha=0.4) )

train_df = pd.merge(train_df, training_patient[['Patient','Group']],on='Patient', how='inner')

HBox(children=(FloatProgress(value=0.0, max=176.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Feature Engineering

In [7]:
# Get Tabular Data
def get_tab(df):
    vector = [(df.Age.values[0] - 30) / 30] 
    
    if df.Sex.values[0] == 'male':
       vector.append(0)
    else:
       vector.append(1)
    
    if df.SmokingStatus.values[0] == 'Never smoked':
        vector.extend([0,0])
    elif df.SmokingStatus.values[0] == 'Ex-smoker':
        vector.extend([1,1])
    elif df.SmokingStatus.values[0] == 'Currently smokes':
        vector.extend([0,1])
    else:
        vector.extend([1,0])
    return np.array(vector) 


A = {} 
TAB = {} 
P = [] 
G = []
for i, p in tqdm(enumerate(train_df['Patient'].unique())):
    sub = train_df.loc[train_df['Patient'] == p, :] 
    fvc = sub['FVC'].values
    weeks = sub['Weeks'].values
    group = sub['Group'].values[0]
    c = np.vstack([weeks, np.ones(len(weeks))]).T
    a, b = np.linalg.lstsq(c, fvc)[0]
    
    A[p] = a
    TAB[p] = get_tab(sub)
    P.append(p)
    G.append(group)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))






# Training

## Define Model

In [8]:
# Get Image Data
def get_img(path):
    d = pydicom.dcmread(path)
    return cv2.resize((d.pixel_array - d.RescaleIntercept) / (d.RescaleSlope * 1000), (512, 512))


# Load Data
class IGenerator(Sequence):
    BAD_ID = ['ID00011637202177653955184', 'ID00052637202186188008618']
    def __init__(self, keys, a, tab, batch_size=PARAMS_TRAIN_EFF['BATCH_SIZE']):
        self.keys = [k for k in keys if k not in self.BAD_ID]
        self.a = a
        self.tab = tab
        self.batch_size = batch_size
        
        self.train_data = {}
        for p in train_df['Patient'].values:
            self.train_data[p] = os.listdir(f'{PATH["TRAIN_DATA_DIR"]}{p}/')
    
    def __len__(self):
        return 1000
    
    def __getitem__(self, idx):
        x = []
        a, tab = [], [] 
        keys = np.random.choice(self.keys, size = self.batch_size)
        for k in keys:
            try:
                i = np.random.choice(self.train_data[k], size=1)[0]
                img = get_img(f'{PATH["TRAIN_DATA_DIR"]}{k}/{i}')
                x.append(img)
                a.append(self.a[k])
                tab.append(self.tab[k])
            except:
                print(k, i)
       
        x,a,tab = np.array(x), np.array(a), np.array(tab)
        x = np.expand_dims(x, axis=-1)
        return [x, tab] , a
    
    
# Make Model
def get_efficientnet(model, shape):
    models_dict = {
        'b0': efn.EfficientNetB0(input_shape=shape,weights=None,include_top=False),
        'b1': efn.EfficientNetB1(input_shape=shape,weights=None,include_top=False),
        'b2': efn.EfficientNetB2(input_shape=shape,weights=None,include_top=False),
        'b3': efn.EfficientNetB3(input_shape=shape,weights=None,include_top=False),
        'b4': efn.EfficientNetB4(input_shape=shape,weights=None,include_top=False),
        'b5': efn.EfficientNetB5(input_shape=shape,weights=None,include_top=False),
        'b6': efn.EfficientNetB6(input_shape=shape,weights=None,include_top=False),
        'b7': efn.EfficientNetB7(input_shape=shape,weights=None,include_top=False)
    }
    return models_dict[model]

def build_model(shape=(512, 512, 1), model_class=None):
    inp = Input(shape=shape)
    base = get_efficientnet(model_class, shape)
    x = base(inp)
    x = GlobalAveragePooling2D()(x)
    inp2 = Input(shape=(4,))
    x2 = tf.keras.layers.GaussianNoise(0.2)(inp2)
    x = Concatenate()([x, x2]) 
    x = Dropout(0.5)(x) 
    x = Dense(1)(x)
    model = Model([inp, inp2] , x)
    return model

## Train

In [9]:
# Split patients data with StratifiedKFold based on the above grouping.
P = np.array(P)
G = np.array(G)

skf = StratifiedKFold(n_splits = PARAMS_TRAIN_EFF['NFOLDS'])
splitter = skf.split(P,G)

# Cross-Validation
subs = []
folds_history = []
for fold, (tr_idx, val_idx) in enumerate(splitter):
    print('#####################')
    print('####### Fold %i ######'%fold)
    print('#####################')
    print('Training...')
    
    er = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta=1e-3,
        patience=10,
        verbose=1,
        mode="auto",
        baseline=None,
        restore_best_weights=True,
    )

    cpt = tf.keras.callbacks.ModelCheckpoint(
        filepath='fold-%i.h5'%fold,
        monitor='val_loss', 
        verbose=1, 
        save_best_only=True,
        mode='auto'
    )

    rlp = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', 
        factor=0.5,
        patience=5, 
        verbose=1, 
        min_lr=1e-8
    )
    model = build_model(model_class=PARAMS_TRAIN_EFF['MODEL_CLASS'])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=PARAMS_TRAIN_EFF['LR']), loss="mae") 
    history = model.fit_generator(IGenerator(keys=P[tr_idx], 
                                   a = A, 
                                   tab = TAB), 
                        steps_per_epoch = 32,
                        validation_data=IGenerator(keys=P[val_idx], 
                                   a = A, 
                                   tab = TAB),
                        validation_steps = 16, 
                        callbacks = [cpt, rlp], 
                        epochs=PARAMS_TRAIN_EFF['EPOCHS'])
    folds_history.append(history.history)
    print('Training done!')



#####################
####### Fold 0 ######
#####################
Training...
Epoch 1/50
Epoch 00001: val_loss improved from inf to 10746.61914, saving model to fold-0.h5
Epoch 2/50
Epoch 00002: val_loss improved from 10746.61914 to 4.47555, saving model to fold-0.h5
Epoch 3/50
Epoch 00003: val_loss did not improve from 4.47555
Epoch 4/50
Epoch 00004: val_loss improved from 4.47555 to 4.07307, saving model to fold-0.h5
Epoch 5/50
Epoch 00005: val_loss did not improve from 4.07307
Epoch 6/50
Epoch 00006: val_loss improved from 4.07307 to 3.93673, saving model to fold-0.h5
Epoch 7/50
Epoch 00007: val_loss did not improve from 3.93673
Epoch 8/50
Epoch 00008: val_loss did not improve from 3.93673
Epoch 9/50
Epoch 00009: val_loss did not improve from 3.93673
Epoch 10/50
Epoch 00010: val_loss improved from 3.93673 to 3.90206, saving model to fold-0.h5
Epoch 11/50
Epoch 00011: val_loss did not improve from 3.90206
Epoch 12/50
Epoch 00012: val_loss did not improve from 3.90206
Epoch 13/50
Epoc

# Evaluation

In [10]:
mean_val_loss = np.mean([np.min(h['val_loss']) for h in folds_history])
print('Our mean CV MAE is: ' + str(mean_val_loss))

Our mean CV MAE is: 3.703711175918579
