In [1]:
import sys, os
if os.path.abspath(os.pardir) not in sys.path:
    sys.path.insert(0, os.path.abspath(os.pardir))
import CONFIG
%reload_ext autoreload
%autoreload 2

In [9]:
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import preprocessing
from sklearn import model_selection
from tqdm.autonotebook import tqdm

In [3]:
DATA_DIR = CONFIG.CFG.DATA.BASE
K_FOLDS = 5
BATCH_SIZE = 32
LEARNING_RATE = 3e-3
NUM_EPOCHS = 1100
# print loss every
PRINT_EVERY = 50

QUANTILES = [0.2, 0.5, 0.8]
SCALE_COLUMNS = ['Weeks', 'FVC', 'Percent', 'Age'] #'Percent'
SEX_COLUMNS = ['Male', 'Female']
SMOKING_STATUS_COLUMNS = ['Currently smokes', 'Ex-smoker', 'Never smoked']
FV = SEX_COLUMNS + SMOKING_STATUS_COLUMNS + SCALE_COLUMNS
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

In [138]:
kf = model_selection.KFold(K_FOLDS)
MIN_MAX_SCALER = preprocessing.MinMaxScaler()

In [139]:
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sub_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
# remove the duplicates from the train_df
train_df.drop_duplicates(keep=False, inplace=True, subset=['Patient', 'Weeks'])

In [140]:
# extract the Patient and weeks from the Patient_Week column
sub_df['Patient'] = sub_df['Patient_Week'].apply(lambda x: x.split('_')[0])
sub_df['Weeks'] = sub_df['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub_df.head()

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12


In [141]:
# merge the sub_df with the test_df
sub_df = sub_df.drop('FVC', axis=1).merge(test_df.drop('Weeks', axis=1), on='Patient')
sub_df.head()

Unnamed: 0,Patient_Week,Confidence,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264_-12,100,ID00419637202311204720264,-12,3020,70.186855,73,Male,Ex-smoker
1,ID00419637202311204720264_-11,100,ID00419637202311204720264,-11,3020,70.186855,73,Male,Ex-smoker
2,ID00419637202311204720264_-10,100,ID00419637202311204720264,-10,3020,70.186855,73,Male,Ex-smoker
3,ID00419637202311204720264_-9,100,ID00419637202311204720264,-9,3020,70.186855,73,Male,Ex-smoker
4,ID00419637202311204720264_-8,100,ID00419637202311204720264,-8,3020,70.186855,73,Male,Ex-smoker


In [142]:
train_df['FROM'] = 'train'
test_df['FROM'] = 'val'
sub_df['FROM'] = 'test'

In [143]:
combined_df = train_df.append([test_df, sub_df])

In [144]:
# initialize base_week column
combined_df['Base_Week'] = combined_df['Weeks']
# make the weeks from sub_df to be np.nan so that when we calculate the base_week it comes from the test_df
combined_df.loc[combined_df['FROM'] == 'test', 'Base_Week'] = np.nan
# now calculate the min for each patient group and set it to the Base_Week column
combined_df['Base_Week'] = combined_df.groupby('Patient')['Base_Week'].transform('min')

In [145]:
# get the base_df (where the Base_Week == the min_week we calculated) so that we can get the base_fvc, base_age and base_percentage
base_df = combined_df[combined_df['Weeks'] == combined_df['Base_Week']]

In [146]:
base_df.rename(columns={
    'FVC': 'Base_FVC',
    'Percent': 'Base_Percent',
    'Age': 'Base_Age'
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [147]:
combined_df = combined_df.merge(base_df[['Patient', 'Base_FVC', 'Base_Percent', 'Base_Age']], on='Patient', how='left')

In [148]:
combined_df['Weeks_Passed'] = combined_df['Weeks'] - combined_df['Base_Week']

In [149]:
combined_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,FROM,Patient_Week,Confidence,Base_Week,Base_FVC,Base_Percent,Base_Age,Weeks_Passed
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,train,,,-4.0,2315,58.253649,79,0.0
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,train,,,-4.0,2315,58.253649,79,9.0
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,train,,,-4.0,2315,58.253649,79,11.0
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,train,,,-4.0,2315,58.253649,79,13.0
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,train,,,-4.0,2315,58.253649,79,15.0


In [150]:
MIN_MAX_SCALER.fit(combined_df[combined_df['FROM'] == 'train'][['Weeks_Passed', 'FVC', 'Percent', 'Age']])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [151]:
combined_df[['Weeks_Passed', 'Base_FVC', 'Base_Percent', 'Base_Age']] = MIN_MAX_SCALER.transform(combined_df[['Weeks_Passed', 'Base_FVC', 'Base_Percent', 'Base_Age']])

In [152]:
combined_df.tail()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,FROM,Patient_Week,Confidence,Base_Week,Base_FVC,Base_Percent,Base_Age,Weeks_Passed
3821,ID00426637202313170790466,132,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_132,100.0,0.0,0.376525,0.345604,0.615385,2.095238
3822,ID00426637202313170790466,132,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_132,100.0,0.0,0.376525,0.345604,0.615385,2.095238
3823,ID00426637202313170790466,133,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_133,100.0,0.0,0.376525,0.345604,0.615385,2.111111
3824,ID00426637202313170790466,133,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_133,100.0,0.0,0.376525,0.345604,0.615385,2.111111
3825,ID00426637202313170790466,133,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_133,100.0,0.0,0.376525,0.345604,0.615385,2.111111


In [153]:
train_df['Base_Week'] = train_df['Weeks']
train_df['Base_Week'] = train_df.groupby('Patient')['Base_Week'].transform('min')
train_df['Weeks_Passed'] = train_df['Weeks'] - train_df['Base_Week']
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,FROM,Base_Week,Weeks_Passed
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,train,-4,0
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,train,-4,9
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,train,-4,11
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,train,-4,13
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,train,-4,15


In [54]:
# fit the min_max_scaler
MIN_MAX_SCALER.fit_transform(train_df[['Weeks_Passed', 'FVC', 'Percent', 'Age']])

array([[0.        , 0.26704953, 0.23639327, 0.76923077],
       [0.14285714, 0.24892319, 0.21594131, 0.76923077],
       [0.17460317, 0.22146447, 0.18495963, 0.76923077],
       ...,
       [0.49206349, 0.37347452, 0.3422443 , 0.61538462],
       [0.68253968, 0.38549892, 0.35548363, 0.61538462],
       [0.93650794, 0.3494257 , 0.31576566, 0.61538462]])

In [61]:
# get the data from the first time so we can add BASE_ columns for fvc percent and age columns
base_df = train_df[train_df['Weeks'] == train_df['Base_Week']]
# rename the columns before joining
base_df.rename(columns={
    'FVC': 'Base_FVC',
    'Percent': 'Base_Percent',
    'Age': 'Base_Age'
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [62]:
base_df.head()

Unnamed: 0,Patient,Weeks,Base_FVC,Base_Percent,Base_Age,Sex,SmokingStatus,Base_Week,Weeks_Passed
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,-4,0
9,ID00009637202177434476278,8,3660,85.282878,69,Male,Ex-smoker,8,0
18,ID00010637202177584971671,0,3523,94.724672,60,Male,Ex-smoker,0,0
27,ID00011637202177653955184,6,3326,85.98759,72,Male,Ex-smoker,6,0
36,ID00012637202177665765362,33,3418,93.726006,65,Male,Never smoked,33,0


In [63]:
train_df = train_df.merge(base_df[['Patient', 'Base_FVC', 'Base_Percent', 'Base_Age']], on='Patient')

In [64]:
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Base_Week,Weeks_Passed,Base_FVC,Base_Percent,Base_Age
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,-4,0,2315,58.253649,79
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,-4,9,2315,58.253649,79
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,-4,11,2315,58.253649,79
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,-4,13,2315,58.253649,79
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,-4,15,2315,58.253649,79


In [65]:
train_df[['Weeks_Passed', 'Base_FVC', 'Base_Percent', 'Base_Age']] = MIN_MAX_SCALER.fit_transform(train_df[['Weeks_Passed', 'Base_FVC', 'Base_Percent', 'Base_Age']])

In [66]:
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Base_Week,Weeks_Passed,Base_FVC,Base_Percent,Base_Age
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,-4,0.0,0.241456,0.135886,0.769231
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,-4,0.142857,0.241456,0.135886,0.769231
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,-4,0.174603,0.241456,0.135886,0.769231
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,-4,0.206349,0.241456,0.135886,0.769231
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,-4,0.238095,0.241456,0.135886,0.769231


In [67]:
# convert categoricals into dummies
train_df['Sex'] = pd.Categorical(train_df['Sex'], categories=SEX_COLUMNS)
train_df['SmokingStatus'] = pd.Categorical(train_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)
train_df = train_df.join(pd.get_dummies(train_df['Sex']))
train_df = train_df.join(pd.get_dummies(train_df['SmokingStatus']))

In [68]:
# REMOVE THE ONES FROM THE TRAIN_DF THAT ARE PRESENT IN TEST_DF AS WELL
TEST_PATIENTS = test_df['Patient'].unique().tolist()
valid_df = train_df[train_df['Patient'].isin(TEST_PATIENTS)]
train_df = train_df[~train_df['Patient'].isin(TEST_PATIENTS)]
TRAIN_PATIENTS = train_df['Patient'].unique().tolist()

In [69]:
test_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,6,3020,70.186855,73,Male,Ex-smoker
1,ID00421637202311550012437,15,2739,82.045291,68,Male,Ex-smoker
2,ID00422637202311677017371,6,1930,76.672493,73,Male,Ex-smoker
3,ID00423637202312137826377,17,3294,79.258903,72,Male,Ex-smoker
4,ID00426637202313170790466,0,2925,71.824968,73,Male,Never smoked


In [70]:
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Base_Week,Weeks_Passed,Base_FVC,Base_Percent,Base_Age,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,-4,0.0,0.241456,0.135886,0.769231,1,0,0,1,0
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,-4,0.142857,0.241456,0.135886,0.769231,1,0,0,1,0
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,-4,0.174603,0.241456,0.135886,0.769231,1,0,0,1,0
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,-4,0.206349,0.241456,0.135886,0.769231,1,0,0,1,0
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,-4,0.238095,0.241456,0.135886,0.769231,1,0,0,1,0


Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12


In [73]:
# have to make it categorical coz sub's sex column has males only
sub_df['Sex'] = pd.Categorical(sub_df['Sex'], categories=SEX_COLUMNS)
sub_df['SmokingStatus'] = pd.Categorical(sub_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)

In [74]:
sub_df = sub_df.join(pd.get_dummies(sub_df['Sex']))
sub_df = sub_df.join(pd.get_dummies(sub_df['SmokingStatus']))

In [76]:
sub_df.rename(columns={
    'FVC': 'Base_FVC',
    'Percent': 'Base_Percent',
    'Age': 'Base_Age'
}, inplace=True)

In [77]:
sub_df.head()

Unnamed: 0,Patient_Week,Confidence,Patient,Weeks,Base_FVC,Base_Percent,Base_Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00419637202311204720264_-12,100,ID00419637202311204720264,-12,3020,70.186855,73,Male,Ex-smoker,1,0,0,1,0
1,ID00419637202311204720264_-11,100,ID00419637202311204720264,-11,3020,70.186855,73,Male,Ex-smoker,1,0,0,1,0
2,ID00419637202311204720264_-10,100,ID00419637202311204720264,-10,3020,70.186855,73,Male,Ex-smoker,1,0,0,1,0
3,ID00419637202311204720264_-9,100,ID00419637202311204720264,-9,3020,70.186855,73,Male,Ex-smoker,1,0,0,1,0
4,ID00419637202311204720264_-8,100,ID00419637202311204720264,-8,3020,70.186855,73,Male,Ex-smoker,1,0,0,1,0


In [16]:
sub_df[SCALE_COLUMNS] = MIN_MAX_SCALER.transform(sub_df[SCALE_COLUMNS])
sub_df.head()

Unnamed: 0,Patient_Week,Confidence,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00419637202311204720264_-12,100,ID00419637202311204720264,-0.050725,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
1,ID00419637202311204720264_-11,100,ID00419637202311204720264,-0.043478,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
2,ID00419637202311204720264_-10,100,ID00419637202311204720264,-0.036232,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
3,ID00419637202311204720264_-9,100,ID00419637202311204720264,-0.028986,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
4,ID00419637202311204720264_-8,100,ID00419637202311204720264,-0.021739,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0


In [17]:
class PulmonaryDataset(torch.utils.data.Dataset):
    def __init__(self, df, FV, test=False):
        self.df = df
        self.test = test
        self.FV = FV

    def __getitem__(self, idx):
        return {
            'features': torch.tensor(self.df[self.FV].iloc[idx].values),
            'target': torch.tensor(self.df['FVC'].iloc[idx])
        }

    def __len__(self):
        return len(self.df)

In [18]:
class PulmonaryModel(nn.Module):
    def __init__(self, in_features=9, out_quantiles=3):
        super(PulmonaryModel, self).__init__()
        self.fc1 = nn.Linear(in_features, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, out_quantiles)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [19]:
clip_one, clip_two = torch.tensor(70, dtype=torch.float32).to(DEVICE), torch.tensor(1000, dtype=torch.float32).to(DEVICE)
def score(y_true, y_pred):
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = torch.max(sigma, clip_one)
    delta = torch.abs(y_true - fvc_pred)
    delta = torch.min(delta, clip_two)
    sq2 = torch.sqrt(torch.tensor(2, dtype=torch.float32))
    metric = (delta / sigma_clip) * sq2 + torch.log(sigma_clip * sq2)
    return torch.mean(metric)

In [20]:
def quantile_loss(preds, target, quantiles, _lambda):
    assert not target.requires_grad
    assert preds.size(0) == target.size(0)
    losses = []
    for i, q in enumerate(quantiles):
        errors = target - preds[:, i]
        losses.append(torch.max((q - 1) * errors, q * errors).unsqueeze(1))
    loss = torch.mean(torch.sum(torch.cat(losses, dim=1), dim=1))
    # return loss
    return _lambda * loss + (1 - _lambda) * score(target, preds)

In [21]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [22]:
def train_one_epoch(model, train_data_loader, optimizer, train_loss):
    model.train()
    for i, data in enumerate(train_data_loader):
        features = data['features']
        targets = data['target']

        features = features.to(DEVICE).float()
        targets = targets.to(DEVICE).float()

        model.zero_grad()
        out = model(features)
        loss = quantile_loss(out, targets, QUANTILES, 0.8)
        train_loss.update(loss, features.size(0))
        loss.backward()
        optimizer.step()

In [23]:
def eval_one_epoch(model, valid_data_loader, valid_loss, lr_scheduler):
    model.eval()

    with torch.no_grad():
        for i, data in enumerate(valid_data_loader):
            features = data['features']
            targets = data['target']

            features = features.to(DEVICE).float()
            targets = targets.to(DEVICE).float()
            
            out = model(features)
            loss = quantile_loss(out, targets, QUANTILES, 0.8)
            valid_loss.update(loss, features.size(0))
    
    if lr_scheduler is not None:
        lr_scheduler.step(valid_loss.avg)

In [28]:
for fold, (train_index, test_index) in enumerate(kf.split(TRAIN_PATIENTS)):
    model = PulmonaryModel(len(FV))
    model = model.to(DEVICE)

    df_train = train_df.iloc[train_index].reset_index(drop=True)
    df_valid = train_df.iloc[test_index].reset_index(drop=True)

    train_dataset = PulmonaryDataset(df_train, FV)
    valid_dataset = PulmonaryDataset(df_valid, FV)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=10,
        shuffle=True,
        num_workers=4
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=4,
        shuffle=False,
        num_workers=4
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=50, factor=0.7, verbose=True)

    best_valid_loss = float('inf')

    
    # tq = tqdm(range(NUM_EPOCHS), desc=f"Fold {fold}")
    for epoch in range(NUM_EPOCHS):
        train_loss = AverageMeter()
        valid_loss = AverageMeter()

        train_one_epoch(model, train_data_loader, optimizer, train_loss)
        eval_one_epoch(model, valid_data_loader, valid_loss, lr_scheduler)

        if epoch % PRINT_EVERY == 0:
            print(f"Epoch {epoch}/{NUM_EPOCHS}, Loss {train_loss.avg}")
            print(f"Fold {fold}, Valid Loss {valid_loss.avg} \n")
        
        # tq.set_postfix(val_loss=valid_loss.avg.item())

        if valid_loss.avg < best_valid_loss:
            best_valid_loss = valid_loss.avg
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, os.path.join(CONFIG.CFG.DATA.MODELS_OUT, f"model_fold_{fold}.pt"))

Epoch 0/1100, Loss 1.1062694787979126
Fold 0, Valid Loss 1.0288283824920654 

Epoch 50/1100, Loss 0.9313132166862488
Fold 0, Valid Loss 0.9332516193389893 

Epoch 100/1100, Loss 0.9269670248031616
Fold 0, Valid Loss 0.9317992925643921 

Epoch   134: reducing learning rate of group 0 to 2.1000e-03.
Epoch 150/1100, Loss 0.9231132864952087
Fold 0, Valid Loss 0.9285225868225098 

Epoch   185: reducing learning rate of group 0 to 1.4700e-03.
Epoch 200/1100, Loss 0.9218205809593201
Fold 0, Valid Loss 0.9237077236175537 

Epoch   236: reducing learning rate of group 0 to 1.0290e-03.
Epoch 250/1100, Loss 0.9215016961097717
Fold 0, Valid Loss 0.9239925146102905 

Epoch   287: reducing learning rate of group 0 to 7.2030e-04.
Epoch 300/1100, Loss 0.9207387566566467
Fold 0, Valid Loss 0.9225426912307739 

Epoch 350/1100, Loss 0.92048180103302
Fold 0, Valid Loss 0.9231070280075073 

Epoch   368: reducing learning rate of group 0 to 5.0421e-04.
Epoch 400/1100, Loss 0.9200661778450012
Fold 0, Valid L

In [29]:
CONFIG.upload_to_kaggle("osicqrmodel", "OSIC QR Model", new=False)

In [31]:
models = []
for fold in range(K_FOLDS):
    model = PulmonaryModel(len(FV))
    model = model.to(DEVICE)
    checkpoint = torch.load(os.path.join(CONFIG.CFG.DATA.MODELS_OUT, f"model_fold_{fold}.pt"))
    model.load_state_dict(checkpoint['model_state_dict'])
    models.append(model)

In [32]:
test_dataset = PulmonaryDataset(sub_df, FV)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=4
)

In [33]:
avg_preds = np.zeros((len(test_dataset), len(QUANTILES)))
with torch.no_grad():
    for model in models:
        preds = []
        for j, test_data in enumerate(test_data_loader):
            features = test_data['features']
            targets = test_data['target']

            features = features.to(DEVICE).float()
            targets = targets.to(DEVICE).float()

            out = model(features)
            preds.append(out)
        preds = torch.cat(preds, dim=0).cpu().numpy()
        avg_preds += preds
avg_preds /= len(models)

In [34]:
avg_preds

array([[0.38832836, 0.39305794, 0.39629872],
       [0.38851438, 0.39323218, 0.39649348],
       [0.38869387, 0.39340117, 0.39668609],
       ...,
       [0.37185209, 0.37842799, 0.38056577],
       [0.37173871, 0.37830186, 0.38043022],
       [0.37163655, 0.37818847, 0.38030535]])

In [35]:
# inverse the scaling operation for FVC
avg_preds -= MIN_MAX_SCALER.min_[SCALE_COLUMNS.index('FVC')]
avg_preds /= MIN_MAX_SCALER.scale_[SCALE_COLUMNS.index('FVC')]

In [36]:
avg_preds[:100]

array([[2990.76559713, 3017.11882179, 3035.17645922],
       [2991.80210133, 3018.08973286, 3036.26168499],
       [2992.8022387 , 3019.031318  , 3037.33488812],
       [2993.78653409, 3019.95898743, 3038.39929016],
       [2994.60254576, 3020.72235854, 3039.28617561],
       [2995.2199182 , 3021.29542656, 3039.96880906],
       [2995.85748336, 3021.88516686, 3040.66442828],
       [2996.53154819, 3022.50599332, 3041.3838603 ],
       [2997.20561302, 3023.12672014, 3042.10309305],
       [2997.87977748, 3023.74751339, 3042.82255828],
       [2998.55384231, 3024.36824021, 3043.54192388],
       [2999.20621989, 3024.96937213, 3044.2386723 ],
       [2999.58808811, 3025.31102097, 3044.66517706],
       [2999.82980292, 3025.51035764, 3044.95846965],
       [3000.07148452, 3025.70949504, 3045.25186188],
       [3000.31329896, 3025.90896456, 3045.54525411],
       [3000.55494735, 3026.1082016 , 3045.83851349],
       [3000.79672859, 3026.30763791, 3046.13190572],
       [3001.10842047, 3026.