In [3]:
import sys, os
if os.path.abspath(os.pardir) not in sys.path:
    sys.path.insert(0, os.path.abspath(os.pardir))
import CONFIG
%reload_ext autoreload
%autoreload 2

In [4]:
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import preprocessing
from sklearn import model_selection
from tqdm.autonotebook import tqdm

In [142]:
DATA_DIR = CONFIG.CFG.DATA.BASE
K_FOLDS = 5
BATCH_SIZE = 32
LEARNING_RATE = 5e-4
NUM_EPOCHS = 1100
# print loss every
PRINT_EVERY = 50

QUANTILES = [0.2, 0.5, 0.8]
SCALE_COLUMNS = ['Weeks', 'FVC', 'Percent', 'Age'] #'Percent'
SCALE_COLUMNS = ['Weeks_Passed', 'Base_FVC', 'Base_Percent', 'Base_Age']
SEX_COLUMNS = ['Male', 'Female']
SMOKING_STATUS_COLUMNS = ['Currently smokes', 'Ex-smoker', 'Never smoked']
FV = SEX_COLUMNS + SMOKING_STATUS_COLUMNS + SCALE_COLUMNS
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [143]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(2020)

In [144]:
kf = model_selection.KFold(K_FOLDS)
MIN_MAX_SCALER = preprocessing.MinMaxScaler()

In [145]:
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sub_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
# remove the duplicates from the train_df
train_df.drop_duplicates(keep=False, inplace=True, subset=['Patient', 'Weeks'])

In [146]:
# extract the Patient and weeks from the Patient_Week column
sub_df['Patient'] = sub_df['Patient_Week'].apply(lambda x: x.split('_')[0])
sub_df['Weeks'] = sub_df['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub_df.head()

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12


In [147]:
# merge the sub_df with the test_df
sub_df = sub_df.drop('FVC', axis=1).merge(test_df.drop('Weeks', axis=1), on='Patient')
sub_df.head()

Unnamed: 0,Patient_Week,Confidence,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264_-12,100,ID00419637202311204720264,-12,3020,70.186855,73,Male,Ex-smoker
1,ID00419637202311204720264_-11,100,ID00419637202311204720264,-11,3020,70.186855,73,Male,Ex-smoker
2,ID00419637202311204720264_-10,100,ID00419637202311204720264,-10,3020,70.186855,73,Male,Ex-smoker
3,ID00419637202311204720264_-9,100,ID00419637202311204720264,-9,3020,70.186855,73,Male,Ex-smoker
4,ID00419637202311204720264_-8,100,ID00419637202311204720264,-8,3020,70.186855,73,Male,Ex-smoker


In [148]:
train_df['FROM'] = 'train'
test_df['FROM'] = 'val'
sub_df['FROM'] = 'test'

In [149]:
sub_df.shape

(730, 10)

In [150]:
test_df.shape

(5, 8)

In [151]:
combined_df = train_df.append([test_df, sub_df])

In [152]:
# initialize base_week column
combined_df['Base_Week'] = combined_df['Weeks']
# make the weeks from sub_df to be np.nan so that when we calculate the base_week it comes from the test_df
combined_df.loc[combined_df['FROM'] == 'test', 'Base_Week'] = np.nan
# now calculate the min for each patient group and set it to the Base_Week column
combined_df['Base_Week'] = combined_df.groupby('Patient')['Base_Week'].transform('min')

In [153]:
# get the base_df (where the Base_Week == the min_week we calculated) so that we can get the base_fvc, base_age and base_percentage
base_df = combined_df[combined_df['Weeks'] == combined_df['Base_Week']]

In [154]:
base_df.rename(columns={
    'FVC': 'Base_FVC',
    'Percent': 'Base_Percent',
    'Age': 'Base_Age'
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [155]:
combined_df = combined_df.merge(base_df[['Patient', 'Base_FVC', 'Base_Percent', 'Base_Age']], on='Patient', how='left')

In [156]:
combined_df.tail()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,FROM,Patient_Week,Confidence,Base_Week,Base_FVC,Base_Percent,Base_Age
3821,ID00426637202313170790466,132,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_132,100.0,0.0,2925,71.824968,73
3822,ID00426637202313170790466,132,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_132,100.0,0.0,2925,71.824968,73
3823,ID00426637202313170790466,133,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_133,100.0,0.0,2925,71.824968,73
3824,ID00426637202313170790466,133,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_133,100.0,0.0,2925,71.824968,73
3825,ID00426637202313170790466,133,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_133,100.0,0.0,2925,71.824968,73


In [157]:
combined_df['Weeks_Passed'] = combined_df['Weeks'] - combined_df['Base_Week']

In [158]:
combined_df.tail()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,FROM,Patient_Week,Confidence,Base_Week,Base_FVC,Base_Percent,Base_Age,Weeks_Passed
3821,ID00426637202313170790466,132,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_132,100.0,0.0,2925,71.824968,73,132.0
3822,ID00426637202313170790466,132,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_132,100.0,0.0,2925,71.824968,73,132.0
3823,ID00426637202313170790466,133,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_133,100.0,0.0,2925,71.824968,73,133.0
3824,ID00426637202313170790466,133,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_133,100.0,0.0,2925,71.824968,73,133.0
3825,ID00426637202313170790466,133,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_133,100.0,0.0,2925,71.824968,73,133.0


In [159]:
MIN_MAX_SCALER.fit(combined_df[combined_df['FROM'] == 'train'][['Weeks_Passed', 'FVC', 'Percent', 'Age']])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [160]:
combined_df.tail()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,FROM,Patient_Week,Confidence,Base_Week,Base_FVC,Base_Percent,Base_Age,Weeks_Passed
3821,ID00426637202313170790466,132,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_132,100.0,0.0,2925,71.824968,73,132.0
3822,ID00426637202313170790466,132,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_132,100.0,0.0,2925,71.824968,73,132.0
3823,ID00426637202313170790466,133,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_133,100.0,0.0,2925,71.824968,73,133.0
3824,ID00426637202313170790466,133,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_133,100.0,0.0,2925,71.824968,73,133.0
3825,ID00426637202313170790466,133,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_133,100.0,0.0,2925,71.824968,73,133.0


In [161]:
combined_df[['Weeks_Passed', 'Base_FVC', 'Base_Percent', 'Base_Age']] = MIN_MAX_SCALER.transform(combined_df[['Weeks_Passed', 'Base_FVC', 'Base_Percent', 'Base_Age']])

In [162]:
# convert categoricals into dummies
combined_df['Sex'] = pd.Categorical(combined_df['Sex'], categories=SEX_COLUMNS)
combined_df['SmokingStatus'] = pd.Categorical(combined_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)
combined_df = combined_df.join(pd.get_dummies(combined_df['Sex']))
combined_df = combined_df.join(pd.get_dummies(combined_df['SmokingStatus']))

In [163]:
combined_df.drop_duplicates(inplace=True)

In [164]:
combined_df.reset_index(drop=True)

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,FROM,Patient_Week,Confidence,Base_Week,Base_FVC,Base_Percent,Base_Age,Weeks_Passed,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,train,,,-4.0,0.267050,0.236393,0.769231,0.000000,1,0,0,1,0
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,train,,,-4.0,0.267050,0.236393,0.769231,0.142857,1,0,0,1,0
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,train,,,-4.0,0.267050,0.236393,0.769231,0.174603,1,0,0,1,0
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,train,,,-4.0,0.267050,0.236393,0.769231,0.206349,1,0,0,1,0
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,train,,,-4.0,0.267050,0.236393,0.769231,0.238095,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2265,ID00426637202313170790466,129,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_129,100.0,0.0,0.376525,0.345604,0.615385,2.047619,1,0,0,0,1
2266,ID00426637202313170790466,130,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_130,100.0,0.0,0.376525,0.345604,0.615385,2.063492,1,0,0,0,1
2267,ID00426637202313170790466,131,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_131,100.0,0.0,0.376525,0.345604,0.615385,2.079365,1,0,0,0,1
2268,ID00426637202313170790466,132,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_132,100.0,0.0,0.376525,0.345604,0.615385,2.095238,1,0,0,0,1


In [165]:
class PulmonaryDataset(torch.utils.data.Dataset):
    def __init__(self, df, FV, test=False):
        self.df = df
        self.test = test
        self.FV = FV

    def __getitem__(self, idx):
        return {
            'features': torch.tensor(self.df[self.FV].iloc[idx].values),
            'target': torch.tensor(self.df['FVC'].iloc[idx])
        }

    def __len__(self):
        return len(self.df)

In [166]:
class PulmonaryModel(nn.Module):
    def __init__(self, in_features=9, out_quantiles=3):
        super(PulmonaryModel, self).__init__()
        self.fc1 = nn.Linear(in_features, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, out_quantiles)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [167]:
clip_one, clip_two = torch.tensor(70, dtype=torch.float32).to(DEVICE), torch.tensor(1000, dtype=torch.float32).to(DEVICE)
def score(y_true, y_pred):
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = torch.max(sigma, clip_one)
    delta = torch.abs(y_true - fvc_pred)
    delta = torch.min(delta, clip_two)
    sq2 = torch.sqrt(torch.tensor(2, dtype=torch.float32))
    metric = (delta / sigma_clip) * sq2 + torch.log(sigma_clip * sq2)
    return torch.mean(metric)

In [168]:
def quantile_loss(preds, target, quantiles, _lambda):
    assert not target.requires_grad
    assert preds.size(0) == target.size(0)
    losses = []
    for i, q in enumerate(quantiles):
        errors = target - preds[:, i]
        losses.append(torch.max((q - 1) * errors, q * errors).unsqueeze(1))
    loss = torch.mean(torch.sum(torch.cat(losses, dim=1), dim=1))
    # return loss
    return _lambda * loss + (1 - _lambda) * score(target, preds)

In [169]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [170]:
def train_one_epoch(model, train_data_loader, optimizer, train_loss):
    model.train()
    for i, data in enumerate(train_data_loader):
        features = data['features']
        targets = data['target']

        features = features.to(DEVICE).float()
        targets = targets.to(DEVICE).float()

        model.zero_grad()
        out = model(features)
        loss = quantile_loss(out, targets, QUANTILES, 0.8)
        train_loss.update(loss, features.size(0))
        loss.backward()
        optimizer.step()

In [171]:
def eval_one_epoch(model, valid_data_loader, valid_loss, lr_scheduler):
    model.eval()

    with torch.no_grad():
        for i, data in enumerate(valid_data_loader):
            features = data['features']
            targets = data['target']

            features = features.to(DEVICE).float()
            targets = targets.to(DEVICE).float()
            
            out = model(features)
            loss = quantile_loss(out, targets, QUANTILES, 0.8)
            valid_loss.update(loss, features.size(0))
    
    if lr_scheduler is not None:
        lr_scheduler.step(valid_loss.avg)

In [172]:
new_train_df = combined_df[combined_df['FROM'] == 'train'].reset_index(drop=True)
TRAIN_PATIENTS = new_train_df['Patient'].unique().tolist()

In [173]:
for fold, (train_index, test_index) in enumerate(kf.split(TRAIN_PATIENTS)):
    model = PulmonaryModel(len(FV))
    model = model.to(DEVICE)

    df_train = new_train_df.iloc[train_index].reset_index(drop=True)
    df_valid = new_train_df.iloc[test_index].reset_index(drop=True)

    train_dataset = PulmonaryDataset(df_train, FV)
    valid_dataset = PulmonaryDataset(df_valid, FV)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=10,
        shuffle=True,
        num_workers=4
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=4,
        shuffle=False,
        num_workers=4
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=50, factor=0.7, verbose=True)

    best_valid_loss = float('inf')

    
    # tq = tqdm(range(NUM_EPOCHS), desc=f"Fold {fold}")
    for epoch in range(NUM_EPOCHS):
        train_loss = AverageMeter()
        valid_loss = AverageMeter()

        train_one_epoch(model, train_data_loader, optimizer, train_loss)
        eval_one_epoch(model, valid_data_loader, valid_loss, lr_scheduler)

        if epoch % PRINT_EVERY == 0:
            print(f"Epoch {epoch}/{NUM_EPOCHS}, Loss {train_loss.avg}")
            print(f"Fold {fold}, Valid Loss {valid_loss.avg} \n")
        
        # tq.set_postfix(val_loss=valid_loss.avg.item())

        if valid_loss.avg < best_valid_loss:
            best_valid_loss = valid_loss.avg
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, os.path.join(CONFIG.CFG.DATA.MODELS_OUT, f"model_fold_{fold}.pt"))

Epoch 0/1100, Loss 3412.908447265625
Fold 0, Valid Loss 3645.0712890625 

Epoch 50/1100, Loss 523.5621337890625
Fold 0, Valid Loss 499.52630615234375 

Epoch 100/1100, Loss 336.97406005859375
Fold 0, Valid Loss 436.2543029785156 

Epoch 150/1100, Loss 250.77784729003906
Fold 0, Valid Loss 377.6388854980469 

Epoch 200/1100, Loss 212.69961547851562
Fold 0, Valid Loss 339.52288818359375 

Epoch 250/1100, Loss 188.67098999023438
Fold 0, Valid Loss 311.8796691894531 

Epoch 300/1100, Loss 176.89234924316406
Fold 0, Valid Loss 290.4786682128906 

Epoch 350/1100, Loss 168.71542358398438
Fold 0, Valid Loss 272.6313781738281 

Epoch 400/1100, Loss 161.38421630859375
Fold 0, Valid Loss 263.6256408691406 

Epoch 450/1100, Loss 156.77923583984375
Fold 0, Valid Loss 257.1876220703125 

Epoch 500/1100, Loss 153.37620544433594
Fold 0, Valid Loss 252.1453399658203 

Epoch 550/1100, Loss 149.85556030273438
Fold 0, Valid Loss 249.04139709472656 

Epoch 600/1100, Loss 146.71820068359375
Fold 0, Valid Lo

In [174]:
CONFIG.upload_to_kaggle("osicqrmodel", "OSIC QR Model", new=False)

In [39]:
models = []
for fold in range(K_FOLDS):
    model = PulmonaryModel(len(FV))
    model = model.to(DEVICE)
    checkpoint = torch.load(os.path.join(CONFIG.CFG.DATA.MODELS_OUT, f"model_fold_{fold}.pt"))
    model.load_state_dict(checkpoint['model_state_dict'])
    models.append(model)

In [134]:
new_test_df = combined_df[combined_df['FROM'] == 'test'].reset_index(drop=True)
test_dataset = PulmonaryDataset(new_test_df, FV)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=4
)

In [135]:
new_test_df.shape

(730, 20)

In [136]:
avg_preds = np.zeros((len(test_dataset), len(QUANTILES)))
with torch.no_grad():
    for model in models:
        preds = []
        for j, test_data in enumerate(test_data_loader):
            features = test_data['features']
            targets = test_data['target']

            features = features.to(DEVICE).float()
            targets = targets.to(DEVICE).float()

            out = model(features)
            preds.append(out)
        preds = torch.cat(preds, dim=0).cpu().numpy()
        avg_preds += preds
avg_preds /= len(models)

In [141]:
avg_preds

array([[2898.08457031, 3142.53564453, 3377.50253906],
       [2892.51005859, 3136.5050293 , 3371.03310547],
       [2886.93530273, 3130.47456055, 3364.56391602],
       ...,
       [2028.98952637, 2202.49018555, 2369.36118164],
       [2023.7012207 , 2196.76708984, 2363.21899414],
       [2018.41291504, 2191.04406738, 2357.07675781]])

In [57]:
sub_df.shape

(730, 10)

In [138]:
temp_df = new_test_df

In [139]:
temp_df['FVC'] = avg_preds[:, 1]
temp_df['Confidence'] = np.abs(avg_preds[:, 2] - avg_preds[:, 0])

In [140]:
temp_df.to_csv("view.csv", index=False)

In [48]:
# # inverse the scaling operation for FVC
# avg_preds -= MIN_MAX_SCALER.min_[SCALE_COLUMNS.index('FVC')]
# avg_preds /= MIN_MAX_SCALER.scale_[SCALE_COLUMNS.index('FVC')]