In [1]:
import sys, os
if os.path.abspath(os.pardir) not in sys.path:
    sys.path.insert(0, os.path.abspath(os.pardir))
import CONFIG
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F
from sklearn import preprocessing

In [48]:
DATA_DIR = CONFIG.CFG.DATA.BASE
K_FOLDS = 5
BATCH_SIZE = 32
LEARNING_RATE = 3e-3
NUM_EPOCHS = 1000
ES_PATIENCE = 20
QUANTILES = (0.2, 0.5, 0.8)
SCALE_COLUMNS = ['Weeks', 'FVC', 'Percent', 'Age']
SEX_COLUMNS = ['Male', 'Female']
SMOKING_STATUS_COLUMNS = ['Currently smokes', 'Ex-smoker', 'Never smoked']
FV = SEX_COLUMNS + SMOKING_STATUS_COLUMNS + SCALE_COLUMNS
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [49]:
MIN_MAX_SCALER = preprocessing.MinMaxScaler()

In [50]:
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
train_df.drop_duplicates(keep=False, inplace=True, subset=['Patient', 'Weeks'])
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [51]:
train_df[SCALE_COLUMNS] = MIN_MAX_SCALER.fit_transform(train_df[SCALE_COLUMNS])

In [52]:
# convert categoricals into dummies
train_df['Sex'] = pd.Categorical(train_df['Sex'], categories=SEX_COLUMNS)
train_df['SmokingStatus'] = pd.Categorical(train_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)
train_df = train_df.join(pd.get_dummies(train_df['Sex']))
train_df = train_df.join(pd.get_dummies(train_df['SmokingStatus']))

In [53]:
# REMOVE THE ONES FROM THE TRAIN_DF THAT ARE PRESENT IN TEST_DF AS WELL
TEST_PATIENTS = test_df['Patient'].unique().tolist()
valid_df = train_df[train_df['Patient'].isin(TEST_PATIENTS)]
train_df = train_df[~train_df['Patient'].isin(TEST_PATIENTS)]

In [54]:
valid_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
1504,ID00419637202311204720264,0.07971,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
1505,ID00419637202311204720264,0.086957,0.364681,0.302311,0.615385,Male,Ex-smoker,1,0,0,1,0
1506,ID00419637202311204720264,0.101449,0.351041,0.288097,0.615385,Male,Ex-smoker,1,0,0,1,0
1507,ID00419637202311204720264,0.108696,0.339555,0.276128,0.615385,Male,Ex-smoker,1,0,0,1,0
1508,ID00419637202311204720264,0.130435,0.342965,0.279682,0.615385,Male,Ex-smoker,1,0,0,1,0


In [55]:
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00007637202177411956430,0.007246,0.26705,0.236393,0.769231,Male,Ex-smoker,1,0,0,1,0
1,ID00007637202177411956430,0.072464,0.248923,0.215941,0.769231,Male,Ex-smoker,1,0,0,1,0
2,ID00007637202177411956430,0.086957,0.221464,0.18496,0.769231,Male,Ex-smoker,1,0,0,1,0
3,ID00007637202177411956430,0.101449,0.23636,0.201767,0.769231,Male,Ex-smoker,1,0,0,1,0
4,ID00007637202177411956430,0.115942,0.2229,0.18658,0.769231,Male,Ex-smoker,1,0,0,1,0


In [56]:
sub_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
sub_df['Patient'] = sub_df['Patient_Week'].apply(lambda x: x.split('_')[0])
sub_df['Weeks'] = sub_df['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub_df.head()

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12


In [57]:
sub_df = sub_df.drop('FVC', axis=1).merge(test_df.drop('Weeks', axis=1), on='Patient')

In [58]:
# have to make it categorical coz sub's sex column has males only
sub_df['Sex'] = pd.Categorical(sub_df['Sex'], categories=SEX_COLUMNS)
sub_df['SmokingStatus'] = pd.Categorical(sub_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)

In [59]:
sub_df = sub_df.join(pd.get_dummies(sub_df['Sex']))
sub_df = sub_df.join(pd.get_dummies(sub_df['SmokingStatus']))

In [60]:
sub_df[SCALE_COLUMNS] = MIN_MAX_SCALER.transform(sub_df[SCALE_COLUMNS])
sub_df.head()

Unnamed: 0,Patient_Week,Confidence,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00419637202311204720264_-12,100,ID00419637202311204720264,-0.050725,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
1,ID00419637202311204720264_-11,100,ID00419637202311204720264,-0.043478,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
2,ID00419637202311204720264_-10,100,ID00419637202311204720264,-0.036232,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
3,ID00419637202311204720264_-9,100,ID00419637202311204720264,-0.028986,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
4,ID00419637202311204720264_-8,100,ID00419637202311204720264,-0.021739,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0


In [61]:
class PulmonaryDataset(Dataset):
    def __init__(self, df, FV, test=False):
        self.df = df
        self.test = test
        self.FV = FV

    def __getitem__(self, idx):
        return {
            'features': torch.tensor(self.df[self.FV].iloc[idx].values),
            'target': torch.tensor(self.df['FVC'].iloc[idx])
        }

    def __len__(self):
        return len(self.df)

In [62]:
class PulmonaryModel(nn.Module):
    def __init__(self, in_features=9, out_quantiles=3):
        super(PulmonaryModel, self).__init__()
        self.fc1 = nn.Linear(in_features, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, out_quantiles)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [63]:
def quantile_loss(preds, target, quantiles):
    assert not target.requires_grad
    assert preds.size(0) == target.size(0)
    losses = []
    for i, q in enumerate(quantiles):
        errors = target - preds[:, i]
        losses.append(torch.max((q - 1) * errors, q * errors).unsqueeze(1))
    loss = torch.mean(torch.sum(torch.cat(losses, dim=1), dim=1))
    return loss

In [64]:
train_dataset = PulmonaryDataset(train_df, FV)
valid_dataset = PulmonaryDataset(valid_df, FV)
test_dataset = PulmonaryDataset(sub_df, FV)

train_data_loader = DataLoader(
    train_dataset,
    batch_size=10,
    drop_last=False,
    shuffle=True,
    num_workers=2
)
valid_data_loader = DataLoader(
    valid_dataset,
    batch_size=4,
    shuffle=False,
    drop_last=False,
    num_workers=2
)
test_data_loader = DataLoader(
    test_dataset,
    batch_size=10,
    drop_last=False,
    shuffle=False,
    num_workers=2
)

In [74]:
model = PulmonaryModel(len(FV))
model.to(DEVICE)

PulmonaryModel(
  (fc1): Linear(in_features=9, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=3, bias=True)
)

In [75]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, factor=0.05, verbose=True)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

In [76]:
lowest_val_score = float('inf')
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for i, data in enumerate(train_data_loader):
        features = data['features']
        targets = data['target']

        features = features.to(DEVICE).float()
        targets = targets.to(DEVICE).float()

        model.zero_grad()
        out = model(features)
        loss = quantile_loss(out, targets, QUANTILES)
        total_loss += loss
        loss.backward()
        optimizer.step()
        # scheduler.step()
    
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for i, data in enumerate(valid_data_loader):
            features = data['features']
            targets = data['target']

            features = features.to(DEVICE).float()
            targets = targets.to(DEVICE).float()

            out = model(features)
            loss = quantile_loss(out, targets, QUANTILES)
            val_loss += loss
    
    if val_loss < lowest_val_score:
        lowest_val_score = val_loss
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, os.path.join(CONFIG.CFG.DATA.MODELS_OUT, "best_model.pt"))

    if lr_scheduler is not None:
        lr_scheduler.step(val_loss)

    print(f"Epoch {epoch}, Loss {total_loss/len(train_data_loader)} \n Val Loss, {val_loss}")

Epoch 0, Loss 0.08648233115673065 
 Val Loss, 0.5228700637817383
Epoch 1, Loss 0.02328229509294033 
 Val Loss, 0.30478763580322266
Epoch 2, Loss 0.0176162701100111 
 Val Loss, 0.24272899329662323
Epoch 3, Loss 0.011500070802867413 
 Val Loss, 0.08962269127368927
Epoch 4, Loss 0.008095852099359035 
 Val Loss, 0.06807706505060196
Epoch 5, Loss 0.007904370315372944 
 Val Loss, 0.09128690510988235
Epoch 6, Loss 0.007913289591670036 
 Val Loss, 0.03876123204827309
Epoch 7, Loss 0.0073406752198934555 
 Val Loss, 0.11662084609270096
Epoch 8, Loss 0.006621276494115591 
 Val Loss, 0.0756915807723999
Epoch 9, Loss 0.005653091240674257 
 Val Loss, 0.027989469468593597
Epoch 10, Loss 0.005888788960874081 
 Val Loss, 0.04633215069770813
Epoch 11, Loss 0.005700851324945688 
 Val Loss, 0.025716617703437805
Epoch 12, Loss 0.004757427144795656 
 Val Loss, 0.036921191960573196
Epoch 13, Loss 0.004415890201926231 
 Val Loss, 0.03669958934187889
Epoch 14, Loss 0.004617221653461456 
 Val Loss, 0.0263845305

KeyboardInterrupt: 

In [81]:
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, os.path.join(CONFIG.CFG.DATA.MODELS_OUT, "model.pt"))

In [82]:
CONFIG.upload_to_kaggle("osicqrmodel", "OSIC QR Model", new=False)

In [83]:
preds = []
with torch.no_grad():
    for j, test_data in enumerate(test_data_loader):
        features = test_data['features']
        targets = test_data['target']

        features = features.to(DEVICE).float()
        targets = targets.to(DEVICE).float()

        out = model(features)
        for ou in out.cpu().numpy().tolist():
            preds.append(ou)

In [84]:
FVC_MIN = MIN_MAX_SCALER.data_min_[1]
FVC_MAX = MIN_MAX_SCALER.data_max_[1]

In [85]:
inv_preds = []
for a in preds:
    b = []
    for val in a:
        z = (FVC_MAX - FVC_MIN) * val + FVC_MIN
        b.append(z)
    inv_preds.append(b)

In [86]:
inv_preds

[[3013.62531542778, 3013.371910095215, 3014.1483998298645],
 [3014.3325587511063, 3014.234916329384, 3014.8511595726013],
 [3015.0399681329727, 3015.097922563553, 3015.5540853738785],
 [3015.747211456299, 3015.9607627391815, 3016.256679058075],
 [3016.4459857940674, 3016.8169605731964, 3016.9620957374573],
 [3016.9790337085724, 3017.5426363945007, 3017.7156693935394],
 [3017.5122476816177, 3018.2681461572647, 3018.468910932541],
 [3018.0192244052887, 3018.9631011486053, 3019.1942546367645],
 [3017.945162296295, 3018.9700756073, 3019.3018605709076],
 [3017.868443250656, 3018.9737288951874, 3019.4074738025665],
 [3017.792056322098, 3018.9772161245346, 3019.5130870342255],
 [3017.758844614029, 3019.004449725151, 3019.615545153618],
 [3017.8767461776733, 3019.1142144203186, 3019.706711292267],
 [3017.9946477413177, 3019.223813056946, 3019.797545313835],
 [3018.112549304962, 3019.333411693573, 3019.888545393944],
 [3018.2304508686066, 3019.4430103302, 3019.979379415512],
 [3018.348352432251