In [1]:
import sys, os
if os.path.abspath(os.pardir) not in sys.path:
    sys.path.insert(0, os.path.abspath(os.pardir))
import CONFIG
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F
from sklearn import preprocessing

In [3]:
DATA_DIR = CONFIG.CFG.DATA.BASE
K_FOLDS = 5
BATCH_SIZE = 32
LEARNING_RATE = 3e-3
NUM_EPOCHS = 1000
ES_PATIENCE = 20
QUANTILES = (0.2, 0.5, 0.8)
SCALE_COLUMNS = ['Weeks', 'FVC', 'Percent', 'Age']
SEX_COLUMNS = ['Male', 'Female']
SMOKING_STATUS_COLUMNS = ['Currently smokes', 'Ex-smoker', 'Never smoked']
FV = SEX_COLUMNS + SMOKING_STATUS_COLUMNS + SCALE_COLUMNS
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
MIN_MAX_SCALER = preprocessing.MinMaxScaler()

In [45]:
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
train_df.drop_duplicates(keep=False, inplace=True, subset=['Patient', 'Weeks'])
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [46]:
train_df['Weeks'] = train_df['Weeks'] + 13
train_df[SCALE_COLUMNS] = MIN_MAX_SCALER.fit_transform(train_df[SCALE_COLUMNS])

In [47]:
# convert categoricals into dummies
train_df['Sex'] = pd.Categorical(train_df['Sex'], categories=SEX_COLUMNS)
train_df['SmokingStatus'] = pd.Categorical(train_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)
train_df = train_df.join(pd.get_dummies(train_df['Sex']))
train_df = train_df.join(pd.get_dummies(train_df['SmokingStatus']))

In [48]:
# REMOVE THE ONES FROM THE TRAIN_DF THAT ARE PRESENT IN TEST_DF AS WELL
TEST_PATIENTS = test_df['Patient'].unique().tolist()
valid_df = train_df[train_df['Patient'].isin(TEST_PATIENTS)]
train_df = train_df[~train_df['Patient'].isin(TEST_PATIENTS)]

In [49]:
valid_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
1504,ID00419637202311204720264,0.07971,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
1505,ID00419637202311204720264,0.086957,0.364681,0.302311,0.615385,Male,Ex-smoker,1,0,0,1,0
1506,ID00419637202311204720264,0.101449,0.351041,0.288097,0.615385,Male,Ex-smoker,1,0,0,1,0
1507,ID00419637202311204720264,0.108696,0.339555,0.276128,0.615385,Male,Ex-smoker,1,0,0,1,0
1508,ID00419637202311204720264,0.130435,0.342965,0.279682,0.615385,Male,Ex-smoker,1,0,0,1,0


In [50]:
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00007637202177411956430,0.007246,0.26705,0.236393,0.769231,Male,Ex-smoker,1,0,0,1,0
1,ID00007637202177411956430,0.072464,0.248923,0.215941,0.769231,Male,Ex-smoker,1,0,0,1,0
2,ID00007637202177411956430,0.086957,0.221464,0.18496,0.769231,Male,Ex-smoker,1,0,0,1,0
3,ID00007637202177411956430,0.101449,0.23636,0.201767,0.769231,Male,Ex-smoker,1,0,0,1,0
4,ID00007637202177411956430,0.115942,0.2229,0.18658,0.769231,Male,Ex-smoker,1,0,0,1,0


In [51]:
sub_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
sub_df['Patient'] = sub_df['Patient_Week'].apply(lambda x: x.split('_')[0])
sub_df['Weeks'] = sub_df['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub_df.head()

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12


In [52]:
sub_df = sub_df.drop('FVC', axis=1).merge(test_df.drop('Weeks', axis=1), on='Patient')
sub_df['Weeks'] = sub_df['Weeks'] + 13

In [53]:
# have to make it categorical coz sub's sex column has males only
sub_df['Sex'] = pd.Categorical(sub_df['Sex'], categories=SEX_COLUMNS)
sub_df['SmokingStatus'] = pd.Categorical(sub_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)

In [54]:
sub_df = sub_df.join(pd.get_dummies(sub_df['Sex']))
sub_df = sub_df.join(pd.get_dummies(sub_df['SmokingStatus']))

In [55]:
sub_df[SCALE_COLUMNS] = MIN_MAX_SCALER.transform(sub_df[SCALE_COLUMNS])
sub_df.head()

Unnamed: 0,Patient_Week,Confidence,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00419637202311204720264_-12,100,ID00419637202311204720264,-0.050725,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
1,ID00419637202311204720264_-11,100,ID00419637202311204720264,-0.043478,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
2,ID00419637202311204720264_-10,100,ID00419637202311204720264,-0.036232,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
3,ID00419637202311204720264_-9,100,ID00419637202311204720264,-0.028986,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
4,ID00419637202311204720264_-8,100,ID00419637202311204720264,-0.021739,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0


In [56]:
class PulmonaryDataset(Dataset):
    def __init__(self, df, FV, test=False):
        self.df = df
        self.test = test
        self.FV = FV

    def __getitem__(self, idx):
        return {
            'features': torch.tensor(self.df[self.FV].iloc[idx].values),
            'target': torch.tensor(self.df['FVC'].iloc[idx])
        }

    def __len__(self):
        return len(self.df)

In [57]:
class PulmonaryModel(nn.Module):
    def __init__(self, in_features=9, out_quantiles=3):
        super(PulmonaryModel, self).__init__()
        self.fc1 = nn.Linear(in_features, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, out_quantiles)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [58]:
def quantile_loss(preds, target, quantiles):
    assert not target.requires_grad
    assert preds.size(0) == target.size(0)
    losses = []
    for i, q in enumerate(quantiles):
        errors = target - preds[:, i]
        losses.append(torch.max((q - 1) * errors, q * errors).unsqueeze(1))
    loss = torch.mean(torch.sum(torch.cat(losses, dim=1), dim=1))
    return loss

In [59]:
train_dataset = PulmonaryDataset(train_df, FV)
valid_dataset = PulmonaryDataset(valid_df, FV)
test_dataset = PulmonaryDataset(sub_df, FV)

train_data_loader = DataLoader(
    train_dataset,
    batch_size=10,
    drop_last=False,
    num_workers=2
)
valid_data_loader = DataLoader(
    valid_dataset,
    batch_size=4,
    drop_last=False,
    num_workers=2
)
test_data_loader = DataLoader(
    test_dataset,
    batch_size=10,
    drop_last=False,
    num_workers=2
)

In [60]:
model = PulmonaryModel(len(FV))
model.to(DEVICE)

PulmonaryModel(
  (fc1): Linear(in_features=9, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=3, bias=True)
)

In [61]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

In [29]:
lowest_val_score = float('inf')
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for i, data in enumerate(train_data_loader):
        features = data['features']
        targets = data['target']

        features = features.to(DEVICE).float()
        targets = targets.to(DEVICE).float()

        model.zero_grad()
        out = model(features)
        loss = quantile_loss(out, targets, QUANTILES)
        total_loss += loss
        loss.backward()
        optimizer.step()
        # scheduler.step()
    
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for i, data in enumerate(valid_data_loader):
            features = data['features']
            targets = data['target']

            features = features.to(DEVICE).float()
            targets = targets.to(DEVICE).float()

            out = model(features)
            loss = quantile_loss(out, targets, QUANTILES)
            val_loss += loss
    
    if val_loss < lowest_val_score:
        lowest_val_score = val_loss
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, os.path.join(CONFIG.CFG.DATA.MODELS_OUT, "best_model.pt"))

    print(f"Epoch {epoch}, Loss {total_loss/len(train_data_loader)} \n Val Loss, {val_loss}")

Epoch 0, Loss 0.5159505605697632
Epoch 1, Loss 0.4561516344547272
Epoch 2, Loss 0.3952631950378418
Epoch 3, Loss 0.335001140832901
Epoch 4, Loss 0.2765418291091919
Epoch 5, Loss 0.22670376300811768
Epoch 6, Loss 0.1882045865058899
Epoch 7, Loss 0.1611226350069046
Epoch 8, Loss 0.14424265921115875
Epoch 9, Loss 0.1341153234243393
Epoch 10, Loss 0.12787394225597382
Epoch 11, Loss 0.12322951853275299
Epoch 12, Loss 0.11960865557193756
Epoch 13, Loss 0.11661361157894135
Epoch 14, Loss 0.11418834328651428
Epoch 15, Loss 0.11215799301862717
Epoch 16, Loss 0.11032843589782715
Epoch 17, Loss 0.10858705639839172
Epoch 18, Loss 0.10693690925836563
Epoch 19, Loss 0.10538594424724579
Epoch 20, Loss 0.10389021039009094
Epoch 21, Loss 0.10242750495672226
Epoch 22, Loss 0.10097720474004745
Epoch 23, Loss 0.09956229478120804
Epoch 24, Loss 0.09818651527166367
Epoch 25, Loss 0.09683026373386383
Epoch 26, Loss 0.09546288102865219
Epoch 27, Loss 0.09407874196767807
Epoch 28, Loss 0.09270090609788895
Epoc

In [30]:
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, os.path.join(CONFIG.CFG.DATA.MODELS_OUT, "model.pt"))

In [31]:
CONFIG.upload_to_kaggle("osicqrmodel", "OSIC QR Model", new=True)

In [32]:
preds = []
with torch.no_grad():
    for j, test_data in enumerate(test_data_loader):
        features = test_data['features']
        targets = test_data['target']

        features = features.to(DEVICE).float()
        targets = targets.to(DEVICE).float()

        out = model(features)
        for ou in out.cpu().numpy().tolist():
            preds.append(ou)

In [33]:
FVC_MIN = MIN_MAX_SCALER.data_min_[1]
FVC_MAX = MIN_MAX_SCALER.data_max_[1]

In [34]:
inv_preds = []
for a in preds:
    b = []
    for val in a:
        z = (FVC_MAX - FVC_MIN) * val + FVC_MIN
        b.append(z)
    inv_preds.append(b)

In [35]:
inv_preds

[[3018.124007344246, 3021.826946735382, 3019.9815381765366],
 [3018.117198944092, 3021.8199722766876, 3019.98319876194],
 [3018.110556602478, 3021.8126657009125, 3019.9845272302628],
 [3018.103748202324, 3021.8053591251373, 3019.9858556985855],
 [3018.09693980217, 3021.7982186079025, 3019.9871841669083],
 [3018.0901314020157, 3021.7909120321274, 3019.9888447523117],
 [3018.083489060402, 3021.783937573433, 3019.9901732206345],
 [3018.0765146017075, 3021.776630997658, 3019.9916677474976],
 [3018.0698722600937, 3021.769490480423, 3019.9931622743607],
 [3018.0630638599396, 3021.762183904648, 3019.9946568012238],
 [3018.056421518326, 3021.755043387413, 3019.996151328087],
 [3018.0496131181717, 3021.747736811638, 3019.9974797964096],
 [3018.042970776558, 3021.740596294403, 3019.9989743232727],
 [3018.036328434944, 3021.733289718628, 3020.000468850136],
 [3018.0291879177094, 3021.725983142853, 3020.0017973184586],
 [3018.022711634636, 3021.7186765670776, 3020.003457903862],
 [3018.01573717594