In [1]:
import sys, os
if os.path.abspath(os.pardir) not in sys.path:
    sys.path.insert(0, os.path.abspath(os.pardir))
import CONFIG
%reload_ext autoreload
%autoreload 2

In [88]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F
from sklearn import preprocessing

In [119]:
DATA_DIR = CONFIG.CFG.DATA.BASE
K_FOLDS = 5
BATCH_SIZE = 32
LEARNING_RATE = 3e-3
NUM_EPOCHS = 1000
ES_PATIENCE = 20
QUANTILES = (0.2, 0.5, 0.8)
SCALE_COLUMNS = ['Weeks', 'FVC', 'Percent', 'Age']
SEX_COLUMNS = ['Male', 'Female']
SMOKING_STATUS_COLUMNS = ['Currently smokes', 'Ex-smoker', 'Never smoked']
FV = SEX_COLUMNS + SMOKING_STATUS_COLUMNS + SCALE_COLUMNS
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [98]:
MIN_MAX_SCALER = preprocessing.MinMaxScaler()

In [99]:
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
train_df.drop_duplicates(keep=False, inplace=True, subset=['Patient', 'Weeks'])
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sub_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker


In [100]:
train_df['Weeks'] = train_df['Weeks'] + 13

In [101]:
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,9,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,18,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,20,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,22,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,24,2069,52.063412,79,Male,Ex-smoker


In [102]:
train_df[SCALE_COLUMNS] = MIN_MAX_SCALER.fit_transform(train_df[SCALE_COLUMNS])

In [103]:
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,0.007246,0.26705,0.236393,0.769231,Male,Ex-smoker
1,ID00007637202177411956430,0.072464,0.248923,0.215941,0.769231,Male,Ex-smoker
2,ID00007637202177411956430,0.086957,0.221464,0.18496,0.769231,Male,Ex-smoker
3,ID00007637202177411956430,0.101449,0.23636,0.201767,0.769231,Male,Ex-smoker
4,ID00007637202177411956430,0.115942,0.2229,0.18658,0.769231,Male,Ex-smoker


In [None]:
# REMOVE THE ONES FROM THE TRAIN_DF THAT ARE PRESENT IN TEST_DF AS WELL

In [104]:
# convert categoricals into dummies
train_df['Sex'] = pd.Categorical(train_df['Sex'], categories=SEX_COLUMNS)
train_df['SmokingStatus'] = pd.Categorical(train_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)
train_df = train_df.join(pd.get_dummies(train_df['Sex']))
train_df = train_df.join(pd.get_dummies(train_df['SmokingStatus']))

In [105]:
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00007637202177411956430,0.007246,0.26705,0.236393,0.769231,Male,Ex-smoker,1,0,0,1,0
1,ID00007637202177411956430,0.072464,0.248923,0.215941,0.769231,Male,Ex-smoker,1,0,0,1,0
2,ID00007637202177411956430,0.086957,0.221464,0.18496,0.769231,Male,Ex-smoker,1,0,0,1,0
3,ID00007637202177411956430,0.101449,0.23636,0.201767,0.769231,Male,Ex-smoker,1,0,0,1,0
4,ID00007637202177411956430,0.115942,0.2229,0.18658,0.769231,Male,Ex-smoker,1,0,0,1,0


In [77]:
test_df

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,6,3020,70.186855,73,Male,Ex-smoker
1,ID00421637202311550012437,15,2739,82.045291,68,Male,Ex-smoker
2,ID00422637202311677017371,6,1930,76.672493,73,Male,Ex-smoker
3,ID00423637202312137826377,17,3294,79.258903,72,Male,Ex-smoker
4,ID00426637202313170790466,0,2925,71.824968,73,Male,Never smoked


In [78]:
sub_df

Unnamed: 0,Patient_Week,FVC,Confidence
0,ID00419637202311204720264_-12,2000,100
1,ID00421637202311550012437_-12,2000,100
2,ID00422637202311677017371_-12,2000,100
3,ID00423637202312137826377_-12,2000,100
4,ID00426637202313170790466_-12,2000,100
...,...,...,...
725,ID00419637202311204720264_133,2000,100
726,ID00421637202311550012437_133,2000,100
727,ID00422637202311677017371_133,2000,100
728,ID00423637202312137826377_133,2000,100


In [79]:
sub_df['Patient'] = sub_df['Patient_Week'].apply(lambda x: x.split('_')[0])
sub_df['Weeks'] = sub_df['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub_df.head()

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12


In [80]:
sub_df = sub_df.drop('FVC', axis=1).merge(test_df.drop('Weeks', axis=1), on='Patient')
sub_df.head()

Unnamed: 0,Patient_Week,Confidence,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264_-12,100,ID00419637202311204720264,-12,3020,70.186855,73,Male,Ex-smoker
1,ID00419637202311204720264_-11,100,ID00419637202311204720264,-11,3020,70.186855,73,Male,Ex-smoker
2,ID00419637202311204720264_-10,100,ID00419637202311204720264,-10,3020,70.186855,73,Male,Ex-smoker
3,ID00419637202311204720264_-9,100,ID00419637202311204720264,-9,3020,70.186855,73,Male,Ex-smoker
4,ID00419637202311204720264_-8,100,ID00419637202311204720264,-8,3020,70.186855,73,Male,Ex-smoker


In [81]:
sub_df['Weeks'] = sub_df['Weeks'] + 13

In [82]:
sub_df.head()

Unnamed: 0,Patient_Week,Confidence,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264_-12,100,ID00419637202311204720264,1,3020,70.186855,73,Male,Ex-smoker
1,ID00419637202311204720264_-11,100,ID00419637202311204720264,2,3020,70.186855,73,Male,Ex-smoker
2,ID00419637202311204720264_-10,100,ID00419637202311204720264,3,3020,70.186855,73,Male,Ex-smoker
3,ID00419637202311204720264_-9,100,ID00419637202311204720264,4,3020,70.186855,73,Male,Ex-smoker
4,ID00419637202311204720264_-8,100,ID00419637202311204720264,5,3020,70.186855,73,Male,Ex-smoker


In [84]:
# have to make it categorical coz sub's sex column has males only
sub_df['Sex'] = pd.Categorical(sub_df['Sex'], categories=["Male", "Female"])
sub_df['SmokingStatus'] = pd.Categorical(sub_df['SmokingStatus'], categories=['Ex-smoker', 'Never smoked', 'Currently Smokes'])

In [85]:
sub_df = sub_df.join(pd.get_dummies(sub_df['Sex']))
sub_df = sub_df.join(pd.get_dummies(sub_df['SmokingStatus']))

In [86]:
sub_df.head()

Unnamed: 0,Patient_Week,Confidence,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Ex-smoker,Never smoked,Currently Smokes
0,ID00419637202311204720264_-12,100,ID00419637202311204720264,1,3020,70.186855,73,Male,Ex-smoker,1,0,1,0,0
1,ID00419637202311204720264_-11,100,ID00419637202311204720264,2,3020,70.186855,73,Male,Ex-smoker,1,0,1,0,0
2,ID00419637202311204720264_-10,100,ID00419637202311204720264,3,3020,70.186855,73,Male,Ex-smoker,1,0,1,0,0
3,ID00419637202311204720264_-9,100,ID00419637202311204720264,4,3020,70.186855,73,Male,Ex-smoker,1,0,1,0,0
4,ID00419637202311204720264_-8,100,ID00419637202311204720264,5,3020,70.186855,73,Male,Ex-smoker,1,0,1,0,0


In [108]:
class PulmonaryDataset(Dataset):
    def __init__(self, df, FV, test=False):
        self.df = df
        self.test = test
        self.FV = FV

    def __getitem__(self, idx):
        return {
            'features': torch.tensor(self.df[self.FV].iloc[idx].values),
            'target': torch.tensor(self.df['FVC'].iloc[idx])
        }

    def __len__(self):
        return len(self.df)

In [109]:
class PulmonaryModel(nn.Module):
    def __init__(self, in_features=9, out_quantiles=3):
        super(PulmonaryModel, self).__init__()
        self.fc1 = nn.Linear(in_features, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, out_quantiles)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [110]:
def quantile_loss(preds, target, quantiles):
    assert not target.requires_grad
    assert preds.size(0) == target.size(0)
    losses = []
    for i, q in enumerate(quantiles):
        errors = target - preds[:, i]
        losses.append(torch.max((q - 1) * errors, q * errors).unsqueeze(1))
    loss = torch.mean(torch.sum(torch.cat(losses, dim=1), dim=1))
    return loss

In [114]:
train_dataset = PulmonaryDataset(train_df, FV)
train_data_loader = DataLoader(
    train_dataset,
    batch_size=10,
    drop_last=False,
    num_workers=2
)

In [132]:
model = PulmonaryModel(len(FV))
model.to(DEVICE)

PulmonaryModel(
  (fc1): Linear(in_features=9, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=3, bias=True)
)

In [133]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

In [134]:
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for i, data in enumerate(train_data_loader):
        features = data['features']
        targets = data['target']

        features = features.to(DEVICE).float()
        targets = targets.to(DEVICE).float()

        model.zero_grad()
        out = model(features)
        loss = quantile_loss(out, targets, QUANTILES)
        total_loss += loss
        loss.backward()
        optimizer.step()
        # scheduler.step()
    print(f"Epoch {epoch}, Loss {total_loss/len(train_data_loader)}")

Epoch 0, Loss 0.4136742949485779
Epoch 1, Loss 0.348447322845459
Epoch 2, Loss 0.28966638445854187
Epoch 3, Loss 0.23800082504749298
Epoch 4, Loss 0.1994795948266983
Epoch 5, Loss 0.17243745923042297
Epoch 6, Loss 0.1541115790605545
Epoch 7, Loss 0.14094488322734833
Epoch 8, Loss 0.13122762739658356
Epoch 9, Loss 0.12397663295269012
Epoch 10, Loss 0.11859326809644699
Epoch 11, Loss 0.11450060456991196
Epoch 12, Loss 0.1110667809844017
Epoch 13, Loss 0.10806529223918915
Epoch 14, Loss 0.10526511818170547
Epoch 15, Loss 0.10284237563610077
Epoch 16, Loss 0.10071488469839096
Epoch 17, Loss 0.09873451292514801
Epoch 18, Loss 0.09690078347921371
Epoch 19, Loss 0.0951591208577156
Epoch 20, Loss 0.0934789851307869
Epoch 21, Loss 0.09188125282526016
Epoch 22, Loss 0.09033752977848053
Epoch 23, Loss 0.08883313089609146
Epoch 24, Loss 0.08735959231853485
Epoch 25, Loss 0.0858839675784111
Epoch 26, Loss 0.08438540995121002
Epoch 27, Loss 0.08284483850002289
Epoch 28, Loss 0.08128354698419571
Epoc