In [1]:
import sys, os
if os.path.abspath(os.pardir) not in sys.path:
    sys.path.insert(0, os.path.abspath(os.pardir))
import CONFIG
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import preprocessing
from sklearn import model_selection
from tqdm.autonotebook import tqdm

In [3]:
DATA_DIR = CONFIG.CFG.DATA.BASE
K_FOLDS = 5
BATCH_SIZE = 32
LEARNING_RATE = 3e-3
NUM_EPOCHS = 1100
# print loss every
PRINT_EVERY = 50

QUANTILES = [0.2, 0.5, 0.8]
SCALE_COLUMNS = ['Weeks', 'FVC', 'Age'] #'Percent'
SEX_COLUMNS = ['Male', 'Female']
SMOKING_STATUS_COLUMNS = ['Currently smokes', 'Ex-smoker', 'Never smoked']
FV = SEX_COLUMNS + SMOKING_STATUS_COLUMNS + SCALE_COLUMNS
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

In [5]:
kf = model_selection.KFold(K_FOLDS)
MIN_MAX_SCALER = preprocessing.MinMaxScaler()

In [6]:
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
train_df.drop_duplicates(keep=False, inplace=True, subset=['Patient', 'Weeks'])
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [7]:
train_df[SCALE_COLUMNS] = MIN_MAX_SCALER.fit_transform(train_df[SCALE_COLUMNS])

In [8]:
# convert categoricals into dummies
train_df['Sex'] = pd.Categorical(train_df['Sex'], categories=SEX_COLUMNS)
train_df['SmokingStatus'] = pd.Categorical(train_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)
train_df = train_df.join(pd.get_dummies(train_df['Sex']))
train_df = train_df.join(pd.get_dummies(train_df['SmokingStatus']))

In [9]:
# REMOVE THE ONES FROM THE TRAIN_DF THAT ARE PRESENT IN TEST_DF AS WELL
TEST_PATIENTS = test_df['Patient'].unique().tolist()
valid_df = train_df[train_df['Patient'].isin(TEST_PATIENTS)]
train_df = train_df[~train_df['Patient'].isin(TEST_PATIENTS)]
TRAIN_PATIENTS = train_df['Patient'].unique().tolist()

In [10]:
valid_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
1504,ID00419637202311204720264,0.07971,0.393575,70.186855,0.615385,Male,Ex-smoker,1,0,0,1,0
1505,ID00419637202311204720264,0.086957,0.364681,66.445106,0.615385,Male,Ex-smoker,1,0,0,1,0
1506,ID00419637202311204720264,0.101449,0.351041,64.678814,0.615385,Male,Ex-smoker,1,0,0,1,0
1507,ID00419637202311204720264,0.108696,0.339555,63.19141,0.615385,Male,Ex-smoker,1,0,0,1,0
1508,ID00419637202311204720264,0.130435,0.342965,63.632983,0.615385,Male,Ex-smoker,1,0,0,1,0


In [11]:
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00007637202177411956430,0.007246,0.26705,58.253649,0.769231,Male,Ex-smoker,1,0,0,1,0
1,ID00007637202177411956430,0.072464,0.248923,55.712129,0.769231,Male,Ex-smoker,1,0,0,1,0
2,ID00007637202177411956430,0.086957,0.221464,51.862104,0.769231,Male,Ex-smoker,1,0,0,1,0
3,ID00007637202177411956430,0.101449,0.23636,53.950679,0.769231,Male,Ex-smoker,1,0,0,1,0
4,ID00007637202177411956430,0.115942,0.2229,52.063412,0.769231,Male,Ex-smoker,1,0,0,1,0


In [12]:
sub_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
sub_df['Patient'] = sub_df['Patient_Week'].apply(lambda x: x.split('_')[0])
sub_df['Weeks'] = sub_df['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub_df.head()

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12


In [13]:
sub_df = sub_df.drop('FVC', axis=1).merge(test_df.drop('Weeks', axis=1), on='Patient')

In [14]:
# have to make it categorical coz sub's sex column has males only
sub_df['Sex'] = pd.Categorical(sub_df['Sex'], categories=SEX_COLUMNS)
sub_df['SmokingStatus'] = pd.Categorical(sub_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)

In [15]:
sub_df = sub_df.join(pd.get_dummies(sub_df['Sex']))
sub_df = sub_df.join(pd.get_dummies(sub_df['SmokingStatus']))

In [16]:
sub_df[SCALE_COLUMNS] = MIN_MAX_SCALER.transform(sub_df[SCALE_COLUMNS])
sub_df.head()

Unnamed: 0,Patient_Week,Confidence,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00419637202311204720264_-12,100,ID00419637202311204720264,-0.050725,0.393575,70.186855,0.615385,Male,Ex-smoker,1,0,0,1,0
1,ID00419637202311204720264_-11,100,ID00419637202311204720264,-0.043478,0.393575,70.186855,0.615385,Male,Ex-smoker,1,0,0,1,0
2,ID00419637202311204720264_-10,100,ID00419637202311204720264,-0.036232,0.393575,70.186855,0.615385,Male,Ex-smoker,1,0,0,1,0
3,ID00419637202311204720264_-9,100,ID00419637202311204720264,-0.028986,0.393575,70.186855,0.615385,Male,Ex-smoker,1,0,0,1,0
4,ID00419637202311204720264_-8,100,ID00419637202311204720264,-0.021739,0.393575,70.186855,0.615385,Male,Ex-smoker,1,0,0,1,0


In [17]:
class PulmonaryDataset(torch.utils.data.Dataset):
    def __init__(self, df, FV, test=False):
        self.df = df
        self.test = test
        self.FV = FV

    def __getitem__(self, idx):
        return {
            'features': torch.tensor(self.df[self.FV].iloc[idx].values),
            'target': torch.tensor(self.df['FVC'].iloc[idx])
        }

    def __len__(self):
        return len(self.df)

In [18]:
class PulmonaryModel(nn.Module):
    def __init__(self, in_features=9, out_quantiles=3):
        super(PulmonaryModel, self).__init__()
        self.fc1 = nn.Linear(in_features, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, out_quantiles)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [19]:
def quantile_loss(preds, target, quantiles):
    assert not target.requires_grad
    assert preds.size(0) == target.size(0)
    losses = []
    for i, q in enumerate(quantiles):
        errors = target - preds[:, i]
        losses.append(torch.max((q - 1) * errors, q * errors).unsqueeze(1))
    loss = torch.mean(torch.sum(torch.cat(losses, dim=1), dim=1))
    return loss

In [20]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [21]:
def train_one_epoch(model, train_data_loader, optimizer, train_loss, epoch):
    model.train()
    for i, data in enumerate(train_data_loader):
        features = data['features']
        targets = data['target']

        features = features.to(DEVICE).float()
        targets = targets.to(DEVICE).float()

        model.zero_grad()
        out = model(features)
        loss = quantile_loss(out, targets, QUANTILES)
        train_loss.update(loss, features.size(0))
        loss.backward()
        optimizer.step()

In [22]:
def eval_one_epoch(model, valid_data_loader, valid_loss, lr_scheduler):
    model.eval()

    with torch.no_grad():
        for i, data in enumerate(valid_data_loader):
            features = data['features']
            targets = data['target']

            features = features.to(DEVICE).float()
            targets = targets.to(DEVICE).float()
            
            out = model(features)
            loss = quantile_loss(out, targets, QUANTILES)
            valid_loss.update(loss, features.size(0))
    
    if lr_scheduler is not None:
        lr_scheduler.step(valid_loss.avg)

In [23]:
for fold, (train_index, test_index) in enumerate(kf.split(TRAIN_PATIENTS)):
    model = PulmonaryModel(len(FV))
    model = model.to(DEVICE)

    df_train = train_df.iloc[train_index].reset_index(drop=True)
    df_valid = train_df.iloc[test_index].reset_index(drop=True)

    train_dataset = PulmonaryDataset(df_train, FV)
    valid_dataset = PulmonaryDataset(df_valid, FV)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=10,
        shuffle=True,
        num_workers=4
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=4,
        shuffle=False,
        num_workers=4
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, factor=0.05, verbose=True)

    best_valid_loss = float('inf')

    train_loss = AverageMeter()
    valid_loss = AverageMeter()
    tq = tqdm(range(NUM_EPOCHS), desc=f"Fold {fold}")
    for epoch in tq:
        train_one_epoch(model, train_data_loader, optimizer, train_loss, epoch)
        eval_one_epoch(model, valid_data_loader, valid_loss, lr_scheduler)
        
        tq.set_postfix(val_loss=valid_loss.avg.item())

        if valid_loss.avg < best_valid_loss:
            best_valid_loss = valid_loss.avg
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, os.path.join(CONFIG.CFG.DATA.MODELS_OUT, f"model_fold_{fold}.pt"))

HBox(children=(FloatProgress(value=0.0, description='Fold 0', max=1100.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Fold 1', max=1100.0, style=ProgressStyle(description_widt…

Epoch   106: reducing learning rate of group 0 to 1.5000e-04.
Epoch   117: reducing learning rate of group 0 to 7.5000e-06.



HBox(children=(FloatProgress(value=0.0, description='Fold 2', max=1100.0, style=ProgressStyle(description_widt…

Epoch    77: reducing learning rate of group 0 to 1.5000e-04.



HBox(children=(FloatProgress(value=0.0, description='Fold 3', max=1100.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Fold 4', max=1100.0, style=ProgressStyle(description_widt…




In [24]:
CONFIG.upload_to_kaggle("osicqrmodel", "OSIC QR Model", new=False)

In [25]:
models = []
for fold in range(K_FOLDS):
    model = PulmonaryModel(len(FV))
    model = model.to(DEVICE)
    checkpoint = torch.load(os.path.join(CONFIG.CFG.DATA.MODELS_OUT, f"model_fold_{fold}.pt"))
    model.load_state_dict(checkpoint['model_state_dict'])
    models.append(model)

In [26]:
test_dataset = PulmonaryDataset(sub_df, FV)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=4
)

In [27]:
avg_preds = np.zeros((len(test_dataset), len(QUANTILES)))
with torch.no_grad():
    for model in models:
        preds = []
        for j, test_data in enumerate(test_data_loader):
            features = test_data['features']
            targets = test_data['target']

            features = features.to(DEVICE).float()
            targets = targets.to(DEVICE).float()

            out = model(features)
            preds.append(out)
        preds = torch.cat(preds, dim=0).cpu().numpy()
        avg_preds += preds
avg_preds /= len(models)

In [28]:
avg_preds

array([[0.38529707, 0.3879777 , 0.38941317],
       [0.38544108, 0.38811228, 0.38956956],
       [0.38558556, 0.38824735, 0.38972664],
       ...,
       [0.38247041, 0.38580647, 0.38374192],
       [0.38253275, 0.38587356, 0.38381885],
       [0.38259862, 0.38594451, 0.3838999 ]])

In [29]:
# inverse the scaling operation for FVC
avg_preds -= MIN_MAX_SCALER.min_[SCALE_COLUMNS.index('FVC')]
avg_preds /= MIN_MAX_SCALER.scale_[SCALE_COLUMNS.index('FVC')]

In [30]:
avg_preds[:100]

array([[2973.8752516 , 2988.81171913, 2996.81019442],
       [2974.67771289, 2989.56160629, 2997.68160322],
       [2975.4827647 , 2990.3142168 , 2998.55683136],
       [2976.28771687, 2991.06662805, 2999.43199308],
       [2977.05318031, 2991.77134728, 3000.2484365 ],
       [2977.78084884, 2992.43020115, 3001.0086525 ],
       [2978.49825494, 2993.07699916, 3001.75950279],
       [2979.21237309, 2993.72030995, 3002.50746367],
       [2979.93725183, 2994.37315249, 3003.26525521],
       [2980.6624959 , 2995.02629395, 3004.02331245],
       [2981.38770676, 2995.6794354 , 3004.78146932],
       [2982.1154417 , 2996.33456955, 3005.54128678],
       [2982.84404016, 2996.9902683 , 3006.30180168],
       [2983.5726054 , 2997.64590063, 3007.06211731],
       [2984.30110421, 2998.30159938, 3007.82263222],
       [2984.97267816, 2998.91462109, 3008.52432919],
       [2985.35507777, 2999.31113567, 3008.92811713],
       [2985.73747737, 2999.7077831 , 3009.33180544],
       [2986.12000983, 3000.