In [1]:
import sys, os
if os.path.abspath(os.pardir) not in sys.path:
    sys.path.insert(0, os.path.abspath(os.pardir))
import CONFIG
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import preprocessing
from sklearn import model_selection

In [3]:
DATA_DIR = CONFIG.CFG.DATA.BASE
K_FOLDS = 5
BATCH_SIZE = 32
LEARNING_RATE = 3e-3
NUM_EPOCHS = 1000
ES_PATIENCE = 20
QUANTILES = [0.2, 0.5, 0.8]
SCALE_COLUMNS = ['Weeks', 'FVC', 'Percent', 'Age']
SEX_COLUMNS = ['Male', 'Female']
SMOKING_STATUS_COLUMNS = ['Currently smokes', 'Ex-smoker', 'Never smoked']
FV = SEX_COLUMNS + SMOKING_STATUS_COLUMNS + SCALE_COLUMNS
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
kf = model_selection.KFold(K_FOLDS)
MIN_MAX_SCALER = preprocessing.MinMaxScaler()

In [5]:
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
train_df.drop_duplicates(keep=False, inplace=True, subset=['Patient', 'Weeks'])
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [6]:
train_df[SCALE_COLUMNS] = MIN_MAX_SCALER.fit_transform(train_df[SCALE_COLUMNS])

In [7]:
# convert categoricals into dummies
train_df['Sex'] = pd.Categorical(train_df['Sex'], categories=SEX_COLUMNS)
train_df['SmokingStatus'] = pd.Categorical(train_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)
train_df = train_df.join(pd.get_dummies(train_df['Sex']))
train_df = train_df.join(pd.get_dummies(train_df['SmokingStatus']))

In [8]:
# REMOVE THE ONES FROM THE TRAIN_DF THAT ARE PRESENT IN TEST_DF AS WELL
TEST_PATIENTS = test_df['Patient'].unique().tolist()
valid_df = train_df[train_df['Patient'].isin(TEST_PATIENTS)]
train_df = train_df[~train_df['Patient'].isin(TEST_PATIENTS)]
TRAIN_PATIENTS = train_df['Patient'].unique().tolist()

In [9]:
valid_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
1504,ID00419637202311204720264,0.07971,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
1505,ID00419637202311204720264,0.086957,0.364681,0.302311,0.615385,Male,Ex-smoker,1,0,0,1,0
1506,ID00419637202311204720264,0.101449,0.351041,0.288097,0.615385,Male,Ex-smoker,1,0,0,1,0
1507,ID00419637202311204720264,0.108696,0.339555,0.276128,0.615385,Male,Ex-smoker,1,0,0,1,0
1508,ID00419637202311204720264,0.130435,0.342965,0.279682,0.615385,Male,Ex-smoker,1,0,0,1,0


In [10]:
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00007637202177411956430,0.007246,0.26705,0.236393,0.769231,Male,Ex-smoker,1,0,0,1,0
1,ID00007637202177411956430,0.072464,0.248923,0.215941,0.769231,Male,Ex-smoker,1,0,0,1,0
2,ID00007637202177411956430,0.086957,0.221464,0.18496,0.769231,Male,Ex-smoker,1,0,0,1,0
3,ID00007637202177411956430,0.101449,0.23636,0.201767,0.769231,Male,Ex-smoker,1,0,0,1,0
4,ID00007637202177411956430,0.115942,0.2229,0.18658,0.769231,Male,Ex-smoker,1,0,0,1,0


In [11]:
sub_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
sub_df['Patient'] = sub_df['Patient_Week'].apply(lambda x: x.split('_')[0])
sub_df['Weeks'] = sub_df['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub_df.head()

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12


In [12]:
sub_df = sub_df.drop('FVC', axis=1).merge(test_df.drop('Weeks', axis=1), on='Patient')

In [13]:
# have to make it categorical coz sub's sex column has males only
sub_df['Sex'] = pd.Categorical(sub_df['Sex'], categories=SEX_COLUMNS)
sub_df['SmokingStatus'] = pd.Categorical(sub_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)

In [14]:
sub_df = sub_df.join(pd.get_dummies(sub_df['Sex']))
sub_df = sub_df.join(pd.get_dummies(sub_df['SmokingStatus']))

In [15]:
sub_df[SCALE_COLUMNS] = MIN_MAX_SCALER.transform(sub_df[SCALE_COLUMNS])
sub_df.head()

Unnamed: 0,Patient_Week,Confidence,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00419637202311204720264_-12,100,ID00419637202311204720264,-0.050725,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
1,ID00419637202311204720264_-11,100,ID00419637202311204720264,-0.043478,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
2,ID00419637202311204720264_-10,100,ID00419637202311204720264,-0.036232,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
3,ID00419637202311204720264_-9,100,ID00419637202311204720264,-0.028986,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0
4,ID00419637202311204720264_-8,100,ID00419637202311204720264,-0.021739,0.393575,0.332421,0.615385,Male,Ex-smoker,1,0,0,1,0


In [16]:
class PulmonaryDataset(torch.utils.data.Dataset):
    def __init__(self, df, FV, test=False):
        self.df = df
        self.test = test
        self.FV = FV

    def __getitem__(self, idx):
        return {
            'features': torch.tensor(self.df[self.FV].iloc[idx].values),
            'target': torch.tensor(self.df['FVC'].iloc[idx])
        }

    def __len__(self):
        return len(self.df)

In [17]:
class PulmonaryModel(nn.Module):
    def __init__(self, in_features=9, out_quantiles=3):
        super(PulmonaryModel, self).__init__()
        self.fc1 = nn.Linear(in_features, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, out_quantiles)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [18]:
def quantile_loss(preds, target, quantiles):
    assert not target.requires_grad
    assert preds.size(0) == target.size(0)
    losses = []
    for i, q in enumerate(quantiles):
        errors = target - preds[:, i]
        losses.append(torch.max((q - 1) * errors, q * errors).unsqueeze(1))
    loss = torch.mean(torch.sum(torch.cat(losses, dim=1), dim=1))
    return loss

In [19]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [20]:
def train_one_epoch(model, train_data_loader, optimizer, epoch):
    model.train()
    train_losses = AverageMeter()

    for i, data in enumerate(train_data_loader):
        features = data['features']
        targets = data['target']

        features = features.to(DEVICE).float()
        targets = targets.to(DEVICE).float()

        model.zero_grad()
        out = model(features)
        loss = quantile_loss(out, targets, QUANTILES)
        train_losses.update(loss, features.size(0))
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch}, Loss: {train_losses.avg}")

In [24]:
def eval_one_epoch(model, valid_data_loader, valid_loss, lr_scheduler):
    model.eval()

    with torch.no_grad():
        for i, data in enumerate(valid_data_loader):
            features = data['features']
            targets = data['target']

            features = features.to(DEVICE).float()
            targets = targets.to(DEVICE).float()
            
            out = model(features)
            loss = quantile_loss(out, targets, QUANTILES)
            valid_loss.update(loss, features.size(0))
    
    if lr_scheduler is not None:
        lr_scheduler.step(valid_loss.avg)

In [22]:
for fold, (train_index, test_index) in enumerate(kf.split(TRAIN_PATIENTS)):
    model = PulmonaryModel(len(FV))
    model = model.to(DEVICE)

    df_train = train_df.iloc[train_index].reset_index(drop=True)
    df_valid = train_df.iloc[test_index].reset_index(drop=True)

    train_dataset = PulmonaryDataset(df_train, FV)
    valid_dataset = PulmonaryDataset(df_valid, FV)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=10,
        shuffle=True,
        num_workers=4
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=4,
        shuffle=False,
        num_workers=4
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, factor=0.05, verbose=True)

    best_valid_loss = float('inf')
    valid_loss = AverageMeter()
    for epoch in range(NUM_EPOCHS):
        train_one_epoch(model, train_data_loader, optimizer, epoch)
        eval_one_epoch(model, valid_data_loader, valid_loss, lr_scheduler)

        if valid_loss.avg < best_valid_loss:
            best_valid_loss = valid_loss.avg
            print(f'Validation Loss improved to {valid_loss.avg } Saving model {fold}')
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, os.path.join(CONFIG.CFG.DATA.MODELS_OUT, f"model_fold_{fold}.pt"))

Epoch 0, Loss: 0.24952112138271332


NameError: name 'val_loss' is not defined

In [None]:
CONFIG.upload_to_kaggle("osicqrmodel", "OSIC QR Model", new=False)

In [None]:
models = []
for fold in range(K_FOLDS):
    model = PulmonaryModel(len(FV))
    model = model.to(DEVICE)
    checkpoint = torch.load(f"model_fold_{fold}.pt")
    model.load_state_dict(checkpoint['model_state_dict'])
    models.append(model)

In [None]:
test_dataset = PulmonaryDataset(test_df, FV)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=4
)

In [None]:
avg_preds = np.zeros(len(test_dataset), len(QUANTILES))
with torch.no_grad():
    for model in models:
        for j, test_data in enumerate(test_data_loader):
            features = test_data['features']
            targets = test_data['target']

            features = features.to(DEVICE).float()
            targets = targets.to(DEVICE).float()

            out = model(features)
            for ou in out.cpu().numpy().tolist():
                preds.append(ou)

In [None]:
# inverse the scaling operation for FVC
preds -= MIN_MAX_SCALER.min_[SCALE_COLUMNS.index('FVC')]
preds /= MIN_MAX_SCALER.scale_[SCALE_COLUMNS.index('FVC')]

In [None]:
preds