In [1]:
import sys, os
if os.path.abspath(os.pardir) not in sys.path:
    sys.path.insert(0, os.path.abspath(os.pardir))
import CONFIG
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import pydicom
import random
from sklearn import preprocessing

In [21]:
MIN_MAX_SCALER = preprocessing.MinMaxScaler()

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [8]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(1999)

In [33]:
DATA_DIR = CONFIG.CFG.DATA.BASE
BATCH_SIZE = 64

SCALE_COLUMNS = ['Weeks', 'FVC', 'Percent', 'Age'] #'Percent'
SCALE_COLUMNS = ['Weeks_Passed', 'Base_FVC', 'Base_Percent', 'Base_Age']
SEX_COLUMNS = ['Male', 'Female']
SMOKING_STATUS_COLUMNS = ['Currently smokes', 'Ex-smoker', 'Never smoked']
FV = SCALE_COLUMNS + SEX_COLUMNS + SMOKING_STATUS_COLUMNS
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [10]:
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sub_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
# remove the duplicates from the train_df
train_df.drop_duplicates(keep=False, inplace=True, subset=['Patient', 'Weeks'])

In [11]:
# extract the Patient and weeks from the Patient_Week column
sub_df['Patient'] = sub_df['Patient_Week'].apply(lambda x: x.split('_')[0])
sub_df['Weeks'] = sub_df['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub_df.head()

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12


In [12]:
# merge the sub_df with the test_df
sub_df = sub_df.drop('FVC', axis=1).merge(test_df.drop('Weeks', axis=1), on='Patient')
sub_df.head()

Unnamed: 0,Patient_Week,Confidence,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264_-12,100,ID00419637202311204720264,-12,3020,70.186855,73,Male,Ex-smoker
1,ID00419637202311204720264_-11,100,ID00419637202311204720264,-11,3020,70.186855,73,Male,Ex-smoker
2,ID00419637202311204720264_-10,100,ID00419637202311204720264,-10,3020,70.186855,73,Male,Ex-smoker
3,ID00419637202311204720264_-9,100,ID00419637202311204720264,-9,3020,70.186855,73,Male,Ex-smoker
4,ID00419637202311204720264_-8,100,ID00419637202311204720264,-8,3020,70.186855,73,Male,Ex-smoker


In [13]:
train_df['FROM'] = 'train'
test_df['FROM'] = 'val'
sub_df['FROM'] = 'test'

In [14]:
combined_df = train_df.append([test_df, sub_df])

In [15]:
# initialize base_week column
combined_df['Base_Week'] = combined_df['Weeks']
# make the weeks from sub_df to be np.nan so that when we calculate the base_week it comes from the test_df
combined_df.loc[combined_df['FROM'] == 'test', 'Base_Week'] = np.nan
# now calculate the min for each patient group and set it to the Base_Week column
combined_df['Base_Week'] = combined_df.groupby('Patient')['Base_Week'].transform('min')

In [16]:
# get the base_df (where the Base_Week == the min_week we calculated) so that we can get the base_fvc, base_age and base_percentage
base_df = combined_df[combined_df['Weeks'] == combined_df['Base_Week']]

In [17]:
base_df.rename(columns={
    'FVC': 'Base_FVC',
    'Percent': 'Base_Percent',
    'Age': 'Base_Age'
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [18]:
combined_df = combined_df.merge(base_df[['Patient', 'Base_FVC', 'Base_Percent', 'Base_Age']], on='Patient', how='left')

In [19]:
combined_df['Weeks_Passed'] = combined_df['Weeks'] - combined_df['Base_Week']

In [22]:
MIN_MAX_SCALER.fit(combined_df[combined_df['FROM'] == 'train'][['Weeks_Passed', 'FVC', 'Percent', 'Age']])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [23]:
combined_df[['Weeks_Passed', 'Base_FVC', 'Base_Percent', 'Base_Age']] = MIN_MAX_SCALER.transform(combined_df[['Weeks_Passed', 'Base_FVC', 'Base_Percent', 'Base_Age']])

In [24]:
# convert categoricals into dummies
combined_df['Sex'] = pd.Categorical(combined_df['Sex'], categories=SEX_COLUMNS)
combined_df['SmokingStatus'] = pd.Categorical(combined_df['SmokingStatus'], categories=SMOKING_STATUS_COLUMNS)
combined_df = combined_df.join(pd.get_dummies(combined_df['Sex']))
combined_df = combined_df.join(pd.get_dummies(combined_df['SmokingStatus']))

In [25]:
combined_df.drop_duplicates(inplace=True)

In [26]:
combined_df.reset_index(drop=True)

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,FROM,Patient_Week,Confidence,Base_Week,Base_FVC,Base_Percent,Base_Age,Weeks_Passed,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,train,,,-4.0,0.267050,0.236393,0.769231,0.000000,1,0,0,1,0
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,train,,,-4.0,0.267050,0.236393,0.769231,0.142857,1,0,0,1,0
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,train,,,-4.0,0.267050,0.236393,0.769231,0.174603,1,0,0,1,0
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,train,,,-4.0,0.267050,0.236393,0.769231,0.206349,1,0,0,1,0
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,train,,,-4.0,0.267050,0.236393,0.769231,0.238095,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2265,ID00426637202313170790466,129,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_129,100.0,0.0,0.376525,0.345604,0.615385,2.047619,1,0,0,0,1
2266,ID00426637202313170790466,130,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_130,100.0,0.0,0.376525,0.345604,0.615385,2.063492,1,0,0,0,1
2267,ID00426637202313170790466,131,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_131,100.0,0.0,0.376525,0.345604,0.615385,2.079365,1,0,0,0,1
2268,ID00426637202313170790466,132,2925,71.824968,73,Male,Never smoked,test,ID00426637202313170790466_132,100.0,0.0,0.376525,0.345604,0.615385,2.095238,1,0,0,0,1


In [30]:
class OSICLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(OSICLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.lc = nn.Linear(hidden_size, 50)
        self.lc2 = nn.Linear(50, output_size)

    def forward(self, X):
        out, _ = self.lstm(X)
        out = self.lc(out)
        out = self.lc2(out)
        return out

In [35]:
model = OSICLSTM(len(FV)+1, 200, 3, 1)
model.to(DEVICE)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [36]:
model

OSICLSTM(
  (lstm): LSTM(10, 200, num_layers=3, batch_first=True)
  (lc): Linear(in_features=200, out_features=50, bias=True)
  (lc2): Linear(in_features=50, out_features=1, bias=True)
)

In [37]:
combined_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,FROM,Patient_Week,Confidence,Base_Week,Base_FVC,Base_Percent,Base_Age,Weeks_Passed,Male,Female,Currently smokes,Ex-smoker,Never smoked
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,train,,,-4.0,0.26705,0.236393,0.769231,0.0,1,0,0,1,0
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,train,,,-4.0,0.26705,0.236393,0.769231,0.142857,1,0,0,1,0
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,train,,,-4.0,0.26705,0.236393,0.769231,0.174603,1,0,0,1,0
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,train,,,-4.0,0.26705,0.236393,0.769231,0.206349,1,0,0,1,0
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,train,,,-4.0,0.26705,0.236393,0.769231,0.238095,1,0,0,1,0


In [38]:
TRAIN_PATIENTS = combined_df[combined_df['FROM'] == "train"]['Patient'].unique().tolist()

In [59]:
for patient in TRAIN_PATIENTS:
    patient_data = combined_df[combined_df["Patient"] == patient][FV].values
    target = combined_df[combined_df['Patient'] == patient]["FVC"].values
    target = torch.tensor(target).float().to(DEVICE)

    print(target)

    data_tensor = torch.zeros((patient_data.shape[0], 10))
    data_tensor[:, 1:] = torch.tensor(patient_data).float()

    for i, data in enumerate(data_tensor):
        data = data.to(DEVICE)
        out = model(data.view(1,1,-1))
        data_tensor[i+1, 0] = out.cpu()
    break

tensor([2315., 2214., 2061., 2144., 2069., 2101., 2000., 2064., 2057.],
       device='cuda:0')


IndexError: index 9 is out of bounds for dimension 0 with size 9

In [64]:
for epoch in range(1000):
    total_loss = 0
    for patient in TRAIN_PATIENTS:
        patient_loss = 0
        model.zero_grad()

        patient_data = combined_df[combined_df["Patient"] == patient][FV].values
        target = combined_df[combined_df["Patient"] == patient]['FVC'].values

        target = torch.tensor(target).float().to(DEVICE)

        data_tensor = torch.zeros((patient_data.shape[0], len(FV)+1))
        data_tensor[:, 1:] = torch.tensor(patient_data).float()
        data_tensor = data_tensor.to(DEVICE)

        loss = 0
        for i in range(data_tensor.size()[0] - 1):
            out = model(data_tensor[i].view(1,1,-1))
            loss += loss_function(out.view(1), target[i+1])
            data_tensor[i+1, 0] = out.view(1)
            patient_loss += loss.item()
        # for data in patient_data:
        #     out = model(data.view(1, 1, -1))
        #     print(out)
        loss.backward()
        optimizer.step()
        total_loss += patient_loss/patient_data.shape[0]
    print(f"Epoch {epoch}, loss={total_loss}")

  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1, 1, 10]], which is output 0 of ViewBackward, is at version 9; expected version 8 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [96]:
sample_u = torch.tensor([[-12, 3020, 73]])
sample = torch.tensor(min_max_scaler.transform(sample_u)).float()
sample_u[:, 0]

tensor([-12])

In [97]:
with torch.no_grad():
    for i in range(120):
        out = model(sample.view(1,1,-1))
        out = min_max_scaler.inverse_transform(out.squeeze(dim=0))
        print(out)
        prev_zero = sample_u[:,0]
        prev_three = sample_u[:,2]
        out[:, 0] = prev_zero + i
        out[:, 2] = prev_three
        print(out, "\n")
        sample = torch.tensor(min_max_scaler.transform(out)).float()

[[ -14.31420559 2945.78741264   73.12457055]]
[[ -12.         2945.78741264   73.        ]] 

[[ -14.38764644 2874.82694507   73.13517064]]
[[ -11.         2874.82694507   73.        ]] 

[[ -13.16220379 2808.05978775   73.1353845 ]]
[[ -10.         2808.05978775   73.        ]] 

[[ -11.92862207 2744.85890365   73.13483357]]
[[  -9.         2744.85890365   73.        ]] 

[[ -10.68775672 2684.68925214   73.13386887]]
[[  -8.         2684.68925214   73.        ]] 

[[  -9.44036448 2627.08802807   73.13275075]]
[[  -7.         2627.08802807   73.        ]] 

[[  -8.1870088  2571.65270555   73.13167447]]
[[  -6.         2571.65270555   73.        ]] 

[[  -6.92818731 2518.03074229   73.13079345]]
[[  -5.         2518.03074229   73.        ]] 

[[  -5.66428661 2465.91077852   73.13021231]]
[[  -4.         2465.91077852   73.        ]] 

[[  -4.39559871 2415.01715708   73.13001239]]
[[  -3.         2415.01715708   73.        ]] 

[[  -3.12236214 2365.10494161   73.13024718]]
[[-2.00000000e

In [None]:
for i in range(5):
    print(sample)
    out = model(sample.view(1, 1, -1))
    print(out)
    sample = out

In [None]:
with torch.no_grad():
    input = torch.randn(2, 5, 4)
    print(input)
    print()
    out = model(input)
    print(out)
    print(out.shape)