In [None]:
# Put the parent of the current working directory on sys.path (once) so that
# the project-level CONFIG module imported below can be resolved.
import sys, os
if os.path.abspath(os.pardir) not in sys.path:
    sys.path.insert(0, os.path.abspath(os.pardir))
import CONFIG
%reload_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader
import pydicom
import matplotlib.pyplot as plt
import cv2
import random
import torch.nn.functional as F
from sklearn import model_selection
from sklearn import preprocessing

In [None]:
# Folder holding train.csv / test.csv / sample_submission.csv and the
# train/ and test/ scan folders, as configured in the project CONFIG.
DATA_DIR = CONFIG.CFG.DATA.BASE
# Run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

QUANTILES = [0.2, 0.5, 0.8]
# Engineered numeric columns that are min-max scaled further down.
# (A second, immediately overwritten definition of this constant was removed.)
SCALE_COLUMNS = ['Weeks_Passed', 'Base_FVC', 'Base_Percent', 'Base_Age']
# Category levels, also used as the names of the one-hot columns.
SEX_COLUMNS = ['Male', 'Female']
SMOKING_STATUS_COLUMNS = ['Currently smokes', 'Ex-smoker', 'Never smoked']
# Full tabular feature vector: one-hot columns followed by the scaled columns.
FV = SEX_COLUMNS + SMOKING_STATUS_COLUMNS + SCALE_COLUMNS

# number of images used to create a single 3D array of the scan
NUM_IMAGES = 8
# side length (pixels) every averaged slice is resized to
IMG_SIZE = 256
K_FOLDS = 5

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN in
    deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this call is a
    # silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic flag set just above.
    torch.backends.cudnn.benchmark = False

seed_everything(2020)

In [None]:
# Shared min-max scaler; it is fitted and applied in later cells.
MIN_MAX_SCALER = preprocessing.MinMaxScaler()
# Cross-validation splitter producing K_FOLDS folds (default, unshuffled order).
kf = model_selection.KFold(n_splits=K_FOLDS)

In [None]:
# Load the three CSV tables that live directly under DATA_DIR.
train_df, test_df, sub_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)
# Drop rows whose (Patient, Weeks) pair occurs more than once.
# keep=False discards *all* copies of such a pair, not only the repeats.
train_df = train_df.drop_duplicates(subset=['Patient', 'Weeks'], keep=False)

In [None]:
# Split Patient_Week on '_' : the first piece is the patient id,
# the last piece is the week number (stored as an integer).
patient_week_parts = sub_df['Patient_Week'].str.split('_')
sub_df['Patient'] = patient_week_parts.str[0]
sub_df['Weeks'] = patient_week_parts.str[-1].astype('int64')
sub_df.head()

In [None]:
# Attach the per-patient columns of test_df to every submission row.
# sub_df's own FVC and test_df's Weeks are dropped first, so the merged frame
# keeps sub_df's Weeks and test_df's FVC.
test_features = test_df.drop(columns='Weeks')
sub_df = sub_df.drop(columns='FVC').merge(test_features, on='Patient')
sub_df.head()

In [None]:
# Tag every row with the table it came from so the sources can still be told
# apart once the frames are stacked together.
for frame, origin in ((train_df, 'train'), (test_df, 'val'), (sub_df, 'test')):
    frame['FROM'] = origin

In [None]:
# Stack train, val (test_df) and test (sub_df) rows into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat performs the same
# row-wise stacking and likewise keeps each frame's original row labels.
combined_df = pd.concat([train_df, test_df, sub_df])

In [None]:
# Base_Week starts out as the visit week, except for rows that came from
# sub_df (FROM == 'test'): those are blanked to NaN so that the per-patient
# minimum below is taken over the other rows only.
# Series.where builds the column in one step; assigning NaN into the integer
# column through .loc relies on silent upcasting, which pandas >= 2.1
# deprecates. The mask is passed as a plain array so no index alignment is
# attempted on the (non-unique) row labels.
is_measured_row = (combined_df['FROM'] != 'test').to_numpy()
combined_df['Base_Week'] = combined_df['Weeks'].where(is_measured_row)
# now calculate the min for each patient group and set it to the Base_Week column
combined_df['Base_Week'] = combined_df.groupby('Patient')['Base_Week'].transform('min')

In [None]:
# Rows measured in the patient's base week; they supply the base FVC, age and
# percent values. The explicit .copy() makes base_df an independent frame, so
# modifying it afterwards does not raise SettingWithCopyWarning or risk
# touching combined_df.
base_df = combined_df[combined_df['Weeks'] == combined_df['Base_Week']].copy()

In [None]:
# Give the base-week measurements their Base_* names.
base_df = base_df.rename(
    columns={'FVC': 'Base_FVC', 'Percent': 'Base_Percent', 'Age': 'Base_Age'}
)

In [None]:
# Left-join the base-week values onto every row of the same patient.
base_columns = ['Patient', 'Base_FVC', 'Base_Percent', 'Base_Age']
combined_df = pd.merge(combined_df, base_df[base_columns], how='left', on='Patient')

In [None]:
# Weeks elapsed between the patient's base week and this row's week.
combined_df['Weeks_Passed'] = combined_df['Weeks'].sub(combined_df['Base_Week'])

In [None]:
# Fit the scaler on the train rows only.
# The values are passed as a plain array on purpose: the scaler is later
# applied to differently named columns (Base_FVC, Base_Percent, Base_Age), and
# recent scikit-learn versions reject a transform whose column names differ
# from the names recorded during fit.
scaler_fit_columns = ['Weeks_Passed', 'FVC', 'Percent', 'Age']
MIN_MAX_SCALER.fit(combined_df.loc[combined_df['FROM'] == 'train', scaler_fit_columns].to_numpy())

In [None]:
# Scale the engineered columns with the statistics learned from the train rows.
# SCALE_COLUMNS lists exactly these four columns in the order the scaler was
# fitted. The values go in as a plain array because the column names differ
# from those used in fit (Base_FVC vs FVC, ...), and recent scikit-learn
# versions refuse a DataFrame whose names do not match.
# NOTE: re-running this cell rescales already scaled values.
combined_df[SCALE_COLUMNS] = MIN_MAX_SCALER.transform(combined_df[SCALE_COLUMNS].to_numpy())

In [None]:
# One-hot encode Sex and SmokingStatus. Declaring the category levels first
# makes get_dummies emit one column per listed level, whether or not that
# level occurs in the data.
for column, levels in (('Sex', SEX_COLUMNS), ('SmokingStatus', SMOKING_STATUS_COLUMNS)):
    combined_df[column] = pd.Categorical(combined_df[column], categories=levels)
    combined_df = combined_df.join(pd.get_dummies(combined_df[column]))

In [None]:
# Remove rows that are exact copies of an earlier row (all columns equal).
combined_df = combined_df.drop_duplicates()

In [None]:
# reset_index returns a new frame; without the assignment the index of
# combined_df was never actually reset.
combined_df = combined_df.reset_index(drop=True)
combined_df.head()

In [None]:
# gave the gdcm error
BAD_PATIENT_IDS = ['ID00011637202177653955184', 'ID00052637202186188008618']
# Distinct patient ids, in order of first appearance in each table.
TRAIN_PATIENTS = train_df['Patient'].drop_duplicates().tolist()
ALL_TEST_PATIENTS = test_df['Patient'].drop_duplicates().tolist()
# Train patients minus the ones listed in BAD_PATIENT_IDS.
ALL_TRAIN_PATIENTS = list(filter(lambda pat: pat not in BAD_PATIENT_IDS, TRAIN_PATIENTS))

In [None]:
def get_averaged_slices(patient_id, folder_path, num_images, img_size=None):
    """Condense one patient's DICOM series into ``num_images`` averaged slices.

    The files in ``folder_path/patient_id`` are ordered by the integer before
    the first '.' in their name and read with pydicom. Consecutive slices are
    grouped, each group is averaged pixel-wise and the average is resized to
    ``img_size`` x ``img_size``.

    Grouping when at least ``num_images`` slices exist: every group holds
    ``len(slices) // num_images`` slices, and the first group additionally
    absorbs the ``len(slices) % num_images`` left-over slices.

    When fewer than ``num_images`` slices exist, each output is a single slice
    picked at evenly spaced (rounded) positions, so slices are repeated and
    the result still has ``num_images`` entries. Previously this case crashed
    with ``range() arg 3 must not be zero``.

    Args:
        patient_id: Name of the patient's sub-folder inside ``folder_path``.
        folder_path: Directory containing one sub-folder per patient.
        num_images: Number of averaged slices to return (must be >= 1).
        img_size: Side length of each output slice; defaults to the
            notebook-level ``IMG_SIZE``.

    Returns:
        ``np.ndarray`` of shape ``(num_images, img_size, img_size)``.

    Raises:
        ValueError: If ``num_images`` < 1 or the patient folder has no files.
    """
    if num_images < 1:
        raise ValueError("num_images must be >= 1, got {}".format(num_images))
    if img_size is None:
        img_size = IMG_SIZE

    full_path = os.path.join(folder_path, patient_id)
    # sorted using the first number part of the file name
    file_names = sorted(os.listdir(full_path), key=lambda name: int(name.split('.')[0]))
    if not file_names:
        raise ValueError("no DICOM files found in {}".format(full_path))

    # dcmread is the supported reader; read_file is a deprecated alias that
    # pydicom 3 removed. Only the pixel data is kept.
    pixel_arrays = [
        pydicom.dcmread(os.path.join(full_path, name)).pixel_array
        for name in file_names
    ]
    total = len(pixel_arrays)

    if total < num_images:
        # Not enough slices to average: repeat the nearest slice instead.
        picks = np.rint(np.linspace(0, total - 1, num_images)).astype(int)
        groups = [pixel_arrays[k:k + 1] for k in picks]
    else:
        group_size = total // num_images
        # the first group also takes the remainder slices
        first_stop = group_size + total % num_images
        groups = [pixel_arrays[:first_stop]]
        groups += [
            pixel_arrays[start:start + group_size]
            for start in range(first_stop, total, group_size)
        ]

    return np.array([
        cv2.resize(np.average(group, axis=0), (img_size, img_size))
        for group in groups
    ])

In [None]:
# Cache mapping a patient id to that patient's pre-computed scan array.
array_from_id = dict()

In [None]:
# Pre-compute the averaged scan of every usable train patient, then of every
# test patient, and cache it under the patient id (train first, so a patient
# present in both ends up with the array built from the test folder).
for split_name, patient_ids in (("train", ALL_TRAIN_PATIENTS), ("test", ALL_TEST_PATIENTS)):
    split_dir = os.path.join(DATA_DIR, split_name)
    for patient_id in patient_ids:
        array_from_id[patient_id] = get_averaged_slices(patient_id, split_dir, NUM_IMAGES)

In [None]:
class PulmonaryDataset(Dataset):
    """Dataset yielding one (scan array, FVC target) pair per DataFrame row.

    Args:
        df: Frame with at least a 'Patient' and an 'FVC' column.
        FV: Names of the tabular feature columns. Stored only; the tabular
            features are not part of the returned sample yet.
        test: Stored flag; it does not change what ``__getitem__`` returns.
        arrays: Optional mapping patient id -> ``np.ndarray``. When omitted,
            the notebook-level ``array_from_id`` cache is looked up at access
            time, which is what the class always did before.
    """

    def __init__(self, df, FV, test=False, arrays=None):
        self.df = df
        self.test = test
        self.FV = FV
        self.arrays = arrays

    def __getitem__(self, idx):
        """Return ``{'imgarray': tensor, 'target': tensor}`` for row ``idx``.

        'imgarray' is the patient's cached array with a new leading axis of
        size 1; 'target' is the row's FVC value.
        """
        lookup = self.arrays if self.arrays is not None else array_from_id
        # Index the column first: df.iloc[idx] would materialise the whole
        # mixed-dtype row just to read a single field.
        patient_id = self.df['Patient'].iloc[idx]
        return {
            'imgarray': torch.from_numpy(lookup[patient_id]).unsqueeze(0),
            # 'tabfeatures': torch.tensor(self.df[self.FV].iloc[idx].values),
            'target': torch.tensor(self.df['FVC'].iloc[idx])
        }

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

In [None]:
class PulmonaryModel(nn.Module):
    """3D CNN mapping a stacked scan volume to ``cnn_output_size`` values.

    Three conv/conv/max-pool stages are followed by two (1, 3, 3)
    convolutions and a three-layer fully connected head.

    Args:
        cnn_output_size: Width of the final linear layer.
        in_features: Currently unused; kept so existing calls keep working.
        out_quantiles: Currently unused; kept so existing calls keep working.
        input_shape: ``(depth, height, width)`` of one input volume. It is
            used to size the first linear layer; the default (8, 256, 256)
            yields the 86528 inputs that used to be hard-coded.

    Raises:
        ValueError: If ``input_shape`` is too small for the conv stack.
    """

    def __init__(self, cnn_output_size=10, in_features=9, out_quantiles=3,
                 input_shape=(8, 256, 256)):
        super().__init__()

        self.conv_layer1 = self._make_conv_layer(1, 8)
        self.conv_layer2 = self._make_conv_layer(8, 32)
        self.conv_layer3 = self._make_conv_layer(32, 64)
        self.conv_layer4 = nn.Conv3d(64, 128, kernel_size=(1, 3, 3))
        self.conv_layer5 = nn.Conv3d(128, 128, kernel_size=(1, 3, 3), padding=0)

        self.fc1 = nn.Linear(self._flattened_size(input_shape), 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, cnn_output_size)

    @staticmethod
    def _flattened_size(input_shape):
        """Number of features left after conv_layer5 for one input volume."""
        dims = [int(d) for d in input_shape]
        if len(dims) != 3:
            raise ValueError("input_shape must be (depth, height, width)")

        def step(current, kernel=None, padding=0):
            # kernel given -> stride-1 convolution; kernel None -> 2x2x2 max-pool
            if kernel is None:
                result = [d // 2 for d in current]
            else:
                result = [d + 2 * padding - k + 1 for d, k in zip(current, kernel)]
            if min(result) < 1:
                raise ValueError(
                    "input_shape {} is too small for the convolution stack".format(
                        tuple(input_shape)))
            return result

        # mirrors _make_conv_layer applied three times
        for _ in range(3):
            dims = step(dims, (2, 3, 3), padding=0)
            dims = step(dims, (2, 3, 3), padding=1)
            dims = step(dims)
        # conv_layer4 and conv_layer5
        dims = step(dims, (1, 3, 3))
        dims = step(dims, (1, 3, 3))
        # 128 = output channels of conv_layer5
        return 128 * dims[0] * dims[1] * dims[2]

    def _make_conv_layer(self, in_c, out_c):
        """Build one stage: conv -> LeakyReLU -> conv -> LeakyReLU -> max-pool."""
        return nn.Sequential(
            nn.Conv3d(in_c, out_c, kernel_size=(2, 3, 3), padding=0),
            nn.LeakyReLU(),
            nn.Conv3d(out_c, out_c, kernel_size=(2, 3, 3), padding=1),
            nn.LeakyReLU(),
            nn.MaxPool3d((2, 2, 2)),
        )

    def forward(self, x):
        """Run the network on ``x`` of shape (batch, 1, depth, height, width)."""
        x = self.conv_layer1(x)
        x = self.conv_layer2(x)
        x = self.conv_layer3(x)
        # NOTE(review): no activation is applied between conv_layer4,
        # conv_layer5 and fc1 - confirm that this is intended.
        x = self.conv_layer4(x)
        x = self.conv_layer5(x)

        # flatten
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x

In [None]:
# Build the network with its default hyper-parameters and move it to DEVICE.
# Module.to returns the module itself, so the cell still ends by displaying it.
model = PulmonaryModel()
model = model.to(DEVICE)
model

In [None]:
# Keep only rows whose patient has a pre-computed scan in array_from_id.
# Patients listed in BAD_PATIENT_IDS are never loaded into that cache, so
# their rows would raise KeyError as soon as the loader sampled one of them.
trainable_df = train_df[train_df['Patient'].isin(list(array_from_id))].reset_index(drop=True)
train_dataset = PulmonaryDataset(trainable_df, FV, test=False)

train_data_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=4,
)

In [None]:
# Smoke test: push a single batch through the model and show the raw output.
# no_grad keeps autograd from recording the pass; otherwise every activation
# of the batch stays in memory for as long as `out` is referenced.
with torch.no_grad():
    for data in train_data_loader:
        imgarray = data['imgarray'].to(DEVICE).float()
        out = model(imgarray)
        print(out)
        break

In [None]:
# Print the shape of every cached scan array, one line per patient.
for volume in array_from_id.values():
    print(volume.shape)