In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import rankdata

from lifelines import CoxPHFitter, KaplanMeierFitter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from lifelines.utils import concordance_index
from metric import score
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

In [2]:
train_path = "data/train.csv"
test_path = "data/test.csv"
sample_path = "data/sample_submission.csv"
data_dict_path = "data/data_dictionary.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
data_dict_df = pd.read_csv(data_dict_path)

from lifelines import KaplanMeierFitter
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    kmf = KaplanMeierFitter()
    kmf.fit(df[time_col], df[event_col])
    y = kmf.survival_function_at_times(df[time_col]).values
    return y
train["y"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

In [3]:
RMV = ["ID", "efs", "efs_time", "y"]
FEATURES = [c for c in train.columns if not c in RMV]
# print(f"Total features: {len(FEATURES)} - {FEATURES}")

CATS = []
NULLS = ["Not done", "Not tested", "N/A", "N/A, Mel not given", "No drugs reported"]
NUMS = []

for c in FEATURES:
    if train[c].dtype=="object":
        for null in NULLS:
            if null in train[c].unique():
                train[c] = train[c].fillna(null)
                test[c] = test[c].fillna(null)
        train[c] = train[c].fillna("NAN")
        test[c] = test[c].fillna("NAN")

        CATS.append(c)
    else:
        NUMS.append(c)
        train[c] = train[c].fillna(-1)
        test[c] = test[c].fillna(-1)

CAT_SIZE = []
CAT_EMB = []
NUMS = []

combined = pd.concat([train,test],axis=0,ignore_index=True)
#print("Combined data shape:", combined.shape )

# print("We LABEL ENCODE the CATEGORICAL FEATURES: ")

for c in FEATURES:
    if c in CATS:
        # LABEL ENCODE
        combined[c],_ = combined[c].factorize()
        combined[c] -= combined[c].min()
        combined[c] = combined[c].astype("int32")
        #combined[c] = combined[c].astype("category")

        n = combined[c].nunique()
        mn = combined[c].min()
        mx = combined[c].max()
        # print(f'{c} has ({n}) unique values')

        CAT_SIZE.append(mx+1) 
        CAT_EMB.append( int(np.ceil( np.sqrt(mx+1))) ) 
    else:
        if combined[c].dtype=="float64":
            combined[c] = combined[c].astype("float32")
        if combined[c].dtype=="int64":
            combined[c] = combined[c].astype("int32")
            
        m = combined[c].mean()
        s = combined[c].std()
        combined[c] = (combined[c]-m)/s
        combined[c] = combined[c].fillna(0)
        
        NUMS.append(c)
        
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold

In [5]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.out = nn.Linear(128, 1)
        self.relu = nn.LeakyReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.out(x)

In [6]:
class SurvivalModel(nn.Module):
    def __init__(self, cat_sizes, cat_emb_sizes, num_features):
        super(SurvivalModel, self).__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(size, emb_size) 
                                         for size, emb_size in zip(cat_sizes, cat_emb_sizes)])
        self.num_features = num_features
        total_emb_size = sum(cat_emb_sizes) + num_features
        
        self.fc1 = nn.Linear(total_emb_size, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)
        
    def forward(self, x_cat, x_num):
        embs = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(embs + [x_num], dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        
        return self.fc3(x)

In [7]:
def train_model(model, train_loader, valid_loader, optimizer, criterion, device, epochs):
    for epoch in range(epochs):
        model.train()
        for x_cat, x_num, y in train_loader:
            x_cat, x_num, y = x_cat.to(device), x_num.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(x_cat, x_num)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
        
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for x_cat, x_num, y in valid_loader:
                x_cat, x_num, y = x_cat.to(device), x_num.to(device), y.to(device)
                output = model(x_cat, x_num)
                valid_loss += criterion(output, y).item()
        print(f"Epoch {epoch+1}, Validation Loss: {valid_loss/len(valid_loader)}")

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [9]:
EPOCHS = 4
REPEATS = 3
FOLDS = 5
kf = KFold(n_splits=FOLDS, random_state=42, shuffle=True)

oof_nn = np.zeros(len(train))
pred_nn = np.zeros(len(test))

for r in range(REPEATS):
    print(f"### REPEAT {r+1} ###")
    
    for i, (train_index, test_index) in enumerate(kf.split(train)):
        print(f"### Fold {i+1} ###")
        
        X_train_cats = torch.tensor(train.loc[train_index, CATS].values, dtype=torch.long)
        X_train_nums = torch.tensor(train.loc[train_index, NUMS].values, dtype=torch.float32)
        y_train = torch.tensor(train.loc[train_index, "y"].values, dtype=torch.float32).unsqueeze(1)
        
        X_valid_cats = torch.tensor(train.loc[test_index, CATS].values, dtype=torch.long)
        X_valid_nums = torch.tensor(train.loc[test_index, NUMS].values, dtype=torch.float32)
        y_valid = torch.tensor(train.loc[test_index, "y"].values, dtype=torch.float32).unsqueeze(1)
        
        train_dataset = TensorDataset(X_train_cats, X_train_nums, y_train)
        valid_dataset = TensorDataset(X_valid_cats, X_valid_nums, y_valid)
        
        train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=512)
        
        model = SurvivalModel(CAT_SIZE, CAT_EMB, len(NUMS)).to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.MSELoss()
        
        train_model(model, train_loader, valid_loader, optimizer, criterion, device, EPOCHS)
        
        model.eval()
        with torch.no_grad():
            oof_nn[test_index] += model(X_valid_cats.to(device), X_valid_nums.to(device)).cpu().numpy().flatten()

        X_test_cats = torch.tensor(test[CATS].values, dtype=torch.long)
        X_test_nums = torch.tensor(test[NUMS].values, dtype=torch.float32)
        with torch.no_grad():
            pred_nn += model(X_test_cats.to(device), X_test_nums.to(device)).cpu().numpy().flatten()

oof_nn /= REPEATS
pred_nn /= (FOLDS * REPEATS)

### REPEAT 1 ###
### Fold 1 ###
Epoch 1, Validation Loss: 0.028405260915557545
Epoch 2, Validation Loss: 0.027676138716439407
Epoch 3, Validation Loss: 0.02714888487632076
Epoch 4, Validation Loss: 0.026715842851748068
### Fold 2 ###
Epoch 1, Validation Loss: 0.02892059072231253
Epoch 2, Validation Loss: 0.02791445019344489
Epoch 3, Validation Loss: 0.027547708712518215
Epoch 4, Validation Loss: 0.027293898010005552
### Fold 3 ###
Epoch 1, Validation Loss: 0.02919255414356788
Epoch 2, Validation Loss: 0.027525535163780052
Epoch 3, Validation Loss: 0.0267950890896221
Epoch 4, Validation Loss: 0.026592797910173733
### Fold 4 ###
Epoch 1, Validation Loss: 0.02967660517121355
Epoch 2, Validation Loss: 0.028026817521701258
Epoch 3, Validation Loss: 0.027579618617892265
Epoch 4, Validation Loss: 0.027319165257116158
### Fold 5 ###
Epoch 1, Validation Loss: 0.028768180248637993
Epoch 2, Validation Loss: 0.02807827712967992
Epoch 3, Validation Loss: 0.027537064782033365
Epoch 4, Validation Los

In [10]:
from metric import score

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_nn
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for NN =",m)


Overall CV for NN = 0.6500187672623404
