In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import CoxPHFitter, KaplanMeierFitter
from sklearn.preprocessing import LabelEncoder
from torch import nn
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor
from lifelines.utils import concordance_index

In [3]:
train_path = "data/train.csv"
test_path = "data/test.csv"
sample_path = "data/sample_submission.csv"
data_dict = "data/data_dictionary.csv"

train_df = pd.read_csv("data/train.csv")
epsilon = 1e-5

data_info_df = pd.read_csv(data_dict)
label_encoder = LabelEncoder()

for index, row in data_info_df.iterrows():
    if row["type"] == "Categorical":
        train_df[row["variable"]] = label_encoder.fit_transform(train_df[row["variable"]])
    else:
        train_df[row["variable"]] = train_df[row["variable"]].fillna(-1)
        
train_df["y"] = train_df["efs"] / (train_df["efs_time"] + epsilon)
train_df.head()

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time,y
0,0,7,0,7,0,-1.0,-1.0,0,0,6.0,...,2,1,8.0,0,2.0,0,10.0,0,42.356,0.0
1,1,2,0,1,0,2.0,8.0,6,0,6.0,...,1,1,8.0,0,2.0,2,10.0,1,4.672,0.214041
2,2,7,0,7,0,2.0,8.0,0,0,6.0,...,1,1,8.0,0,2.0,0,10.0,0,19.793,0.0
3,3,0,0,1,0,2.0,8.0,0,0,6.0,...,2,1,8.0,0,2.0,0,10.0,0,102.349,0.0
4,4,0,0,7,0,2.0,8.0,0,0,6.0,...,1,0,8.0,0,2.0,0,10.0,0,16.223,0.0


In [10]:
X = train_df.drop(columns=["efs", "efs_time", "ID", "y"])
y = train_df["y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    # cat_features=['y'],
    verbose=200
)

# Fit the Model
model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=100)

0:	learn: 0.1244296	test: 0.1236740	best: 0.1236740 (0)	total: 163ms	remaining: 2m 42s
200:	learn: 0.1085259	test: 0.1139788	best: 0.1139788 (200)	total: 1.73s	remaining: 6.88s
400:	learn: 0.1022008	test: 0.1129205	best: 0.1129205 (400)	total: 3.57s	remaining: 5.33s
600:	learn: 0.0974920	test: 0.1125143	best: 0.1125143 (600)	total: 5.37s	remaining: 3.57s
800:	learn: 0.0936760	test: 0.1124745	best: 0.1124483 (788)	total: 7.13s	remaining: 1.77s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.1124482572
bestIteration = 788

Shrink model to first 789 iterations.


<catboost.core.CatBoostRegressor at 0x22dd9568210>

In [11]:
y_pred = model.predict(X_test)

# Evaluate the Model
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print Metrics
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R²: {r2:.4f}")

RMSE: 0.1124
MAE: 0.0833
R²: 0.1816
