In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Load the datasets
data_dictionary = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/data_dictionary.csv')
sample_submission = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv')
test_data = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')
train_data = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')

In [3]:
# Display the first few rows of the training data to understand its structure
print(train_data.head())

   ID                       dri_score psych_disturb    cyto_score diabetes  \
0   0  N/A - non-malignant indication            No           NaN       No   
1   1                    Intermediate            No  Intermediate       No   
2   2  N/A - non-malignant indication            No           NaN       No   
3   3                            High            No  Intermediate       No   
4   4                            High            No           NaN       No   

   hla_match_c_high  hla_high_res_8          tbi_status arrhythmia  \
0               NaN             NaN              No TBI         No   
1               2.0             8.0  TBI +- Other, >cGy         No   
2               2.0             8.0              No TBI         No   
3               2.0             8.0              No TBI         No   
4               2.0             8.0              No TBI         No   

   hla_low_res_6  ...          tce_div_match donor_related  \
0            6.0  ...                    NaN    

In [4]:
# Display information about the training data
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28800 entries, 0 to 28799
Data columns (total 60 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      28800 non-null  int64  
 1   dri_score               28646 non-null  object 
 2   psych_disturb           26738 non-null  object 
 3   cyto_score              20732 non-null  object 
 4   diabetes                26681 non-null  object 
 5   hla_match_c_high        24180 non-null  float64
 6   hla_high_res_8          22971 non-null  float64
 7   tbi_status              28800 non-null  object 
 8   arrhythmia              26598 non-null  object 
 9   hla_low_res_6           25530 non-null  float64
 10  graft_type              28800 non-null  object 
 11  vent_hist               28541 non-null  object 
 12  renal_issue             26885 non-null  object 
 13  pulm_severe             26665 non-null  object 
 14  prim_disease_hct        28800 non-null

In [5]:
# Display the data dictionary to understand the features
print(data_dictionary.head())

           variable                                        description  \
0         dri_score                         Refined disease risk index   
1     psych_disturb                            Psychiatric disturbance   
2        cyto_score                                  Cytogenetic score   
3          diabetes                                           Diabetes   
4  hla_match_c_high  Recipient / 1st donor allele level (high resol...   

          type                                             values  
0  Categorical  ['Intermediate' 'High' 'N/A - non-malignant in...  
1  Categorical                        ['Yes' 'No' nan 'Not done']  
2  Categorical  ['Intermediate' 'Favorable' 'Poor' 'TBD' nan '...  
3  Categorical                        ['No' 'Yes' nan 'Not done']  
4    Numerical                                                NaN  


In [6]:
# Preprocessing data
# Separate features and target variable
X = train_data.drop(columns=['efs'])  # Drop the target column
y = train_data['efs']  # Target column

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()

In [7]:
# Create a Column Transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
            ('passthrough', 'passthrough')  # Keep numerical columns as they are
        ]), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
            ('onehot', OneHotEncoder())  # Apply One-Hot Encoding to categorical columns
        ]), categorical_cols)
    ]
)

# Create a pipeline that first transforms the data and then fits the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocessing step
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))  # Model step
])


In [8]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Make predictions on the validation data
y_pred = model.predict(X_val)

# Calculate Mean Squared Error to evaluate the model
mse = mean_squared_error(y_val, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.009283784722222223


In [9]:
# Make predictions on the test data
# Check the columns in test_data
print("Test Data Columns:", test_data.columns)

Test Data Columns: Index(['ID', 'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
       'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
       'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
       'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
       'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6',
       'hla_match_c_low', 'rituximab', 'hla_match_drb1_low',
       'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail',
       'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct',
       'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe',
       'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer',
       'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue',
       'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score',
       'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related',
       'melphalan_dose', 'hla_low_res_8', 'cardiac',

In [10]:
# Drop target columns if they exist
columns_to_drop = ['efs', 'efs_time']  # Adjust this list based on actual columns
test_features = test_data.drop(columns=[col for col in columns_to_drop if col in test_data.columns], errors='ignore')

In [11]:
# Ensure test_features contains the same columns as the training features
test_features = test_features.reindex(columns=X.columns, fill_value=0)

In [12]:
# Make predictions
test_predictions = model.predict(test_features)

In [13]:
# Prepare the submission DataFrame
submission = pd.DataFrame({'ID': test_data['ID'], 'prediction': test_predictions})

# Save the submission file to the specified path
submission.to_csv('/kaggle/working/submission.csv', index=False)

print("Submission file created successfully at /kaggle/working/submission.csv.")

Submission file created successfully at /kaggle/working/submission.csv.
