In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
org_data_df = pd.read_csv('cirrhosis.csv')

In [3]:
# Drop unnecessary columns
train_df = train_df.drop(['id'], axis=1)
org_data_df = org_data_df.drop(['ID'], axis=1)

In [4]:
# Combine the competition train data and original data
combined_data_df = pd.concat([train_df, org_data_df])

In [5]:
features = combined_data_df.drop(['Status'], axis=1).columns

In [None]:
#Checking for null values
combined_data_df.isnull().sum()

In [6]:
# Handling missing values
missing_features = ['Drug', 'Ascites', 'Hepatomegaly', 'Spiders', 'Cholesterol', 'Copper',
                    'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage']

In [7]:
for feature in missing_features:
    combined_data_df[feature].fillna(combined_data_df[feature].mode()[0], inplace=True)

In [12]:
encoder = LabelEncoder()
categorical_features = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema','Status']
for feature in categorical_features:
    combined_data_df[feature] = encoder.fit_transform(combined_data_df[feature])


In [15]:
#categorical_features.remove('Status')
for feature in categorical_features:
    test_df[feature] = encoder.transform(test_df[feature])


In [16]:
X = combined_data_df.drop(['Status'], axis=1)
y = combined_data_df['Status']


In [17]:
RANDOM_SEED = np.random.seed(423)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [20]:
#XGBoost model
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
preds_xgb = xgb_model.predict(X_test)
y_pred_probs_xgb = xgb_model.predict_proba(X_test)

In [21]:
# Random Forest Model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
preds_rf = rf_model.predict(X_test)
y_pred_probs_rf = rf_model.predict_proba(X_test)

In [22]:
# Evaluation Metrics
print('XGBoost Model Log Loss: ', log_loss(y_test, y_pred_probs_xgb))
print('XGBoost Model Accuracy:', accuracy_score(y_test, preds_xgb))
print('XGBoost Model Precision:', precision_score(y_test, preds_xgb, average="weighted"))
print('XGBoost Model Recall:', recall_score(y_test, preds_xgb, average="weighted"))

print('Random Forest Model Log Loss: ', log_loss(y_test, y_pred_probs_rf))
print('Random Forest Model Accuracy:', accuracy_score(y_test, preds_rf))
print('Random Forest Model Precision:', precision_score(y_test, preds_rf, average="weighted"))
print('Random Forest Model Recall:', recall_score(y_test, preds_rf, average="weighted"))

XGBoost Model Log Loss:  0.45434760282351466
XGBoost Model Accuracy: 0.8366366366366367
XGBoost Model Precision: 0.8317562597771155
XGBoost Model Recall: 0.8366366366366367
Random Forest Model Log Loss:  0.42802261336603875
Random Forest Model Accuracy: 0.8342342342342343
Random Forest Model Precision: 0.8327412726942099
Random Forest Model Recall: 0.8342342342342343


In [23]:
# Best XGBoost Model with tuned hyperparameters
xgb_params = {'n_estimators': 607, 'learning_rate': 0.04191844445257235,
              'max_depth': 6, 'subsample': 0.7079706225468251,
              'colsample_bytree': 0.16799013289247494, 'min_child_weight': 17}

In [24]:
xgb_model_tuned = XGBClassifier(**xgb_params, random_state=RANDOM_SEED)
xgb_model_tuned.fit(X, y)

In [25]:
preds_xgb_tuned = xgb_model_tuned.predict(X_test)
y_pred_probs_xgb_tuned = xgb_model_tuned.predict_proba(X_test)


In [None]:

# Print the evaluation metrics for the tuned XGBoost model
print('\nTuned XGBoost Model Log Loss: ', log_loss(y_test, y_pred_probs_xgb_tuned))
print('Tuned XGBoost Model Accuracy:', accuracy_score(y_test, preds_xgb_tuned))
print('Tuned XGBoost Model Precision:', precision_score(y_test, preds_xgb_tuned, average="weighted"))
print('Tuned XGBoost Model Recall:', recall_score(y_test, preds_xgb_tuned, average="weighted"))

In [27]:
test_IDs = test_df['id']
test_df = test_df.drop(['id'], axis=1)
y_pred_probs_tuned = xgb_model_tuned.predict_proba(test_df)

In [28]:
submission_labels = ["Status_C", "Status_CL", "Status_D"]
submission_df = pd.DataFrame({"id": test_IDs, **dict(zip(submission_labels, y_pred_probs_tuned.T))})

In [29]:
submission_df.head()

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.396452,0.030772,0.572776
1,7906,0.417996,0.299414,0.282591
2,7907,0.009435,0.006295,0.984271
3,7908,0.980863,0.003076,0.016061
4,7909,0.854353,0.068162,0.077484


In [30]:
submission_df.to_csv("submission.csv",index=False)