# Exercise 8

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import log_loss

# Input locations and the raw train / test tables.
train_path = "/content/train.csv"
test_path = "/content/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Columns that receive missing-value imputation below.
categorical_cols = ["Drug", "Ascites", "Hepatomegaly", "Spiders"]
numerical_cols = ["Cholesterol", "Copper", "Alk_Phos", "SGOT", "Tryglicerides", "Platelets", "Prothrombin"]

# Learn the fill values from the training table only: the per-column mode for
# the categorical columns and the per-column mean for the numerical ones.
# Both are computed before any filling happens, so they reflect the observed
# (non-missing) training values.
categorical_fill = train_df[categorical_cols].mode().iloc[0]
numerical_fill = train_df[numerical_cols].mean()

# Apply the same train-derived values to both tables so that the test rows are
# preprocessed exactly like the rows the model is trained on, instead of with
# statistics computed from the test set itself.
for frame in (train_df, test_df):
    frame[categorical_cols] = frame[categorical_cols].fillna(categorical_fill)
    frame[numerical_cols] = frame[numerical_cols].fillna(numerical_fill)

# Integer-encode every categorical feature. Each encoder is fitted on the
# training column only and then reused, unchanged, on the matching test
# column. The fitted encoders are kept in `label_encoders`, keyed by column.
categorical_features = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema"]
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder().fit(train_df[col])
    label_encoders[col] = le
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# The target column gets its own encoder, kept in `target_encoder`.
target_encoder = LabelEncoder().fit(train_df["Status"])
train_df["Status"] = target_encoder.transform(train_df["Status"])

# Min-max scale the numeric features. The scaler learns each column's range
# from the training table and the same fitted scaler is applied to the test
# table.
numerical_features = [
    "N_Days",
    "Age",
    "Bilirubin",
    "Cholesterol",
    "Albumin",
    "Copper",
    "Alk_Phos",
    "SGOT",
    "Tryglicerides",
    "Platelets",
    "Prothrombin",
    "Stage",
]
scaler = MinMaxScaler().fit(train_df[numerical_features])
train_df[numerical_features] = scaler.transform(train_df[numerical_features])
test_df[numerical_features] = scaler.transform(test_df[numerical_features])

# Target vector and feature matrix (every column except the row id and the
# target itself).
y = train_df["Status"]
X = train_df.drop(columns=["id", "Status"])

# Hold out 20% of the rows for validation, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Candidate hyperparameter values sampled by the randomized search.
param_dist = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.7, 0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2, 0.3],
)

# Multi-class probability model.
# NOTE(review): `use_label_encoder` appears to be deprecated in recent XGBoost
# releases -- confirm against the installed version before removing it.
clf = XGBClassifier(
    objective="multi:softprob",
    eval_metric="mlogloss",
    use_label_encoder=False,
    random_state=42,
)

# 30 sampled configurations, each scored by 5-fold cross-validated negative
# log loss on the training split, using all available cores.
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=30,
    scoring="neg_log_loss",
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Take the best estimator found by the search and report its log loss on the
# held-out validation split.
best_clf = random_search.best_estimator_
y_val_pred_proba = best_clf.predict_proba(X_val)
val_log_loss = log_loss(y_true=y_val, y_pred=y_val_pred_proba)
print("Validation Log Loss:", val_log_loss)

# Predict class probabilities for the test rows. Selecting `X.columns` feeds
# the model exactly the features it was trained on, in the same order, and
# leaves out `id` (and any other column that is not a training feature).
test_proba = best_clf.predict_proba(test_df[X.columns])

# One probability column per class. Column i of `test_proba` belongs to the
# encoded class i, whose original label is `target_encoder.classes_[i]`, so
# the column names are derived from the encoder rather than hard-coded by
# position.
submission = pd.DataFrame(
    test_proba,
    columns=[f"Status_{label}" for label in target_encoder.classes_],
)
# Put the row identifier first; values are assigned positionally.
submission.insert(0, "id", test_df["id"].to_numpy())

# Write the file and report the path that was actually written (the message
# previously named a different file than the one created).
submission_path = "sample_submission.csv"
submission.to_csv(submission_path, index=False)
print(f"Submission file saved as {submission_path}")
print("Best Parameters:", random_search.best_params_)