In [8]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from EDA_script import *
# Set options
# pd.options.display.max_rows = 999
# pd.options.display.max_columns = 999

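# NOTE(review): df_train, df_test and df_y are used by later cells but are only
# assigned in the commented-out lines below; presumably `from EDA_script import *`
# provides them -- confirm, otherwise Restart & Run All fails on a fresh kernel.
# train_x_raw = pd.read_csv("../01-Data/X_train.csv", low_memory = True, index_col=0)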
# train_y_raw = pd.read_csv("../01-Data/y_train.csv", low_memory = True, index_col=0)
# test_x_raw = pd.read_csv("../01-Data/X_test.csv", low_memory=True, index_col=0)

# df_train = pd.DataFrame(train_x_raw)
# df_test = pd.DataFrame(test_x_raw)
# df_y = pd.DataFrame(train_y_raw)

# Simple Model Run
## XGBoost model setup

In [12]:
from sklearn.metrics import log_loss

# Remap the raw target codes onto the contiguous range 0..4:
# -1 becomes class 0, while codes 1-4 keep their value.
label_mapping = {
    -1: 0,
    1: 1,
    2: 2,
    3: 3,
    4: 4,
}
df_y = df_y.replace(label_mapping)

# Wrap the feature frames in DMatrix objects; only the training matrix carries labels.
dtrain = xgb.DMatrix(data=df_train, label=df_y, enable_categorical=True)
dtest = xgb.DMatrix(data=df_test, enable_categorical=True)

# Booster settings shared by the cross-validation and training cells.
params = dict(
    max_depth=6,
    eta=0.01,
    objective='multi:softprob',
    num_class=5,  # one class per target code in label_mapping
    eval_metric='mlogloss',
)

# Number of boosting rounds requested from xgb.cv / xgb.train.
num_boost_round = 500

## XGBoost Model GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier

# Relies on df_train and df_y from the earlier cells; df_y must already hold
# the remapped 0..4 class labels.

# Hyper-parameter grid: 3 * 3 * 3 * 2 * 2 = 108 combinations, each fitted once
# per CV fold (324 fits with cv=3), so expect a long run.
param_grid = {
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 500, 1000],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
}

# The scikit-learn wrapper infers the number of classes from the labels passed
# to fit(), so `num_class` is not set here. `use_label_encoder` is omitted as
# well: it was deprecated and later removed from XGBClassifier.
# NOTE(review): the DMatrix cells pass enable_categorical=True; if df_train has
# category-dtype columns this estimator presumably needs the same flag -- confirm.
xgb_model = XGBClassifier(objective='multi:softprob', eval_metric='mlogloss')

# Exhaustive search scored by negative log loss (higher is better), 3-fold CV,
# using every available core.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_log_loss',
    n_jobs=-1,
    cv=3,
    verbose=2,
)

# scikit-learn expects a 1-D target; flattening avoids the column-vector
# DataConversionWarning when df_y is a single-column DataFrame.
grid_search.fit(df_train, df_y.to_numpy().ravel())

# Best parameter set found
print("Best parameters found: ", grid_search.best_params_)

# Best score
print("Best score (neg_log_loss): ", grid_search.best_score_)

# The refitted best model is exposed as `best_estimator_` (note the trailing underscore)
# best_estimator = grid_search.best_estimator_


## Cross Validation

In [13]:
from xgboost import cv

# 10-fold cross-validation of the booster configured by `params`
# (`params`, `num_boost_round` and `dtrain` come from the model set-up cell).
xgb_cv = cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    nfold=10,
    metrics="mlogloss",
    early_stopping_rounds=10,
    as_pandas=True,  # return the evaluation history as a DataFrame
    seed=123,        # fixed seed so the run is repeatable
)

# Display the cross-validation history (train/test mlogloss mean and std).
xgb_cv

Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,1.526709,0.00046,1.529876,0.000522
1,1.45672,0.000567,1.462868,0.000987
2,1.396577,0.00075,1.405702,0.001105
3,1.344042,0.000861,1.356013,0.001378
4,1.297831,0.001038,1.312425,0.001586
5,1.256703,0.000913,1.273919,0.001772
6,1.21984,0.000974,1.239523,0.001951
7,1.186644,0.001114,1.208749,0.002059
8,1.156575,0.001162,1.181099,0.002152
9,1.129472,0.001152,1.156434,0.002227


## XGBoost Training

In [14]:
# Fit the final booster on the full training matrix, recording the per-round
# training mlogloss that XGBoost reports for the 'train' evaluation set.
evals_result = {}
bst = xgb.train(params, dtrain, num_boost_round,
                evals=[(dtrain, 'train')], evals_result=evals_result,
                verbose_eval=False)
print(f"Training Multiclass Logarithmic Loss: {evals_result['train']['mlogloss'][-1]}")

# Predictions for the test matrix; consumed by the submission cell.
y_test_probs = bst.predict(dtest)

class_order = [0, 1, 2, 3, 4]
class_mapping = {class_label: f"Class_{class_label}" for class_label in class_order}

# Recompute the log loss with scikit-learn on the SAME data the booster was
# fitted on. This is a sanity check of the training loss, not a validation
# score: nothing is held out in this cell. Use the cross-validation cell for
# an out-of-sample estimate.
y_train_probs = bst.predict(dtrain)
train_log_loss = log_loss(df_y, y_train_probs, labels=class_order)
# Backward-compatible alias for any cell that still reads the old name;
# despite the name, this value is measured on the training data.
val_log_loss = train_log_loss
print(f"Training Multiclass Logarithmic Loss (scikit-learn check): {train_log_loss}")

Training Multiclass Logarithmic Loss: 0.7816854378245771
Validation Multiclass Logarithmic Loss: 0.7816854274180483


## Generate Submission CSV

In [15]:
# Build the submission frame with its final column names in one step, so the
# columns are never set to placeholder names that are immediately overwritten.
# NOTE(review): assumes the prediction columns are ordered by class id 0..4 and
# that class 0 (raw label -1) means 'no answer', with classes 1-4 following in
# the order below -- confirm against the data dictionary.
submission_columns = [
    'no answer',
    'very important',
    'quite important',
    'not important',
    'not at all important',
]
submission_df = pd.DataFrame(y_test_probs, columns=submission_columns)

# Prepend the test-set index as the 'id' column.
submission_df.insert(0, 'id', df_test.index)

# Save the submission file
# submission_file = 'submission.csv'
# submission_df.to_csv(submission_file, index=False)