In [1]:
from EDA_script2 import *

# train_x_raw = pd.read_csv("../01-Data/X_train.csv", low_memory = True, index_col=0)
# train_y_raw = pd.read_csv("../01-Data/y_train.csv", low_memory = True, index_col=0)
# test_x_raw = pd.read_csv("../01-Data/X_test.csv", low_memory=True, index_col=0)

# df_train = pd.DataFrame(train_x_raw)
# df_test = pd.DataFrame(test_x_raw)
# df_y = pd.DataFrame(train_y_raw)

# Simple Model Run
##  xgBoost model set up

In [10]:
# # one-hot encoding
# 
# # Apply get_dummies to the entire DataFrame, automatically encoding all categorical columns
# df_train_encoded = pd.get_dummies(df_train)
# df_test_encoded = pd.get_dummies(df_test)
# 
# # To ensure the training and test sets have the same columns after encoding, you might need to align them
# df_train_encoded, df_test_encoded = df_train_encoded.align(df_test_encoded, join='left', axis=1, fill_value=0)

In [2]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import log_loss

label_mapping = {-1: 0, 1: 1, 2: 2, 3: 3, 4: 4}
df_y = df_y.replace(label_mapping)

dtrain = xgb.DMatrix(df_train, label=df_y, enable_categorical=True)
dtest = xgb.DMatrix(df_test, enable_categorical=True)

params = {
    'max_depth': 6,  
    'colsample_bytree': 0.7,  # Reduced
    'gamma': 0.4,  # Slightly increased
    'subsample': 0.6,  # Reduced
    'eta': 0.005,  # Reduced
    'objective': 'multi:softprob',
    'num_class': 5,
    'eval_metric': 'mlogloss',
    'lambda': 2,  # Increased L2 regularization
    'alpha': 0.2,  # Increased L1 regularization
    'min_child_weight': 3  # Added to control overfitting
    # 'n_estimators': 2000, # Uncomment and adjust as necessary
}


num_boost_round = 1000

## Cross Validation

In [None]:
from xgboost import cv

# params and num_boost_round provided above
# xgb_cv = cv(dtrain=dtrain, params=params, nfold=5,
#             num_boost_round=num_boost_round, early_stopping_rounds=10,
#             metrics="mlogloss", as_pandas=True, seed=123)
# 
# xgb_cv

Model!

In [4]:
from sklearn.model_selection import StratifiedKFold
from bayes_opt import BayesianOptimization
import xgboost as xgb
import numpy as np
import pandas as pd

# Assuming df_train, df_y are already defined and preprocessed

def xgb_cv_score(max_depth, gamma, colsample_bytree, subsample, eta, reg_lambda, reg_alpha, min_child_weight):
    """
    This function computes the cross-validated log loss using Stratified K-Fold
    for the given hyperparameter settings.
    """
    # Parameters that the optimizer can explore
    params = {
        'max_depth': int(max_depth),
        'gamma': gamma,
        'colsample_bytree': colsample_bytree,
        'subsample': subsample,
        'eta': eta,
        'objective': 'multi:softprob',
        'num_class': 5,
        'eval_metric': 'mlogloss',
        'lambda': reg_lambda,
        'alpha': reg_alpha,
        'min_child_weight': min_child_weight,
        'verbosity': 0,  # Quiet mode
        'seed': 42
    }

    # Convert to DMatrix for efficiency
    dtrain = xgb.DMatrix(df_train, label=df_y)

    # Stratified K-Fold Cross Validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    log_loss_scores = []

    for train_index, test_index in skf.split(df_train, df_y):
        xgb_train = xgb.DMatrix(df_train.iloc[train_index], label=df_y.iloc[train_index])
        xgb_test = xgb.DMatrix(df_train.iloc[test_index], label=df_y.iloc[test_index])

        # Train the model
        model = xgb.train(params, xgb_train, num_boost_round=100)

        # Predict & evaluate
        preds = model.predict(xgb_test)
        log_loss_score = log_loss(df_y.iloc[test_index], preds, labels=list(range(5)))
        log_loss_scores.append(log_loss_score)

    # Return the negative mean log loss
    return -np.mean(log_loss_scores)

# Define the parameter bounds
pbounds = {
    'max_depth': (3, 10),
    'gamma': (0, 1),
    'colsample_bytree': (0.3, 0.9),
    'subsample': (0.3, 0.9),
    'eta': (0.01, 0.3),
    'reg_lambda': (1, 5),
    'reg_alpha': (0, 1),
    'min_child_weight': (1, 6),
}

optimizer = BayesianOptimization(f=xgb_cv_score, pbounds=pbounds, random_state=42, verbose=2)

# Optimize
optimizer.maximize(init_points=5, n_iter=10)

|   iter    |  target   | colsam... |    eta    |   gamma   | max_depth | min_ch... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.8725  [0m | [0m0.5247   [0m | [0m0.2857   [0m | [0m0.732    [0m | [0m7.191    [0m | [0m1.78     [0m | [0m0.156    [0m | [0m1.232    [0m | [0m0.8197   [0m |
| [0m2        [0m | [0m-0.9     [0m | [0m0.6607   [0m | [0m0.2153   [0m | [0m0.02058  [0m | [0m9.789    [0m | [0m5.162    [0m | [0m0.2123   [0m | [0m1.727    [0m | [0m0.41     [0m |
| [95m3        [0m | [95m-0.852   [0m | [95m0.4825   [0m | [95m0.1622   [0m | [95m0.4319   [0m | [95m5.039    [0m | [95m4.059    [0m | [95m0.1395   [0m | [95m2.169    [0m | [95m0.5198   [0m |
| [0m4        [0m | [0m-0.8714  [0m | [0m0.5736   [0m | [0m0.2377   [0m | [0m0.1997   [0m | [0m6.6      [0m | [0m3.962    [0m 

In [5]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Assuming df_train is your features and df_y is your labels
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(df_train, df_y, test_size=0.3, random_state=42)

# Create DMatrix for training and validation
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dval = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)

best_params = {
    'max_depth': int(optimizer.max['params']['max_depth']),
    'gamma': optimizer.max['params']['gamma'],
    'colsample_bytree': optimizer.max['params']['colsample_bytree'],
    'subsample': optimizer.max['params']['subsample'],
    'eta': optimizer.max['params']['eta'],
    'lambda': optimizer.max['params']['reg_lambda'],
    'alpha': optimizer.max['params']['reg_alpha'],
    'min_child_weight': optimizer.max['params']['min_child_weight'],
    'objective': 'multi:softprob',
    'num_class': 5,
    'eval_metric': 'mlogloss',
    'verbosity': 0,
    'seed': 42
}

# Container for evaluation results
evals_result = {}

# Train the model
bst = xgb.train(best_params, dtrain, num_boost_round,
                    evals=[(dtrain, 'train'), (dval, 'val')],
                    evals_result=evals_result,
                    early_stopping_rounds=10,
                    verbose_eval=True)

# Retrieve the last evaluation metric for both train and val sets
train_last_eval = evals_result['train']['mlogloss'][-1]
val_last_eval = evals_result['val']['mlogloss'][-1]

print(f"Training Multiclass Logarithmic Loss: {train_last_eval}")
print(f"Validation Multiclass Logarithmic Loss: {val_last_eval}")

[0]	train-mlogloss:1.45452	val-mlogloss:1.45574
[1]	train-mlogloss:1.34216	val-mlogloss:1.34513
[2]	train-mlogloss:1.24903	val-mlogloss:1.25491
[3]	train-mlogloss:1.17460	val-mlogloss:1.18179
[4]	train-mlogloss:1.11779	val-mlogloss:1.12587
[5]	train-mlogloss:1.08120	val-mlogloss:1.09083
[6]	train-mlogloss:1.04510	val-mlogloss:1.05597
[7]	train-mlogloss:1.01296	val-mlogloss:1.02575
[8]	train-mlogloss:0.98879	val-mlogloss:1.00312
[9]	train-mlogloss:0.96552	val-mlogloss:0.98203
[10]	train-mlogloss:0.95327	val-mlogloss:0.97142
[11]	train-mlogloss:0.93943	val-mlogloss:0.95908
[12]	train-mlogloss:0.92721	val-mlogloss:0.94860
[13]	train-mlogloss:0.91389	val-mlogloss:0.93703
[14]	train-mlogloss:0.90490	val-mlogloss:0.92978
[15]	train-mlogloss:0.89559	val-mlogloss:0.92180
[16]	train-mlogloss:0.88847	val-mlogloss:0.91621
[17]	train-mlogloss:0.88148	val-mlogloss:0.91080
[18]	train-mlogloss:0.87510	val-mlogloss:0.90559
[19]	train-mlogloss:0.86918	val-mlogloss:0.90067
[20]	train-mlogloss:0.86285	va

## xgboost train

In [None]:
# evals_result = {}
# bst = xgb.train(params, dtrain, num_boost_round, 
#                 evals=[(dtrain, 'train')], evals_result=evals_result, 
#                 verbose_eval=False)
# print(f"Training Multiclass Logarithmic Loss: {evals_result['train']['mlogloss'][-1]}")
# 
# y_test_probs = bst.predict(dtest)
# 
# class_order = [0, 1, 2, 3, 4]
# class_mapping = {class_label: f"Class_{class_label}" for class_label in class_order}
# 
# y_train_probs = bst.predict(dtrain)
# val_log_loss = log_loss(df_y, y_train_probs, labels=class_order)
# print(f"Validation Multiclass Logarithmic Loss: {val_log_loss}")

## Generate Submission csv

In [15]:
# submission_df = pd.DataFrame(y_test_probs, columns=class_mapping.values())
# submission_df.columns = ['no answer', 'very important', 'quite important', 'not important', 'not at all important']
# submission_df.insert(0, 'id', df_test.index)
# 
# # Save the submission file
# submission_file = ('test_submission.csv')
# submission_df.to_csv(submission_file, index=False)