In this notebook, we will train a first-level XGBoost model using descriptive text features, FastText discourse embeddings, and TF-IDF + UMAP essay embeddings obtained during the preparation stage.

We will then apply the model to generate first-level predictions for the holdout and test sets. These predictions will serve as one of the features for training a second-level model.

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [2]:
# Mount Google Drive (Colab only) so the project folders under /content/gdrive become readable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
import json
import pickle
import os
import joblib

import pandas as pd
import numpy as np

from sklearn.metrics import precision_score, recall_score, f1_score, log_loss
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
import optuna

from tqdm import tqdm
from collections import Counter

In [5]:
# Root project folder on the mounted Drive; the sub-folder names below are appended to it.
BASIC_PATH = '/content/gdrive/MyDrive/ML/projects/feedback-prize/'
# Sub-folder the XGBoost model file is loaded from.
MODEL_PATH = '1st_level_models/'
# Sub-folder holding the pre-transformed train/val/holdout/test CSV files read by this notebook.
SAVE_TRANSFORMED_DATASETS = '1st_level_transformed_data/'
# Sub-folder targeted by the (currently disabled) export of the datasets with first-level predictions.
SAVE_DATASETS_FOLDER = '1st_level_preds/'

Load the preprocessed datasets for training.

In [6]:
# All four pre-transformed splits live in the same folder on Drive.
transformed_dir = BASIC_PATH + SAVE_TRANSFORMED_DATASETS

train_df = pd.read_csv(transformed_dir + 'train_df_transformed.csv')
val_df = pd.read_csv(transformed_dir + 'val_df_transformed.csv')
holdout_data = pd.read_csv(transformed_dir + 'holdout_data_transformed.csv')
test_data = pd.read_csv(transformed_dir + 'test_data_transformed.csv')

In [7]:
# Preview the first rows to sanity-check the loaded columns.
train_df.head()

Unnamed: 0,discourse_id,essay_id,discourse_type,target,discourse_len,discourse_num_long_words,discourse_num_short_words,discourse_noun_count,discourse_adj_count,discourse_pnoun_count,...,umap_emb_54,umap_emb_55,umap_emb_56,umap_emb_57,umap_emb_58,umap_emb_59,umap_emb_60,umap_emb_61,umap_emb_62,umap_emb_63
0,ed3a833a2f49,013B9AA6B9DB,Lead,0,28,0.0,0.5,0.166667,0.0,0.166667,...,5.197482,4.790548,5.558707,4.091189,3.953247,3.980204,4.616942,6.128882,4.579642,4.220257
1,c2203a58aa5c,013B9AA6B9DB,Position,0,101,0.1,0.65,0.1,0.0,0.0,...,5.197482,4.790548,5.558707,4.091189,3.953247,3.980204,4.616942,6.128882,4.579642,4.220257
2,804266501124,013B9AA6B9DB,Evidence,0,190,0.108108,0.567568,0.243243,0.054054,0.027027,...,5.197482,4.790548,5.558707,4.091189,3.953247,3.980204,4.616942,6.128882,4.579642,4.220257
3,9e0d25faca07,013B9AA6B9DB,Counterclaim,0,65,0.153846,0.307692,0.153846,0.076923,0.076923,...,5.197482,4.790548,5.558707,4.091189,3.953247,3.980204,4.616942,6.128882,4.579642,4.220257
4,7a01d9cb379a,013B9AA6B9DB,Rebuttal,2,16,0.0,1.0,0.0,0.0,0.0,...,5.197482,4.790548,5.558707,4.091189,3.953247,3.980204,4.616942,6.128882,4.579642,4.220257


In [8]:
# Label column the model is trained to predict.
TARGET = 'target'
# Non-feature columns (identifiers plus the label) removed so that only features reach the model.
COLS_TO_DROP = ['discourse_id', 'essay_id', TARGET]
# Columns cast to the pandas 'category' dtype before building the XGBoost inputs.
CAT_FEATURES = ['discourse_type']

Preprocess categorical columns for the model.

In [9]:
# Derive one categorical dtype per feature from the training split and apply it to every split.
# Casting each split separately lets pandas infer the categories per frame, so a level missing
# from one split would shift the integer codes XGBoost sees between training and inference.
# Levels that never occur in training become NaN in the other splits.
for col in CAT_FEATURES:
    cat_dtype = pd.CategoricalDtype(categories = sorted(train_df[col].dropna().unique()))
    for frame in (train_df, val_df, holdout_data, test_data):
        frame[col] = frame[col].astype(cat_dtype)

Prepare datasets for XGBoost and set the class weights.

In [10]:
# Per-row sample weight keyed by target class; mapped onto the label column when the
# training and validation DMatrix objects are built.
class_weights = {0: 1, 1: 1.25, 2: 1.75}

In [11]:
def _build_dmatrix(frame, labelled):
    """Wrap the feature columns of `frame` in an xgb.DMatrix.

    When `labelled` is true the target column is attached as the label and each
    row is weighted by `class_weights` according to its class; otherwise only
    the features are passed (prediction-only matrix).
    """
    features = frame.drop(COLS_TO_DROP, axis = 1)
    if not labelled:
        return xgb.DMatrix(features, enable_categorical = True)
    targets = frame[TARGET]
    return xgb.DMatrix(features,
                       label = targets,
                       enable_categorical = True,
                       weight = targets.map(class_weights))

# Labelled, class-weighted matrices for fitting and early stopping.
dtrain = _build_dmatrix(train_df, labelled = True)
dval = _build_dmatrix(val_df, labelled = True)

# Feature-only matrices for generating first-level predictions.
holdout = _build_dmatrix(holdout_data, labelled = False)
test = _build_dmatrix(test_data, labelled = False)

Configure Optuna optimization and initiate the search for optimal hyperparameters.

In [12]:
def objective(trial):
    """Optuna objective: fit XGBoost with sampled hyperparameters and score it.

    Samples tree-shape, sampling and regularisation parameters from `trial`,
    trains on `dtrain` with early stopping monitored on `dval`, and returns
    the booster's `best_score` (validation mlogloss), which the study minimises.
    """
    # Draw the searched hyperparameters up front, in a fixed order.
    depth = trial.suggest_int('max_depth', 4, 12)
    child_weight = trial.suggest_int('min_child_weight', 6, 30)
    col_fraction = trial.suggest_float('colsample_bytree', 0.5, 1.0, step = 0.1)
    row_fraction = trial.suggest_float('subsample', 0.5, 1.0, step = 0.1)
    split_gamma = trial.suggest_float('gamma', 0, 5)
    l1_reg = trial.suggest_float('alpha', 0, 3)
    l2_reg = trial.suggest_float('lambda', 0, 3)

    # Everything not sampled above is held constant across trials.
    candidate_params = {
        'objective': 'multi:softprob',
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'max_depth': depth,
        'min_child_weight': child_weight,
        'learning_rate': 0.01,
        'colsample_bytree': col_fraction,
        'subsample': row_fraction,
        'gamma': split_gamma,
        'alpha': l1_reg,
        'lambda': l2_reg,
        'tree_method': 'hist',
        'random_state': 97,
        'verbosity': 1,
        'device': 'cuda'
    }

    booster = xgb.train(params = candidate_params,
                        dtrain = dtrain,
                        evals = [(dval, 'validation')],
                        num_boost_round = 2000,
                        early_stopping_rounds = 20,
                        verbose_eval = False)

    return booster.best_score

In [13]:
# The objective returns a validation loss, so the study searches for the minimum.
study = optuna.create_study(direction = 'minimize', study_name = 'XGBoost parameters')

[I 2024-10-06 05:24:16,094] A new study created in memory with name: XGBoost parameters


In [14]:
def callback(study, trial):
    """Optuna per-trial callback: advance the module-level progress bar `pbar` by one step.

    Both arguments are supplied by Optuna and are not used here.
    """
    pbar.update(1)

In [15]:
N_trials = 200
# Custom tqdm layout for the progress bar.
progress_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'

# `pbar` has to be a module-level name: the `callback` function advances it after each trial.
with tqdm(total = N_trials,
          desc = "Optuna Optimization",
          dynamic_ncols = True,
          bar_format = progress_format) as pbar:
    study.optimize(objective, n_trials = N_trials, callbacks = [callback])

Optuna Optimization:   0%|          | 0/200 [00:00<?][I 2024-10-06 05:25:20,564] Trial 0 finished with value: 0.7155000427302075 and parameters: {'max_depth': 7, 'min_child_weight': 17, 'colsample_bytree': 1.0, 'subsample': 0.7, 'gamma': 1.665081507430091, 'alpha': 2.241160706365798, 'lambda': 2.825595615881582}. Best is trial 0 with value: 0.7155000427302075.
Optuna Optimization:   0%|          | 1/200 [00:42<2:20:49][I 2024-10-06 05:25:52,503] Trial 1 finished with value: 0.7165400737564436 and parameters: {'max_depth': 8, 'min_child_weight': 26, 'colsample_bytree': 0.7, 'subsample': 0.8, 'gamma': 3.829626175270637, 'alpha': 2.323942974623124, 'lambda': 2.8426622304122864}. Best is trial 0 with value: 0.7155000427302075.
Optuna Optimization:   1%|          | 2/200 [01:14<1:59:41][I 2024-10-06 05:26:24,997] Trial 2 finished with value: 0.7173134049599011 and parameters: {'max_depth': 6, 'min_child_weight': 18, 'colsample_bytree': 0.7, 'subsample': 0.9, 'gamma': 1.319417712101073, 'alp

In [16]:
# Summarise the best trial: its objective value followed by one line per tuned hyperparameter.
print("Best trial:")
trial = study.best_trial

summary_lines = [f"  Value: {trial.value}", "  Params:"]
summary_lines += [f"    {name}: {setting}" for name, setting in trial.params.items()]
print("\n".join(summary_lines))

Best trial:
  Value: 0.711005503694219
  Params:
    max_depth: 9
    min_child_weight: 19
    colsample_bytree: 0.6
    subsample: 0.7
    gamma: 3.227614368838558
    alpha: 0.1381184475569871
    lambda: 0.6671661482124529


Train the XGBoost model using the best hyperparameters found by Optuna, then save the trained model for future use.

In [17]:
# Final training configuration. The tuned values (max_depth, min_child_weight, colsample_bytree,
# subsample, gamma, alpha, lambda) are hard-coded copies of the best Optuna trial printed above,
# so this cell can be run without repeating the search.
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'max_depth': 9,
    'min_child_weight': 19,
    'learning_rate': 0.01,
    'colsample_bytree': 0.6,
    'subsample': 0.7,
    'gamma': 3.227614368838558,
    'alpha': 0.1381184475569871,
    'lambda': 0.6671661482124529,
    'tree_method': 'hist',
    'random_state': 97,
    'verbosity': 1,
    'device': "cuda"
    }

# Alternative parameter set without gamma/alpha/lambda, kept for reference (inactive).
# xgb_params = {
#     'objective': 'multi:softprob',
#     'num_class': 3,
#     'eval_metric': 'mlogloss',
#     'max_depth': 6,
#     'min_child_weight': 12,
#     'learning_rate': 0.01,
#     'colsample_bytree': 0.6,
#     'subsample': 0.8,
#     'tree_method': 'hist',
#     'random_state': 97,
#     'verbosity': 1,
#     'device': "cuda"
#     }

# Both splits are evaluated so train and validation loss can be compared in the log (printed
# every 100 rounds); training runs for at most 2000 rounds with a 20-round early-stopping patience.
evals = [(dtrain, 'train'), (dval, 'validation')]
model = xgb.train(params = xgb_params,
                  dtrain = dtrain,
                  evals = evals,
                  num_boost_round = 2000,
                  early_stopping_rounds = 20,
                  verbose_eval = 100)

[0]	train-mlogloss:1.09374	validation-mlogloss:1.09471
[100]	train-mlogloss:0.78656	validation-mlogloss:0.86872
[200]	train-mlogloss:0.65126	validation-mlogloss:0.78695
[300]	train-mlogloss:0.57729	validation-mlogloss:0.75054
[400]	train-mlogloss:0.53113	validation-mlogloss:0.73245
[500]	train-mlogloss:0.49989	validation-mlogloss:0.72318
[600]	train-mlogloss:0.47744	validation-mlogloss:0.71787
[700]	train-mlogloss:0.46106	validation-mlogloss:0.71492
[800]	train-mlogloss:0.44902	validation-mlogloss:0.71302
[900]	train-mlogloss:0.44079	validation-mlogloss:0.71188
[1000]	train-mlogloss:0.43490	validation-mlogloss:0.71136
[1100]	train-mlogloss:0.42996	validation-mlogloss:0.71103
[1106]	train-mlogloss:0.42980	validation-mlogloss:0.71104


In [18]:
# Persist the freshly trained model only when no saved model exists yet, so the load in the
# next cell works on a fresh environment without overwriting a previously saved model.
# Delete the file (or call model.save_model unconditionally) to replace it with a new fit.
model_dir = BASIC_PATH + MODEL_PATH
if not os.path.exists(model_dir + 'xgboost_model.json'):
    os.makedirs(model_dir, exist_ok = True)
    model.save_model(model_dir + 'xgboost_model.json')

In [19]:
# Predictions below come from the model stored on Drive, not from the in-memory `model` object.
saved_model_file = BASIC_PATH + MODEL_PATH + 'xgboost_model.json'

xgb_model = xgb.Booster()
xgb_model.load_model(saved_model_file)

In [20]:
# Class probabilities for the holdout split; the predicted label is the most probable class.
holdout_probs = xgb_model.predict(holdout)
holdout_preds = np.argmax(holdout_probs, axis = -1)
# Distribution of predicted classes.
Counter(holdout_preds)

Counter({0: 3686, 2: 722, 1: 1513})

In [21]:
# Class probabilities for the test split; the predicted label is the most probable class.
test_probs = xgb_model.predict(test)
test_preds = np.argmax(test_probs, axis = -1)
# Distribution of predicted classes.
Counter(test_preds)

Counter({0: 4524, 2: 921, 1: 1937})

In [22]:
def _print_metrics(title, frame, probs, preds):
    """Print log loss plus macro-averaged precision, recall and F1 against frame[TARGET].

    `probs` are the predicted class probabilities (used for the loss) and
    `preds` the predicted class labels (used for the other three scores).
    """
    print(title)
    print(f"Loss: {log_loss(frame[TARGET], probs)}")
    for label, scorer in (('Precision', precision_score), ('Recall', recall_score), ('F1', f1_score)):
        print(f"{label}: {scorer(frame[TARGET], preds, average = 'macro')}")

_print_metrics('Holdout metrics:', holdout_data, holdout_probs, holdout_preds)

print('='*50)

_print_metrics('Test metrics:', test_data, test_probs, test_preds)

Holdout metrics:
Loss: 0.676053684247363
Precision: 0.6661420733276419
Recall: 0.6303884310539692
F1: 0.6426847596681292
Test metrics:
Loss: 0.6884532609462317
Precision: 0.6384198976741872
Recall: 0.6133063349087662
F1: 0.6217734187893583


We will retain some of the key text features and append the XGBoost model predictions to the holdout and test datasets. The updated datasets will then be saved for future use in training a second-level model.

In [24]:
# Collect every column whose name contains 'emb' so it can be removed before export.
# NOTE(review): this rebinds COLS_TO_DROP, which the DMatrix-building cell also reads; re-running
# that cell after this one would use the embedding list instead of the id/target columns.
COLS_TO_DROP = [col for col in holdout_data.columns if 'emb' in col]

In [26]:
# Attach the predicted class labels (argmax of the class probabilities, not the probabilities
# themselves) as a new feature column.
holdout_data['1st_level_xgb_preds'] = holdout_preds
test_data['1st_level_xgb_preds'] = test_preds

In [27]:
# Remove the columns listed in COLS_TO_DROP. Rebinding (instead of inplace = True) together with
# errors = 'ignore' keeps the cell re-runnable: a second run no longer raises KeyError because
# the columns are already gone.
holdout_data = holdout_data.drop(columns = COLS_TO_DROP, errors = 'ignore')
test_data = test_data.drop(columns = COLS_TO_DROP, errors = 'ignore')

In [32]:
# Export is currently disabled: uncomment both lines to write the holdout and test frames
# (without the DataFrame index) into SAVE_DATASETS_FOLDER.
# holdout_data.to_csv(BASIC_PATH+SAVE_DATASETS_FOLDER+'holdout_1st_level_xgb_preds.csv', index = False)
# test_data.to_csv(BASIC_PATH+SAVE_DATASETS_FOLDER+'test_1st_level_xgb_preds.csv', index = False)