In [26]:
from EDA_script import *
# train_x_raw = pd.read_csv("../01-Data/X_train.csv", low_memory = True, index_col=0)
# train_y_raw = pd.read_csv("../01-Data/y_train.csv", low_memory = True, index_col=0)
# test_x_raw = pd.read_csv("../01-Data/X_test.csv", low_memory=True, index_col=0)

# df_train = pd.DataFrame(train_x_raw)
# df_test = pd.DataFrame(test_x_raw)
# df_y = pd.DataFrame(train_y_raw)

In [None]:
[]

In [27]:
## add to EDA
for column in set(df_train.columns) - set(df_test.columns):
    df_test[column] = 0

df_test = df_test[df_train.columns]

# Simple Model Run
##  xgBoost model set up

In [28]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

# Remap labels
label_mapping = {-1: 0, 1: 1, 2: 2, 3: 3, 4: 4}
df_y = df_y.replace(label_mapping)

## Bayesian Optimization

In [5]:
def xgb_cv_score(max_depth, gamma, colsample_bytree, subsample, eta, reg_lambda, reg_alpha, min_child_weight):
    """
    Computes the cross-validated log loss for given hyperparameter settings using Stratified K-Fold.
    """
    params = {
        'device': 'cuda',
        'max_depth': int(max_depth),
        'gamma': gamma,
        'colsample_bytree': colsample_bytree,
        'subsample': subsample,
        'eta': eta,
        'objective': 'multi:softprob',
        'num_class': 5,
        'eval_metric': 'mlogloss',
        'lambda': reg_lambda,
        'alpha': reg_alpha,
        'min_child_weight': min_child_weight,
        'verbosity': 0,
        'seed': 42
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    log_loss_scores = []

    for train_index, test_index in skf.split(df_train, df_y):
        xgb_train = xgb.DMatrix(df_train.iloc[train_index], label=df_y.iloc[train_index])
        xgb_valid = xgb.DMatrix(df_train.iloc[test_index], label=df_y.iloc[test_index])
        
        watchlist = [(xgb_train, 'train'), (xgb_valid, 'eval')]

        # Add early_stopping_rounds
        model = xgb.train(params, xgb_train, num_boost_round=500, evals=watchlist, early_stopping_rounds=10, verbose_eval=False)

        # Predict using the best iteration
        preds = model.predict(xgb_valid)
        #preds = model.predict(xgb_valid, ntree_limit=(model.best_iteration + 1) * params['num_class']

        log_loss_score = log_loss(df_y.iloc[test_index], preds, labels=list(range(5)))
        log_loss_scores.append(log_loss_score)

    return -np.mean(log_loss_scores)

# Define the hyperparameter bounds
pbounds = {
    'max_depth': (3, 10),
    'gamma': (0, 1),
    'colsample_bytree': (0.3, 0.9),
    'subsample': (0.3, 0.9),
    'eta': (0.01, 0.3),
    'reg_lambda': (1, 5),
    'reg_alpha': (0, 1),
    'min_child_weight': (1, 6),
}

# Perform Bayesian Optimization
optimizer = BayesianOptimization(f=xgb_cv_score, pbounds=pbounds, random_state=42, verbose=2)
optimizer.maximize(init_points=10, n_iter=20)

|   iter    |  target   | colsam... |    eta    |   gamma   | max_depth | min_ch... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.8706  [0m | [0m0.5247   [0m | [0m0.2857   [0m | [0m0.732    [0m | [0m7.191    [0m | [0m1.78     [0m | [0m0.156    [0m | [0m1.232    [0m | [0m0.8197   [0m |
| [0m2        [0m | [0m-0.888   [0m | [0m0.6607   [0m | [0m0.2153   [0m | [0m0.02058  [0m | [0m9.789    [0m | [0m5.162    [0m | [0m0.2123   [0m | [0m1.727    [0m | [0m0.41     [0m |
| [95m3        [0m | [95m-0.8598  [0m | [95m0.4825   [0m | [95m0.1622   [0m | [95m0.4319   [0m | [95m5.039    [0m | [95m4.059    [0m | [95m0.1395   [0m | [95m2.169    [0m | [95m0.5198   [0m |
| [0m4        [0m | [0m-0.876   [0m | [0m0.5736   [0m | [0m0.2377   [0m | [0m0.1997   [0m | [0m6.6      [0m | [0m3.962    [0m 

## CV xgboost train with best parameters

In [30]:
X_train, X_val, y_train, y_val = train_test_split(df_train, df_y, test_size=0.3, random_state=42)

# Best parameters from optimization
best_params = {
    'max_depth': int(optimizer.max['params']['max_depth']),
    'gamma': optimizer.max['params']['gamma'],
    'colsample_bytree': optimizer.max['params']['colsample_bytree'],
    'subsample': optimizer.max['params']['subsample'],
    'eta': optimizer.max['params']['eta'],
    'lambda': optimizer.max['params']['reg_lambda'],
    'alpha': optimizer.max['params']['reg_alpha'],
    'min_child_weight': optimizer.max['params']['min_child_weight'],
    'objective': 'multi:softprob',
    'num_class': 5,
    'eval_metric': 'mlogloss',
    'verbosity': 0,
    'seed': 42
}

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

evals_result = {}
bst = xgb.train(best_params, dtrain, num_boost_round=1000, evals=[(dtrain, 'train'), (dval, 'val')],
                early_stopping_rounds=10, evals_result=evals_result, verbose_eval=True)

# Evaluate and print the final training and validation loss
train_last_eval = evals_result['train']['mlogloss'][-1]
val_last_eval = evals_result['val']['mlogloss'][-1]

print(f"Training Multiclass Logarithmic Loss: {train_last_eval}")
print(f"Validation Multiclass Logarithmic Loss: {val_last_eval}")

KeyboardInterrupt: 

## Generate Submission csv

In [33]:
optimizer.max['params']

{'colsample_bytree': 0.6280261676059677,
 'eta': 0.06360779210240283,
 'gamma': 0.9695846277645586,
 'max_depth': 8.425929763527801,
 'min_child_weight': 5.697494707820946,
 'reg_alpha': 0.8948273504276488,
 'reg_lambda': 3.3915999152443406,
 'subsample': 0.8531245410138701}

In [34]:

X_train, X_val, y_train, y_val = train_test_split(df_train, df_y, test_size=0.3, random_state=42)

adjusted_params = {
    'colsample_bytree': 0.6,  # Slightly lower to increase regularization
    'device': 'cuda',         # Keeping it as is for GPU acceleration
    'gamma': 3.5,               # Lowered to reduce overfitting by making the algorithm conservative
    'learning_rate': 0.05,    # Lowered for finer steps towards convergence
    'max_depth': 7,           # Slightly reduced to control complexity and overfitting
    'n_estimators': 1000,     # Increased to compensate for the lower learning rate
    'subsample': 0.8,         # Increased to use a bit more data for each tree
    'min_child_weight': 5,    # Lowered to consider splits with fewer samples
    'alpha': 1.5,               # Slightly increased L1 regularization
    'lambda': 2.5,              # Slightly reduced L2 regularization to balance with L1
    'objective': 'multi:softprob',  # No change, appropriate for multi-class classification
    'num_class': 5,           # Ensure this matches your actual number of classes
    'eval_metric': 'mlogloss',  # No change, as we're focusing on log loss
}


dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

evals_result = {}
bst = xgb.train(adjusted_params, dtrain, num_boost_round=800, evals=[(dtrain, 'train'), (dval, 'val')],
                early_stopping_rounds=10, evals_result=evals_result, verbose_eval=True)

# Evaluate and print the final training and validation loss
train_last_eval = evals_result['train']['mlogloss'][-1]
val_last_eval = evals_result['val']['mlogloss'][-1]

print(f"Training Multiclass Logarithmic Loss: {train_last_eval}")
print(f"Validation Multiclass Logarithmic Loss: {val_last_eval}")

[0]	train-mlogloss:1.57397	val-mlogloss:1.57570
[1]	train-mlogloss:1.53936	val-mlogloss:1.54304
[2]	train-mlogloss:1.50420	val-mlogloss:1.50996
[3]	train-mlogloss:1.47389	val-mlogloss:1.48096
[4]	train-mlogloss:1.44335	val-mlogloss:1.45206
[5]	train-mlogloss:1.41496	val-mlogloss:1.42526
[6]	train-mlogloss:1.38946	val-mlogloss:1.40130
[7]	train-mlogloss:1.36431	val-mlogloss:1.37772
[8]	train-mlogloss:1.33988	val-mlogloss:1.35509
[9]	train-mlogloss:1.31778	val-mlogloss:1.33462
[10]	train-mlogloss:1.29665	val-mlogloss:1.31510
[11]	train-mlogloss:1.27684	val-mlogloss:1.29695
[12]	train-mlogloss:1.25763	val-mlogloss:1.27888
[13]	train-mlogloss:1.24071	val-mlogloss:1.26295
[14]	train-mlogloss:1.22339	val-mlogloss:1.24713
[15]	train-mlogloss:1.20807	val-mlogloss:1.23296
[16]	train-mlogloss:1.19401	val-mlogloss:1.22009
[17]	train-mlogloss:1.18004	val-mlogloss:1.20713
[18]	train-mlogloss:1.16659	val-mlogloss:1.19457
[19]	train-mlogloss:1.15283	val-mlogloss:1.18187
[20]	train-mlogloss:1.14044	va

In [35]:
dtest = xgb.DMatrix(df_test, enable_categorical=True)
y_test_probs = bst.predict(dtest)
class_order = [0, 1, 2, 3, 4]
class_mapping = {class_label: f"Class{class_label}" for class_label in class_order}

submission_df = pd.DataFrame(y_test_probs, columns=class_mapping.values())
submission_df.columns = ['no answer', 'very important', 'quite important', 'not important', 'not at all important']
submission_df.insert(0, 'id', df_test.index)

# Save the submission file
submission_file = ('plzz_submission.csv')
submission_df.to_csv(submission_file, index=False)