# XGB kaggle comp

In [1]:
# Load basic libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statistics as stats
import time

# XGB libraries
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from hyperopt import STATUS_OK
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
from sklearn.preprocessing import StandardScaler


# Import data
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
sample_sub =  pd.read_csv("data/sample_submission.csv")

In [2]:
# Remove NA column from training data
train_df = train_df.drop(columns='Unnamed: 12')

# Assign features
X = train_df.drop(columns='DIC')
y = train_df['DIC']

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=808) 

# Scale the data
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X.columns)

In [8]:
# Determine best number of trees using early stopping
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1, 
    early_stopping_rounds=50, 
    eval_metric="rmse",  # Use RMSE for regression
    random_state=808)

# Fit model
xgb.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Print best number of trees
best_ntrees = xgb.best_iteration
print(f"Best number of trees {best_ntrees}")

Best number of trees 305


In [None]:
# Initialize second XGB to tune learning rate
xgb2 = XGBRegressor(
    n_estimators=best_ntrees,
    early_stopping_rounds=50, 
    eval_metric="rmse", 
    random_state=808)

# Fit model
xgb2.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Create a parameter distribution for learning rate
param_dist = {
    "learning_rate": uniform(0.01, 0.3), 
}

# Set up RandomizedSearchCV
rs = RandomizedSearchCV(
    xgb2, param_dist, n_iter=20, scoring='neg_root_mean_squared_error', 
    cv=3, verbose=0, random_state=808, n_jobs=8
)

# Fit random search
rs.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Print best number of learners
best_lr = rs.best_params_['learning_rate']
print(f"Best learning rate: {best_lr:.4f}")

Best learning rate: 0.1561


In [12]:
# Tune tree specific parameters
xgb3 = XGBRegressor(
    n_estimators = best_ntrees,
    learning_rate = best_lr, 
    early_stopping_rounds=50, 
    eval_metric="rmse", 
    random_state=808)

# Fit model
xgb3.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Second param dist
param_dist2 = {
    "max_depth": randint(3, 10), 
    "min_child_weight": randint(1, 10),
    "gamma": uniform(0.05, 0.05)
}

# Set up RandomizedSearchCV
rs2 = RandomizedSearchCV(
    xgb3, param_dist2, 
    n_iter=20, scoring='neg_root_mean_squared_error', 
    cv=3, verbose=False, random_state=808, n_jobs=10
)

# Run random search
rs2.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Print best tree parameters
best_tree_params = rs2.best_params_
print(f"Best tree parameters: {best_tree_params}")

Best tree parameters: {'gamma': 0.06711966552658265, 'max_depth': 5, 'min_child_weight': 3}


In [13]:
# Tune stochastic components
xgb4 = XGBRegressor(
    n_estimators=best_ntrees,
    learning_rate=best_lr,
    **best_tree_params, 
    early_stopping_rounds=50, 
    eval_metric="rmse", 
    random_state=808)

# Fit model
xgb4.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Third param dist
param_dist3 = {
    "subsample": uniform(0.5, 0.5),
    "colsample_bytree": uniform(0.5, 0.5) 
}

# Set up RandomizedSearchCV
rs3 = RandomizedSearchCV(
    xgb4, param_dist3, n_iter=20, scoring='neg_root_mean_squared_error', 
    cv=3, verbose=False, random_state=808, n_jobs=10
)

# Run random search
rs3.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Print best stochastic parameters
best_stochastic_params = rs3.best_params_
print(f"Best stochastic parameters: {best_stochastic_params}")

Best stochastic parameters: {'colsample_bytree': 0.6673574366507202, 'subsample': 0.8046025228990683}


In [3]:
test_df.columns

Index(['id', 'Lat_Dec', 'Lon_Dec', 'NO2uM', 'NO3uM', 'NH3uM', 'R_TEMP',
       'R_Depth', 'R_Sal', 'R_DYNHT', 'R_Nuts', 'R_Oxy_micromol.Kg', 'PO4uM',
       'SiO3uM', 'TA1', 'Salinity1', 'Temperature_degC'],
      dtype='object')

In [None]:


# Initialize fifth XGB model
xgb5 = XGBRegressor(
    n_estimators=best_ntrees,
    learning_rate=best_lr,
    **best_tree_params, 
    **best_stochastic_params, 
    early_stopping_rounds=50, 
    eval_metric="rmse", 
    random_state=808)

# Fit model
xgb5.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)




Mean Squared Error (MSE): 47.4414
Root Mean Squared Error (RMSE): 6.8878


In [None]:
# Fix column name error
test_df = test_df.rename(columns={'TA1':'TA1.x'})


In [25]:
# Prep testing data
X_test= test_df.copy()
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Predict on validation and testing data
y_pred_val = xgb5.predict(X_val_scaled)
y_pred_test = xgb5.predict(X_test_scaled)

# Calculate Mean Squared Error (MSE)
val_mse = mean_squared_error(y_val, y_pred_val)
test_mse = mean_squared_error(y_val, y_pred_test)

# Calculate Root Mean Squared Error (RMSE)
val_rmse = np.sqrt(val_mse)
test_rmse = np.sqrt(test_mse)

# Print results
print(f"Validation RMSE: {val_rmse:.4f}, MSE: {val_mse:.4f}")
print(f"Test RMSE: {val_rmse:.4f}, MSE: {val_mse:.4f}")


ValueError: Found input variables with inconsistent numbers of samples: [437, 485]

In [17]:

# Get feature importance
feature_importance = pd.DataFrame({'Feature': X_train_scaled.columns, 'Importance': xgb5.feature_importances_})

# Sort by importance
feature_importance = feature_importance.sort_values(by="Importance", ascending=False)
feature_importance

Unnamed: 0,Feature,Importance
12,PO4uM,0.494485
13,SiO3uM,0.370204
11,R_Oxy_micromol.Kg,0.08277
7,R_Depth,0.015209
8,R_Sal,0.011374
14,TA1.x,0.011063
4,NO3uM,0.010024
15,Salinity1,0.003528
6,R_TEMP,0.000303
3,NO2uM,0.000191


## Trying `hyperopt`

[Documentation here](https://hyperopt.github.io/hyperopt/)

In [3]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK


# Define the objective function
def objective(params):
    model = XGBRegressor(
        n_estimators=int(params["n_estimators"]),
        learning_rate=params["learning_rate"],
        max_depth=int(params["max_depth"]),
        min_child_weight=params["min_child_weight"],
        subsample=params["subsample"],
        colsample_bytree=params["colsample_bytree"],
        random_state=808
    )
    
    # Train the model
    model.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], 
            early_stopping_rounds=50, verbose=0)

    # Make predictions
    y_pred = model.predict(X_val_scaled)

    # Compute RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    return {'loss': rmse, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    "n_estimators": hp.quniform("n_estimators", 100, 1000, 10),
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.3),
    "max_depth": hp.quniform("max_depth", 3, 15, 1),
    "min_child_weight": hp.uniform("min_child_weight", 1, 10),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
}

# Run Hyperopt optimization
trials = Trials()
best_params = fmin(
    fn=objective,         # Objective function
    space=space,          # Hyperparameter space
    algo=tpe.suggest,     # Tree Parzen Estimator (TPE) algorithm
    max_evals=50,         # Number of evaluations
    trials=trials,        # Track results
    rstate=np.random.default_rng(808)  # Ensure reproducibility
)

print("Best Hyperparameters:", best_params)


  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]




  2%|▏         | 1/50 [00:01<00:54,  1.12s/trial, best loss: 6.637559696761109]




  4%|▍         | 2/50 [00:01<00:40,  1.19trial/s, best loss: 6.637559696761109]




  6%|▌         | 3/50 [00:02<00:30,  1.56trial/s, best loss: 6.637559696761109]




  8%|▊         | 4/50 [00:02<00:23,  1.98trial/s, best loss: 6.637559696761109]




 10%|█         | 5/50 [00:02<00:18,  2.49trial/s, best loss: 6.637559696761109]




 12%|█▏        | 6/50 [00:02<00:15,  2.88trial/s, best loss: 6.637559696761109]




 14%|█▍        | 7/50 [00:03<00:15,  2.83trial/s, best loss: 6.637559696761109]




 16%|█▌        | 8/50 [00:03<00:13,  3.06trial/s, best loss: 6.637559696761109]




 18%|█▊        | 9/50 [00:03<00:14,  2.90trial/s, best loss: 6.637559696761109]




 22%|██▏       | 11/50 [00:04<00:12,  3.19trial/s, best loss: 6.637559696761109]





 24%|██▍       | 12/50 [00:04<00:12,  3.02trial/s, best loss: 6.637559696761109]




 26%|██▌       | 13/50 [00:05<00:11,  3.20trial/s, best loss: 6.637559696761109]




 28%|██▊       | 14/50 [00:05<00:10,  3.32trial/s, best loss: 6.637559696761109]




 32%|███▏      | 16/50 [00:06<00:09,  3.43trial/s, best loss: 6.637559696761109]





 34%|███▍      | 17/50 [00:06<00:14,  2.30trial/s, best loss: 6.637559696761109]




 36%|███▌      | 18/50 [00:08<00:24,  1.29trial/s, best loss: 6.637559696761109]




 40%|████      | 20/50 [00:08<00:15,  1.91trial/s, best loss: 6.637559696761109]





 42%|████▏     | 21/50 [00:10<00:20,  1.44trial/s, best loss: 6.637559696761109]




 44%|████▍     | 22/50 [00:11<00:24,  1.14trial/s, best loss: 6.637559696761109]




 46%|████▌     | 23/50 [00:11<00:20,  1.33trial/s, best loss: 6.637559696761109]




 48%|████▊     | 24/50 [00:12<00:15,  1.64trial/s, best loss: 6.637559696761109]




 52%|█████▏    | 26/50 [00:13<00:12,  1.97trial/s, best loss: 6.637559696761109]





 54%|█████▍    | 27/50 [00:13<00:11,  2.05trial/s, best loss: 6.637559696761109]




 56%|█████▌    | 28/50 [00:14<00:13,  1.61trial/s, best loss: 6.599128203616501]




 58%|█████▊    | 29/50 [00:14<00:11,  1.78trial/s, best loss: 6.599128203616501]




 60%|██████    | 30/50 [00:15<00:10,  1.91trial/s, best loss: 6.599128203616501]




 62%|██████▏   | 31/50 [00:15<00:10,  1.90trial/s, best loss: 6.599128203616501]




 64%|██████▍   | 32/50 [00:16<00:08,  2.24trial/s, best loss: 6.599128203616501]




 66%|██████▌   | 33/50 [00:16<00:07,  2.31trial/s, best loss: 6.599128203616501]




 68%|██████▊   | 34/50 [00:16<00:06,  2.54trial/s, best loss: 6.599128203616501]




 70%|███████   | 35/50 [00:17<00:08,  1.82trial/s, best loss: 6.599128203616501]




 74%|███████▍  | 37/50 [00:18<00:05,  2.24trial/s, best loss: 6.599128203616501]





 76%|███████▌  | 38/50 [00:21<00:13,  1.14s/trial, best loss: 6.594527372109236]




 78%|███████▊  | 39/50 [00:22<00:13,  1.20s/trial, best loss: 6.594527372109236]




 80%|████████  | 40/50 [00:22<00:09,  1.05trial/s, best loss: 6.594527372109236]




 82%|████████▏ | 41/50 [00:23<00:08,  1.12trial/s, best loss: 6.594527372109236]




 84%|████████▍ | 42/50 [00:24<00:06,  1.29trial/s, best loss: 6.594527372109236]




 86%|████████▌ | 43/50 [00:24<00:04,  1.56trial/s, best loss: 6.594527372109236]




 88%|████████▊ | 44/50 [00:24<00:03,  1.73trial/s, best loss: 6.594527372109236]




 90%|█████████ | 45/50 [00:25<00:02,  2.01trial/s, best loss: 6.594527372109236]




 92%|█████████▏| 46/50 [00:25<00:01,  2.31trial/s, best loss: 6.594527372109236]




 94%|█████████▍| 47/50 [00:26<00:01,  1.72trial/s, best loss: 6.594527372109236]




 96%|█████████▌| 48/50 [00:26<00:00,  2.04trial/s, best loss: 6.594527372109236]




 98%|█████████▊| 49/50 [00:27<00:00,  2.00trial/s, best loss: 6.594527372109236]




100%|██████████| 50/50 [00:28<00:00,  1.78trial/s, best loss: 6.594527372109236]
Best Hyperparameters: {'colsample_bytree': 0.6316001127441728, 'learning_rate': 0.019108004939411134, 'max_depth': 12.0, 'min_child_weight': 1.6338627280091838, 'n_estimators': 720.0, 'subsample': 0.9600428319053428}


Best Hyperparameters: 'colsample_bytree': 0.6316001127441728, 'learning_rate': 0.019108004939411134, 'max_depth': 12.0, 'min_child_weight': 1.6338627280091838, 'n_estimators': 720.0, 'subsample': 0.9600428319053428

In [6]:
# Convert integer-based hyperparameters explicitly
best_params["n_estimators"] = int(best_params["n_estimators"])
best_params["max_depth"] = int(best_params["max_depth"])

# Initialize best hyperopt model
xgb_hyper = XGBRegressor(**best_params, eval_metric='rmse', random_state=808)

# Fit model
xgb_hyper.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

In [8]:
# Predict on validation and testing data
y_pred_val = xgb_hyper.predict(X_val_scaled)

# Calculate Mean Squared Error (MSE)
val_mse = mean_squared_error(y_val, y_pred_val)

# Calculate Root Mean Squared Error (RMSE)
val_rmse = np.sqrt(val_mse)

# Print results
print(f"Validation RMSE: {val_rmse:.4f}, MSE: {val_mse:.4f}")


Validation RMSE: 6.5945, MSE: 43.4878


In [9]:
# Get feature importance
feature_importance = pd.DataFrame({'Feature': X_train_scaled.columns, 'Importance': xgb_hyper.feature_importances_})

# Sort by importance
feature_importance = feature_importance.sort_values(by="Importance", ascending=False)
feature_importance

Unnamed: 0,Feature,Importance
13,SiO3uM,0.59763
12,PO4uM,0.250533
11,R_Oxy_micromol.Kg,0.052185
8,R_Sal,0.033035
14,TA1.x,0.022122
4,NO3uM,0.014712
6,R_TEMP,0.013343
15,Salinity1,0.008189
7,R_Depth,0.00667
16,Temperature_degC,0.000798
