# XGB kaggle comp

In [2]:
# Load basic libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statistics as stats
import time

# XGB libraries
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
from sklearn.preprocessing import StandardScaler


# Import data
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
sample_sub =  pd.read_csv("data/sample_submission.csv")

In [3]:
# Remove NA column from training data
train_df = train_df.drop(columns='Unnamed: 12')

# Assign features
X = train_df.drop(columns='DIC')
y = train_df['DIC']

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=808) 

# Scale the data
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X.columns)

In [4]:
# Determine best number of trees using early stopping
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1, 
    early_stopping_rounds=50, 
    eval_metric="rmse",  # Use RMSE for regression
    random_state=808)

# Fit model
xgb.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Print best number of trees
best_ntrees = xgb.best_iteration
print(f"Best number of trees {best_ntrees}")

Best number of trees 305


In [5]:
# Initialize second XGB to tune learning rate
xgb2 = XGBRegressor(
    n_estimators=best_ntrees,
    early_stopping_rounds=50, 
    eval_metric="rmse", 
    random_state=808)

# Fit model
xgb2.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Create a parameter distribution for learning rate
param_dist = {
    "learning_rate": uniform(0.01, 0.3), 
}

# Set up RandomizedSearchCV
rs = RandomizedSearchCV(
    xgb2, param_dist, n_iter=20, scoring='neg_root_mean_squared_error', 
    cv=3, verbose=0, random_state=808, n_jobs=8
)

# Fit random search
rs.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Print best number of learners
best_lr = rs.best_params_['learning_rate']
print(f"Best learning rate: {best_lr:.4f}")

Best learning rate: 0.1561


In [6]:
# Tune tree specific parameters
xgb3 = XGBRegressor(
    n_estimators = best_ntrees,
    learning_rate = best_lr, 
    early_stopping_rounds=50, 
    eval_metric="rmse", 
    random_state=808)

# Fit model
xgb3.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Second param dist
param_dist2 = {
    "max_depth": randint(3, 10), 
    "min_child_weight": randint(1, 10),
    "gamma": uniform(0.05, 0.05)
}

# Set up RandomizedSearchCV
rs2 = RandomizedSearchCV(
    xgb3, param_dist2, 
    n_iter=20, scoring='neg_root_mean_squared_error', 
    cv=3, verbose=False, random_state=808, n_jobs=10
)

# Run random search
rs2.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Print best tree parameters
best_tree_params = rs2.best_params_
print(f"Best tree parameters: {best_tree_params}")

Best tree parameters: {'gamma': 0.06711966552658265, 'max_depth': 5, 'min_child_weight': 3}


In [7]:
# Tune stochastic components
xgb4 = XGBRegressor(
    n_estimators=best_ntrees,
    learning_rate=best_lr,
    **best_tree_params, 
    early_stopping_rounds=50, 
    eval_metric="rmse", 
    random_state=808)

# Fit model
xgb4.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Third param dist
param_dist3 = {
    "subsample": uniform(0.5, 0.5),
    "colsample_bytree": uniform(0.5, 0.5) 
}

# Set up RandomizedSearchCV
rs3 = RandomizedSearchCV(
    xgb4, param_dist3, n_iter=20, scoring='neg_root_mean_squared_error', 
    cv=3, verbose=False, random_state=808, n_jobs=10
)

# Run random search
rs3.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

# Print best stochastic parameters
best_stochastic_params = rs3.best_params_
print(f"Best stochastic parameters: {best_stochastic_params}")

Best stochastic parameters: {'colsample_bytree': 0.6673574366507202, 'subsample': 0.8046025228990683}


In [8]:
test_df.columns

Index(['id', 'Lat_Dec', 'Lon_Dec', 'NO2uM', 'NO3uM', 'NH3uM', 'R_TEMP',
       'R_Depth', 'R_Sal', 'R_DYNHT', 'R_Nuts', 'R_Oxy_micromol.Kg', 'PO4uM',
       'SiO3uM', 'TA1', 'Salinity1', 'Temperature_degC'],
      dtype='object')

In [9]:
# Initialize fifth XGB model
xgb5 = XGBRegressor(
    n_estimators=best_ntrees,
    learning_rate=best_lr,
    **best_tree_params, 
    **best_stochastic_params, 
    early_stopping_rounds=50, 
    eval_metric="rmse", 
    random_state=808)

# Fit model
xgb5.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)


In [10]:
# Fix column name error
test_df = test_df.rename(columns={'TA1':'TA1.x'})


In [11]:
# Prep testing data
# X_test= test_df.copy()
# X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Predict on validation and testing data
y_pred_val = xgb5.predict(X_val_scaled)
# y_pred_test = xgb5.predict(X_test_scaled)

# Calculate Mean Squared Error (MSE)
val_mse = mean_squared_error(y_val, y_pred_val)
# test_mse = mean_squared_error(y_val, y_pred_test)

# Calculate Root Mean Squared Error (RMSE)
val_rmse = np.sqrt(val_mse)
# test_rmse = np.sqrt(test_mse)

# Print results
print(f"Validation RMSE: {val_rmse:.4f}, MSE: {val_mse:.4f}")
# print(f"Test RMSE: {val_rmse:.4f}, MSE: {val_mse:.4f}")


Validation RMSE: 6.8878, MSE: 47.4414


In [12]:

# Get feature importance
feat_imp = pd.DataFrame({'Feature': X_train_scaled.columns, 'Importance': xgb5.feature_importances_})

# Sort by importance
feat_imp = feat_imp.sort_values(by="Importance", ascending=False)
feat_imp

Unnamed: 0,Feature,Importance
12,PO4uM,0.494485
13,SiO3uM,0.370204
11,R_Oxy_micromol.Kg,0.08277
7,R_Depth,0.015209
8,R_Sal,0.011374
14,TA1.x,0.011063
4,NO3uM,0.010024
15,Salinity1,0.003528
6,R_TEMP,0.000303
3,NO2uM,0.000191


## Trying `hyperopt`

[Documentation here](https://hyperopt.github.io/hyperopt/)

In [13]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK


# Define the objective function
def objective(params):
    model = XGBRegressor(
        n_estimators=int(params["n_estimators"]),
        learning_rate=params["learning_rate"],
        max_depth=int(params["max_depth"]),
        min_child_weight=params["min_child_weight"],
        subsample=params["subsample"],
        colsample_bytree=params["colsample_bytree"],
        early_stopping_rounds=50,
        random_state=808
    )
    
    # Train the model
    model.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

    # Make predictions
    y_pred = model.predict(X_val_scaled)

    # Compute RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    return {'loss': rmse, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    "n_estimators": hp.quniform("n_estimators", 100, 1000, 10),
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.3),
    "max_depth": hp.quniform("max_depth", 3, 15, 1),
    "min_child_weight": hp.uniform("min_child_weight", 1, 10),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
}

# Run Hyperopt optimization
trials = Trials()
best_params = fmin(
    fn=objective,         # Objective function
    space=space,          # Hyperparameter space
    algo=tpe.suggest,     # Tree Parzen Estimator (TPE) algorithm
    max_evals=100,         # Number of evaluations
    trials=trials,        # Track results
    rstate=np.random.default_rng(808)  # Ensure reproducibility
)

print("Best Hyperparameters:", best_params)


100%|██████████| 100/100 [01:10<00:00,  1.42trial/s, best loss: 6.56245927590967]
Best Hyperparameters: {'colsample_bytree': 0.7638975686062371, 'learning_rate': 0.010894357239790119, 'max_depth': 10.0, 'min_child_weight': 1.0325852785384921, 'n_estimators': 960.0, 'subsample': 0.6894939803923603}


Best Hyperparameters: 'colsample_bytree': 0.6316001127441728, 'learning_rate': 0.019108004939411134, 'max_depth': 12.0, 'min_child_weight': 1.6338627280091838, 'n_estimators': 720.0, 'subsample': 0.9600428319053428

In [14]:
# Convert integer-based hyperparameters explicitly
best_params["n_estimators"] = int(best_params["n_estimators"])
best_params["max_depth"] = int(best_params["max_depth"])

# Initialize best hyperopt model
xgb_hyper = XGBRegressor(**best_params, eval_metric='rmse', random_state=808)

# Fit model
xgb_hyper.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

In [15]:
# Predict on validation and testing data
y_pred_hyper = xgb_hyper.predict(X_val_scaled)

# Calculate Mean Squared Error (MSE)
hyper_mse = mean_squared_error(y_val, y_pred_hyper)

# Calculate Root Mean Squared Error (RMSE)
hyper_rmse = np.sqrt(hyper_mse)

# Print results
print(f"Hyperopt Validation RMSE: {hyper_rmse:.4f}, MSE: {hyper_mse:.4f}")
print(f"Randomsearch Validation RMSE: {val_rmse:.4f}, MSE: {val_mse:.4f}")

Hyperopt Validation RMSE: 6.5625, MSE: 43.0659
Randomsearch Validation RMSE: 6.8878, MSE: 47.4414


In [16]:
# Get feature importance
feat_imp_hyper = pd.DataFrame({'Feature': X_train_scaled.columns, 'Importance': xgb_hyper.feature_importances_})

# Sort by importance
feat_imp_hyper = feat_imp_hyper.sort_values(by="Importance", ascending=False)
feat_imp_hyper

Unnamed: 0,Feature,Importance
13,SiO3uM,0.584601
12,PO4uM,0.314574
11,R_Oxy_micromol.Kg,0.037159
4,NO3uM,0.020441
14,TA1.x,0.018135
8,R_Sal,0.012078
15,Salinity1,0.009039
7,R_Depth,0.002625
3,NO2uM,0.00043
16,Temperature_degC,0.000219


In [17]:
feat_imp

Unnamed: 0,Feature,Importance
12,PO4uM,0.494485
13,SiO3uM,0.370204
11,R_Oxy_micromol.Kg,0.08277
7,R_Depth,0.015209
8,R_Sal,0.011374
14,TA1.x,0.011063
4,NO3uM,0.010024
15,Salinity1,0.003528
6,R_TEMP,0.000303
3,NO2uM,0.000191


## Trying hyperopt again but tuning more parameters

In [18]:
# Define the objective function
def objective(params):
    model = XGBRegressor(
        n_estimators=int(params["n_estimators"]),
        learning_rate=params["learning_rate"],
        max_depth=int(params["max_depth"]),
        min_child_weight=params["min_child_weight"],
        subsample=params["subsample"],
        colsample_bytree=params["colsample_bytree"],
        gamma=params['gamma'],
        reg_alpha=params['reg_alpha'],
        reg_lambda=params['reg_lambda'],
        early_stopping_rounds=50,
        random_state=808
    )
    
    # Train the model
    model.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

    # Make predictions
    y_pred = model.predict(X_val_scaled)

    # Compute RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    return {'loss': rmse, 'status': STATUS_OK}

space = {
    "n_estimators": hp.quniform("n_estimators", 100, 1200, 10),
    "learning_rate": hp.uniform("learning_rate", 0.005, 0.3),
    "max_depth": hp.quniform("max_depth", 3, 20, 1),
    "min_child_weight": hp.uniform("min_child_weight", 1, 10),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
    "gamma": hp.uniform("gamma", 0, 10),  # Add gamma
    "reg_alpha": hp.uniform("reg_alpha", 0, 1),  # L1 regularization
    "reg_lambda": hp.uniform("reg_lambda", 0, 1),  # L2 regularization
}

# Run Hyperopt optimization
trials = Trials()
best_params2 = fmin(
    fn=objective,         # Objective function
    space=space,          # Hyperparameter space
    algo=tpe.suggest,     # Tree Parzen Estimator (TPE) algorithm
    max_evals=100,         # Number of evaluations
    trials=trials,        # Track results
    rstate=np.random.default_rng(808)  # Ensure reproducibility
)

print("Best Hyperparameters:", best_params2)

100%|██████████| 100/100 [00:48<00:00,  2.06trial/s, best loss: 6.596723741781978]
Best Hyperparameters: {'colsample_bytree': 0.635007691141245, 'gamma': 4.2389745944948025, 'learning_rate': 0.023728382044981844, 'max_depth': 4.0, 'min_child_weight': 1.9666684051961405, 'n_estimators': 1090.0, 'reg_alpha': 0.5101481208932727, 'reg_lambda': 0.4608831317572362, 'subsample': 0.8388988400045299}


In [19]:
# Convert integer-based hyperparameters explicitly
best_params2["n_estimators"] = int(best_params2["n_estimators"])
best_params2["max_depth"] = int(best_params2["max_depth"])

# Initialize best hyperopt model
xgb_hyper2 = XGBRegressor(**best_params2, eval_metric='rmse', random_state=808)

# Fit model
xgb_hyper2.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=0)

In [20]:
# Predict on validation and testing data
y_pred_hyper2 = xgb_hyper2.predict(X_val_scaled)

# Calculate Mean Squared Error (MSE)
hyper_mse2 = mean_squared_error(y_val, y_pred_hyper2)

# Calculate Root Mean Squared Error (RMSE)
hyper_rmse2 = np.sqrt(hyper_mse2)

# Print results
print(f"2nd Hyperopt Validation RMSE: {hyper_rmse2:.4f}, MSE: {hyper_mse2:.4f}")
print(f"Hyperopt Validation RMSE: {hyper_rmse:.4f}, MSE: {hyper_mse:.4f}")
print(f"Randomsearch Validation RMSE: {val_rmse:.4f}, MSE: {val_mse:.4f}")

2nd Hyperopt Validation RMSE: 6.5971, MSE: 43.5216
Hyperopt Validation RMSE: 6.5625, MSE: 43.0659
Randomsearch Validation RMSE: 6.8878, MSE: 47.4414


In [23]:
y_pred_hyper2_test = xgb_hyper2.predict(test_df)
test_df['DIC'] = y_pred_hyper2_test
test_df.head()

Unnamed: 0,id,Lat_Dec,Lon_Dec,NO2uM,NO3uM,NH3uM,R_TEMP,R_Depth,R_Sal,R_DYNHT,R_Nuts,R_Oxy_micromol.Kg,PO4uM,SiO3uM,TA1.x,Salinity1,Temperature_degC,DIC
0,1455,34.321666,-120.811666,0.02,24.0,0.41,9.51,101,189.9,0.258,0.41,138.8383,1.85,25.5,2244.94,33.83,9.52,2271.483398
1,1456,34.275,-120.033333,0.0,25.1,0.0,9.84,102,185.2,0.264,0.0,102.7092,2.06,28.3,2253.27,33.963,9.85,2271.40332
2,1457,34.275,-120.033333,0.0,31.9,0.0,6.6,514,124.1,0.874,0.0,2.174548,3.4,88.1,2316.95,34.241,6.65,2270.233887
3,1458,33.828333,-118.625,0.0,0.0,0.2,19.21,1,408.1,0.004,0.2,258.6743,0.27,2.5,2240.49,33.465,19.21,2208.804443
4,1459,33.828333,-118.625,0.02,19.7,0.0,10.65,100,215.5,0.274,0.0,145.8399,1.64,19.4,2238.3,33.72,10.66,2271.391602


In [24]:
submission = test_df[['id', 'DIC']]
submission.to_csv('submission.csv', index=False)

In [22]:
# Get feature importance
feat_imp_hyper2 = pd.DataFrame({'Feature': X_train_scaled.columns, 'Importance': xgb_hyper2.feature_importances_})

# Sort by importance
feat_imp_hyper2 = feat_imp_hyper2.sort_values(by="Importance", ascending=False)
feat_imp_hyper2

Unnamed: 0,Feature,Importance
13,SiO3uM,0.400848
12,PO4uM,0.330505
11,R_Oxy_micromol.Kg,0.119622
8,R_Sal,0.03911
15,Salinity1,0.030427
6,R_TEMP,0.022144
4,NO3uM,0.021664
14,TA1.x,0.021024
7,R_Depth,0.011688
16,Temperature_degC,0.001184
