In [5]:
# Load basic libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statistics as stats
import time

# XGB libraries
from sklearn.model_selection import train_test_split,RandomizedSearchCV, cross_val_score, KFold
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
from sklearn.preprocessing import StandardScaler


# Read the training data and drop its 'Unnamed: 12' column (an NA column)
train_df = pd.read_csv("data/train.csv").drop(columns='Unnamed: 12')

# Read the test data and rename 'TA1' to 'TA1.x' to fix the column name error
test_df = pd.read_csv("data/test.csv").rename(columns={'TA1': 'TA1.x'})

# Read the sample submission file
sample_sub = pd.read_csv("data/sample_submission.csv")

In [6]:
# Assign features: every training column except the row id and the DIC target
X = train_df.drop(columns=['id', 'DIC'])
y = train_df['DIC']
# Select the test features by the training column names so they line up with X
# in both name and order. This also leaves out 'id' and any extra column that
# may have been attached to test_df (such as a prediction column), which keeps
# this cell safe to re-run at any point.
X_test = test_df[X.columns]

# Scale the data; the scaler is fitted on the training features only
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
# For predictions later on: apply the training-fitted scaler to the test features
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [7]:
# Five shuffled folds with a fixed seed, reused by every evaluation of `objective`
kf = KFold(random_state=808, shuffle=True, n_splits=5)


def objective(params):
    """Return the hyperopt loss for one set of XGBoost hyperparameters.

    Parameters
    ----------
    params : dict
        Must contain the keys "n_estimators", "max_depth", "learning_rate",
        "min_child_weight", "subsample", "colsample_bytree", "gamma",
        "reg_alpha" and "reg_lambda".

    Returns
    -------
    dict
        ``{'loss': <mean RMSE over the folds of kf>, 'status': STATUS_OK}``.
    """
    # These two are cast to int before being handed to XGBRegressor
    integer_keys = ("n_estimators", "max_depth")
    # These are passed through unchanged
    passthrough_keys = (
        "learning_rate",
        "min_child_weight",
        "subsample",
        "colsample_bytree",
        "gamma",
        "reg_alpha",
        "reg_lambda",
    )

    settings = {key: int(params[key]) for key in integer_keys}
    settings.update({key: params[key] for key in passthrough_keys})
    model = XGBRegressor(random_state=808, **settings)

    # The scorer returns negated RMSE per fold, so flip the sign back
    fold_rmse = -cross_val_score(
        model,
        X_scaled,
        y,
        scoring='neg_root_mean_squared_error',
        cv=kf,
        n_jobs=16,
    )

    # hyperopt minimizes 'loss': the RMSE averaged across folds
    return {'loss': np.mean(fold_rmse), 'status': STATUS_OK}

# Hyperparameter search space; every hyperopt label matches its dictionary key.
# n_estimators and max_depth are quantized (steps of 10 and 1 respectively).
space = dict(
    n_estimators=hp.quniform("n_estimators", 100, 1200, 10),
    learning_rate=hp.uniform("learning_rate", 0.005, 0.3),
    max_depth=hp.quniform("max_depth", 3, 20, 1),
    min_child_weight=hp.uniform("min_child_weight", 1, 10),
    subsample=hp.uniform("subsample", 0.5, 1.0),
    colsample_bytree=hp.uniform("colsample_bytree", 0.5, 1.0),
    gamma=hp.uniform("gamma", 0, 10),
    reg_alpha=hp.uniform("reg_alpha", 0, 1),
    reg_lambda=hp.uniform("reg_lambda", 0, 1),
)

# Minimize `objective` with TPE for 200 evaluations; the seeded generator
# fixes the sequence of sampled configurations, and `trials` keeps the history.
trials = Trials()
best_params = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=200,
    trials=trials,
    rstate=np.random.default_rng(808),
)

# Show the best configuration found
print("Best Hyperparameters:", best_params)

100%|██████████| 200/200 [01:43<00:00,  1.94trial/s, best loss: 5.40228838698805] 
Best Hyperparameters: {'colsample_bytree': 0.6623148649907094, 'gamma': 8.552518467838766, 'learning_rate': 0.022651348393097286, 'max_depth': 5.0, 'min_child_weight': 2.82902252729899, 'n_estimators': 1180.0, 'reg_alpha': 0.365663425099092, 'reg_lambda': 0.37667782140175327, 'subsample': 0.5007440077849364}


In [8]:
# The search reports n_estimators and max_depth as floats (e.g. 1180.0, 5.0);
# cast both to int in place to fix the type error
best_params.update(
    n_estimators=int(best_params["n_estimators"]),
    max_depth=int(best_params["max_depth"]),
)

# Build the model from the best hyperopt configuration and fit it on the
# scaled training features
xgb_hyper = XGBRegressor(random_state=808, eval_metric='rmse', **best_params)
xgb_hyper.fit(X_scaled, y)

# Predict the target for the scaled test features
y_pred_hyper = xgb_hyper.predict(X_test_scaled)

In [9]:
# Pair every feature name with the fitted model's importance score and
# order the table from most to least important
feat_imp_hyper = (
    pd.DataFrame({'Feature': X_scaled.columns, 'Importance': xgb_hyper.feature_importances_})
    .sort_values(by="Importance", ascending=False)
)
feat_imp_hyper

Unnamed: 0,Feature,Importance
12,SiO3uM,0.420729
11,PO4uM,0.331014
10,R_Oxy_micromol.Kg,0.109896
7,R_Sal,0.056064
3,NO3uM,0.04492
13,TA1.x,0.021423
6,R_Depth,0.008314
14,Salinity1,0.003545
2,NO2uM,0.001313
15,Temperature_degC,0.00074


In [10]:
# Build the submission frame from the test ids and the predicted DIC values.
# A new frame is created instead of adding a 'DIC' column to test_df: test_df
# then keeps only its original columns, so cells that derive features from it
# still work when they are re-run after this one.
submission = test_df[['id']].assign(DIC=y_pred_hyper)
submission.head()

Unnamed: 0,id,DIC
0,1455,2169.143799
1,1456,2194.339355
2,1457,2325.040283
3,1458,1992.654907
4,1459,2150.697021


In [11]:
# Export for submission
# submission.to_csv('submission.csv', index=False)