In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv('illinois_basing_train.csv')
df_train.drop('Avg_PLT_CO2InjRate_TPH', axis=1, inplace = True)
df_test = pd.read_csv('illinois_basing_test.csv')

df_train[df_train.columns[-1]].fillna(0, inplace = True)
y = df_train[df_train.columns[-1]]
df_train.drop(df_train.columns[-1], axis=1, inplace = True)

df_train['Month'] = pd.to_datetime(df_train['SampleTimeUTC']).dt.month
df_train['Day'] = pd.to_datetime(df_train['SampleTimeUTC']).dt.day
df_train['Hour'] = pd.to_datetime(df_train['SampleTimeUTC']).dt.hour
df_train['Year'] = pd.DatetimeIndex(df_train['SampleTimeUTC']).year
df_train.drop('SampleTimeUTC', axis=1, inplace = True)


df_test['Month'] = pd.to_datetime(df_test['SampleTimeUTC']).dt.month
df_test['Day'] = pd.to_datetime(df_test['SampleTimeUTC']).dt.day
df_test['Hour'] = pd.to_datetime(df_test['SampleTimeUTC']).dt.hour
df_test['Year'] = pd.DatetimeIndex(df_test['SampleTimeUTC']).year
df_test.drop('SampleTimeUTC', axis=1, inplace = True)

cols = [i for i in df_train.columns if df_train[i].isnull().any()]
for i in cols:
    df_train[i].fillna(df_train[i].mean(), inplace=True)

cols = [i for i in df_test.columns if df_test[i].isnull().any()]
for i in cols:
    df_test[i].fillna(df_test[i].mean(), inplace=True)
    
X_train, X_val, y_train, y_val = train_test_split(
    df_train, y, test_size=0.1, random_state=13
)

In [3]:
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost.csv", index = False)

The mean squared error (MSE) on test set: 12.5087


In [4]:
params = {
    "n_estimators": 10,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost1.csv", index = False)

The mean squared error (MSE) on test set: 13.5357


In [5]:
params = {
    "n_estimators": 1000,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.001,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost2.csv", index = False)

The mean squared error (MSE) on test set: 13.0744


In [7]:
# best till now - first result
params = {
    "n_estimators": 500,
    "max_depth": 5,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost3.csv", index = False)

The mean squared error (MSE) on test set: 11.6790


In [8]:
params = {
    "n_estimators": 500,
    "max_depth": 7,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost4.csv", index = False)

The mean squared error (MSE) on test set: 9.6820


In [9]:
params = {
    "n_estimators": 500,
    "max_depth": 7,
    "min_samples_split": 6,
    "learning_rate": 0.01,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost5.csv", index = False)

The mean squared error (MSE) on test set: 9.6501


In [10]:
# the best one now - second
params = {
    "n_estimators": 500,
    "max_depth": 5,
    "min_samples_split": 4,
    "learning_rate": 0.05,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost6.csv", index = False)

The mean squared error (MSE) on test set: 10.1321


In [11]:
# The best now - third
params = {
    "n_estimators": 500,
    "max_depth": 5,
    "min_samples_split": 4,
    "learning_rate": 0.07,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost7.csv", index = False)

The mean squared error (MSE) on test set: 10.0390


In [5]:
# The best now -fourht
params = {
    "n_estimators": 500,
    "max_depth": 5,
    "min_samples_split": 4,
    "learning_rate": 0.072,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost8.csv", index = False)

The mean squared error (MSE) on test set: 9.9803


In [6]:

params = {
    "n_estimators": 520,
    "max_depth": 5,
    "min_samples_split": 4,
    "learning_rate": 0.072,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost9.csv", index = False)

The mean squared error (MSE) on test set: 10.0644


In [7]:

params = {
    "n_estimators": 450,
    "max_depth": 5,
    "min_samples_split": 4,
    "learning_rate": 0.072,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost10.csv", index = False)

The mean squared error (MSE) on test set: 10.0508


In [8]:

params = {
    "n_estimators": 490,
    "max_depth": 5,
    "min_samples_split": 4,
    "learning_rate": 0.072,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost10.csv", index = False)

The mean squared error (MSE) on test set: 10.0857


In [16]:

# Various hyper-parameters to tune
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV


xgb1 = XGBRegressor(tree_method="gpu_hist")
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:squarederror'],
              'learning_rate': [0.072, 0.0721, 0.0719, 0.0718], #so called `eta` value
              'max_depth': [5],
              'n_estimators': [507, 508, 509, 506, 505]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        n_jobs = -1,
                        verbose=4)

xgb_grid.fit(df_train, y)


print('Best Score', xgb_grid.best_score_)
print('Best Params', xgb_grid.best_params_)


params = xgb_grid.best_params_

best_model = XGBRegressor(**params)
best_model.fit(df_train, y)



# convert array into dataframe
test_predict = best_model.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost12.csv", index = False)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score -116.9790190527627
Best Params {'learning_rate': 0.0718, 'max_depth': 5, 'n_estimators': 505, 'nthread': 4, 'objective': 'reg:squarederror'}


In [17]:
# 
params = {
    "n_estimators": 505,
    "max_depth": 5,
    "min_samples_split": 4,
    "learning_rate": 0.0718,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost13.csv", index = False)

The mean squared error (MSE) on test set: 10.2640


In [24]:
params = {
    "n_estimators": 506,
    "max_depth": 5,
    "min_samples_split": 4,
    "learning_rate": 0.072,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost14.csv", index = False)

The mean squared error (MSE) on test set: 9.9588


In [26]:
params = {
    "n_estimators": 506,
    "max_depth": 3,
    "min_samples_split": 5,
    "learning_rate": 0.072,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost15.csv", index = False)

The mean squared error (MSE) on test set: 10.6699


# Cross fold validation

In [7]:
# Various hyper-parameters to tune
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV


xgb1 = XGBRegressor(tree_method="gpu_hist")
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:squarederror'],
              'learning_rate': [.03, 0.05, .07, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], #so called `eta` value
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [100, 200, 300, 500]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 4,
                        n_jobs = -1,
                        verbose=True)

xgb_grid.fit(df_train, y)

Fitting 4 folds for each of 132 candidates, totalling 528 fits


 -5.59242860e+01 -7.73492874e+01 -8.98603793e+01 -1.07763633e+02
 -5.17658058e+01 -7.68737549e+01 -9.15831236e+01 -1.13138524e+02
 -5.82898480e+01 -7.12841072e+01 -8.50413569e+01 -8.82259310e+01
 -6.56088445e+01 -7.91331824e+01 -9.14787273e+01 -1.04443603e+02
 -7.50817555e+01 -9.46368046e+01 -1.11409329e+02 -1.35501923e+02
 -9.61183258e+01 -1.01358167e+02 -1.08640454e+02 -1.08343465e+02
 -9.78856436e+01 -1.20628666e+02 -1.19665908e+02 -1.22844784e+02
 -9.69775585e+01 -1.16377859e+02 -1.27092535e+02 -1.26307184e+02
 -2.33497051e-01 -6.31780566e-01 -1.15420669e+00 -2.64779692e+00
 -2.28430269e-01 -6.21312097e-01 -1.13916206e+00 -2.64497809e+00
 -2.24205216e-01 -6.13933994e-01 -1.12857129e+00 -2.62891650e+00
 -1.22209923e+01 -2.44221720e+01 -3.60877125e+01 -5.67907297e+01
 -1.21925330e+01 -2.46488558e+01 -3.65291751e+01 -5.79549385e+01
 -1.21147782e+01 -2.47443964e+01 -3.70275938e+01 -5.87634466e+01
 -1.41189736e+02 -1.64439860e+02 -1.71057336e+02 -1.77346736e+02
 -1.05489243e+02 -1.19229

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=4,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    gamma=None, gpu_id=None, grow_policy=None,
                                    importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_bin=None,
                                    max_cat...
                                    predictor=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None, ...),
             n_jobs=5,
             param_grid={'colsample_bytree': [0.7],
                         'learning_

In [8]:
print('Best Score', xgb_grid.best_score_)
print('Best Params', xgb_grid.best_params_)

Best Score -0.22420521594915105
Best Params {'colsample_bytree': 0.7, 'learning_rate': 0.001, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 4, 'objective': 'reg:squarederror', 'silent': 1, 'subsample': 0.7}


In [11]:
best_model = XGBRegressor(params = xgb_grid.best_params_, tree_method="gpu_hist")
best_model.fit(df_train, y)

Parameters: { "params" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0...depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1,
             params={'colsample_bytree': 0.7, 'learning_rate': 0.001,
                     'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 100,
                     'nthread': 4, 'objective': 'reg:squarederror', 'silent': 1,
                     'subsample': 0.7},
             predictor='auto', random_state=0, reg_alpha=0, ...)

In [12]:
# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

In [13]:
# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost_grid.csv", index = False)

# Best Model - XGBoost after regerous attemps

In [3]:
# The best now - fifth
params = {
    "n_estimators": 505,
    "max_depth": 5,
    "min_samples_split": 4,
    "learning_rate": 0.072,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_time_stamp_features_xg_boost11.csv", index = False)

The mean squared error (MSE) on test set: 9.8513
