In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
df_train = pd.read_csv('illinois_basing_train.csv')
df_train.drop('Avg_PLT_CO2InjRate_TPH', axis=1, inplace = True)
df_test = pd.read_csv('illinois_basing_test.csv')

df_train[df_train.columns[-1]].fillna(0, inplace = True)
y = df_train[df_train.columns[-1]]
df_train.drop(df_train.columns[-1], axis=1, inplace = True)


for i in df_train.columns[1:]:
    new_col = 'sin' + i
    df_train[new_col] = np.sin(df_train[i].astype(np.float64))
    df_test[new_col] = np.sin(df_test[i].astype(np.float64))
    
    new_col = 'cos' + i
    df_train[new_col] = np.cos(df_train[i].astype(np.float64))
    df_test[new_col] = np.cos(df_test[i].astype(np.float64))
    
    new_col = 'tan' + i
    df_train[new_col] = np.tan(df_train[i].astype(np.float64))
    df_test[new_col] = np.tan(df_test[i].astype(np.float64))


df_train['Month'] = pd.to_datetime(df_train['SampleTimeUTC']).dt.month
df_train['Day'] = pd.to_datetime(df_train['SampleTimeUTC']).dt.day
df_train['Hour'] = pd.to_datetime(df_train['SampleTimeUTC']).dt.hour
df_train['Year'] = pd.DatetimeIndex(df_train['SampleTimeUTC']).year
df_train.drop('SampleTimeUTC', axis=1, inplace = True)


df_test['Month'] = pd.to_datetime(df_test['SampleTimeUTC']).dt.month
df_test['Day'] = pd.to_datetime(df_test['SampleTimeUTC']).dt.day
df_test['Hour'] = pd.to_datetime(df_test['SampleTimeUTC']).dt.hour
df_test['Year'] = pd.DatetimeIndex(df_test['SampleTimeUTC']).year
df_test.drop('SampleTimeUTC', axis=1, inplace = True)

cols = [i for i in df_train.columns if df_train[i].isnull().any()]
for i in cols:
    df_train[i].fillna(df_train[i].mean(), inplace=True)

cols = [i for i in df_test.columns if df_test[i].isnull().any()]
for i in cols:
    df_test[i].fillna(df_test[i].mean(), inplace=True)

In [3]:
df_train = df_train.reset_index()
df_test = df_test.reset_index()

df_train.replace([np.inf, -np.inf], np.nan, inplace=True)
df_test.replace([np.inf, -np.inf], np.nan, inplace=True)

In [4]:
X_train, X_val, y_train, y_val = train_test_split(
    df_train, y, test_size=0.1, random_state=42
)

In [5]:
params = {
    "n_estimators": 505,
    "max_depth": 6,
    "min_samples_split": 4,
    "learning_rate": 0.07,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_sine_features_xg_boost1.csv", index = False)

The mean squared error (MSE) on test set: 12.6732


In [6]:

# Various hyper-parameters to tune
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

xgb1 = XGBRegressor(tree_method="gpu_hist")
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:squarederror'],
              'learning_rate': [0.072, 0.0721, 0.0719, 0.0718], #so called `eta` value
              'max_depth': [5],
              'n_estimators': [507, 508, 509, 506, 505]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        n_jobs = -1,
                        verbose=4)

xgb_grid.fit(df_train, y)

print('Best Score', xgb_grid.best_score_)
print('Best Params', xgb_grid.best_params_)


params = xgb_grid.best_params_

best_model = XGBRegressor(**params)
best_model.fit(df_train, y)



# convert array into dataframe
test_predict = best_model.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_sine_features_xg_boost_grid.csv", index = False)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score -2095.4634173947256
Best Params {'learning_rate': 0.0721, 'max_depth': 5, 'n_estimators': 507, 'nthread': 4, 'objective': 'reg:squarederror'}


In [7]:
params = {
    "n_estimators": 507,
    "max_depth": 5,
    "min_samples_split": 4,
    "learning_rate": 0.0721,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# convert array into dataframe
test_predict = reg.predict(df_test)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])

# save the dataframe as a csv file
preds.to_csv("pred_with_sine_features_xg_boost2.csv", index = False)

The mean squared error (MSE) on test set: 12.5490
