In [1]:
import pandas as pd
import numpy as np
import os, pickle
np.random.seed(42)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler    

In [2]:
# Path the fitted model is pickled to and re-loaded from.
filename = 'finalized_model.pkl'

# Load the raw training/test tables. 'Avg_PLT_CO2InjRate_TPH' is excluded from
# the training frame before the target is extracted.
df_train = pd.read_csv('illinois_basing_train.csv').drop(columns='Avg_PLT_CO2InjRate_TPH')
df_test = pd.read_csv('illinois_basing_test.csv')

# The last remaining training column is used as the regression target
# (presumably 'inj_diff', the name given to the predictions -- TODO confirm).
# `pop` removes it from the feature frame and returns it; missing targets are
# filled with 0 on the extracted Series rather than through a chained
# `df[col].fillna(..., inplace=True)`, which only touches a temporary object
# and is a no-op under pandas Copy-on-Write.
target_col = df_train.columns[-1]
y = df_train.pop(target_col).fillna(0)

def _expand_timestamp(frame, column='SampleTimeUTC'):
    """Replace a timestamp column with Month/Day/Hour/Year feature columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing ``column``; it is not modified.
    column : str
        Name of the column to parse with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with ``Month``, ``Day``, ``Hour``
        and ``Year`` appended at the end, in that order.
    """
    # Parse once and reuse the result for every derived feature.
    stamps = pd.to_datetime(frame[column])
    return frame.drop(columns=column).assign(
        Month=stamps.dt.month,
        Day=stamps.dt.day,
        Hour=stamps.dt.hour,
        Year=stamps.dt.year,
    )


# Apply the identical transformation to both tables so their feature columns
# stay aligned.
df_train = _expand_timestamp(df_train)
df_test = _expand_timestamp(df_test)

def _fill_missing_with_mean(frame):
    """Fill NaNs in every column of ``frame`` that has any, using that column's mean.

    The frame is updated in place, one column at a time. The filled column is
    assigned back explicitly instead of calling
    ``frame[col].fillna(..., inplace=True)``: that chained form operates on a
    temporary Series and does not reach the frame under pandas Copy-on-Write.
    """
    columns_with_nulls = frame.columns[frame.isnull().any()].tolist()
    for col in columns_with_nulls:
        frame[col] = frame[col].fillna(frame[col].mean())


# Each table is imputed with its own column means.
# NOTE(review): the training means are computed before the validation split
# below, and the test table does not reuse the training means -- consider
# fitting the imputation on X_train only and applying it to X_val / df_test.
_fill_missing_with_mean(df_train)
_fill_missing_with_mean(df_test)

# Hold out 10% of the training rows for validation; the fixed random_state
# keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    df_train, y, test_size=0.1, random_state=13
)

    

# Train-once cache: fit and persist the model only when no pickled model
# exists yet; otherwise reuse the stored one and just report its validation MSE.
if not os.path.exists(filename):
    # Hyperparameters of the best configuration so far
    # (original note: "The best now - fifth").
    params = {
        "n_estimators": 505,
        "max_depth": 5,
        "min_samples_split": 4,
        "learning_rate": 0.072,
        "loss": "squared_error",
    }
    reg = ensemble.GradientBoostingRegressor(**params)
    reg.fit(X_train, y_train)

    # Evaluated on the held-out validation split (X_val / y_val), not on df_test.
    mse = mean_squared_error(y_val, reg.predict(X_val))
    print("The mean squared error (MSE) on validation set: {:.4f}".format(mse))

    # Predict the test table and write the predictions as a single-column CSV.
    test_predict = reg.predict(df_test)
    preds = pd.DataFrame(test_predict, columns=['inj_diff'])
    preds.to_csv("pred_with_time_stamp_features_xg_boost11.csv", index = False)

    # Context manager guarantees the file is flushed and closed even if
    # pickling raises.
    with open(filename, 'wb') as model_file:
        pickle.dump(reg, model_file)
else:
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # load a model file that this notebook (or a trusted source) produced.
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    mse = mean_squared_error(y_val, loaded_model.predict(X_val))
    print(mse)

9.981433488063146
