In [None]:
from math import sqrt
from pickle import load, dump

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import ttest_ind, ttest_rel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [None]:
# Load the encoded dataset; 'OilPeakRate' is the regression target.
data = pd.read_csv('encoded_data.csv')
y = data['OilPeakRate']
# Select the features by dropping the target by name rather than by position,
# so the target can never leak into X if it is not the last column of the CSV.
X = data.drop(columns=['OilPeakRate'])

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [None]:
# Hyperparameters for the XGBoost regressor, gathered in one place.
xgb_params = dict(
    n_estimators=5000,
    max_depth=15,
    learning_rate=0.001,
    objective='reg:squarederror',
    tree_method="hist",
    enable_categorical=True,
    sampling_method="gradient_based",
    booster='dart',
    device="cuda:0",  # device string handed to XGBoost
    grow_policy='lossguide',
)

# Build the (not yet fitted) regressor from those settings.
model = XGBRegressor(**xgb_params)



In [None]:
# Fit the regressor on the training split.
model.fit(X=X_train, y=y_train)


In [None]:

# Predict on the held-out test split.
y_pred = model.predict(X_test)

# Report the root-mean-squared error between labels and predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print('RMSE:', rmse)

RMSE: 93.65137189933439


In [None]:
# Persist the trained model to a JSON file on disk.
saved_model_path = "xgboost_5000_20.json"
model.save_model(saved_model_path)


In [None]:
# Put labels and predictions side by side and write them to disk.
# y_test is a pandas Series: convert it to a NumPy array before adding the new
# axis, because multi-dimensional indexing on a Series (`y_test[:, np.newaxis]`)
# is deprecated (this cell used to emit a FutureWarning for it) and is rejected
# by newer pandas releases.
labels_col = y_test.to_numpy()[:, np.newaxis]
predictions_col = y_pred[:, np.newaxis]
results = pd.DataFrame(
    data=np.concatenate([labels_col, predictions_col], axis=1),
    columns=['labels', 'predictions'],
)
results.to_csv('results.csv')

  results = pd.DataFrame( data= np.concatenate([y_test[:, np.newaxis], y_pred[:, np.newaxis]], axis = 1), columns = ['labels', 'predictions' ])


In [None]:
# Load the saved model and re-test.
# The file name must match the one written by model.save_model(...) earlier in
# this notebook ("xgboost_5000_20.json"); "xgboost_5000_10.json" is never
# created here, so a fresh Restart & Run All could not find it.
# Constructor hyperparameters are omitted: the previous values (max_depth=10)
# did not match the model trained above (max_depth=15).
# NOTE(review): load_model is expected to restore the trained booster from the
# file — confirm for the installed XGBoost version.
model = XGBRegressor()
model.load_model("xgboost_5000_20.json")

In [None]:
# Load the pickled scaler from disk.
# NOTE(review): pickle.load can execute arbitrary code — only load 'scaler.pkl'
# if it comes from a trusted source.
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = load(scaler_file)

In [None]:
# Re-run the evaluation on the test split with the reloaded model.
# NOTE(review): X_test_scaled is computed but never used below — predictions are
# made on the unscaled X_test. Confirm which input the loaded model expects.
X_test_scaled = scaler.transform(X_test)
y_pred = model.predict(X_test)

# Root-mean-squared error of the reloaded model's predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print('RMSE:', rmse)

RMSE: 93.65137189933439


In [None]:
# Compare predictions with labels using a paired t-test: every prediction is
# matched to the label of the same test row, so the two samples are related and
# the independent-samples test (ttest_ind) is not the appropriate one.
print(ttest_rel(y_pred, y_test))

TtestResult(statistic=1.624881476343796, pvalue=0.10430540251779621, df=2680.0)
