In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [2]:
df = pd.read_csv(r'USED DATA\Volvo 11 B Vsh.csv')
df.head()

Unnamed: 0,DEPTH,CALI,DRHO,NPHI,PEF,RT,KLOGH,PHIF,VSH
0,3351.6,8.682,0.061,0.277,6.865,2.314,0.001,0.055,0.869
1,3351.7,8.672,0.059,0.283,6.73,2.373,0.001,0.058,0.892
2,3351.8,8.625,0.057,0.285,6.58,2.309,0.001,0.061,0.881
3,3351.9,8.578,0.057,0.28,6.467,2.255,0.001,0.062,0.886
4,3352.0,8.601,0.056,0.267,6.4,2.309,0.001,0.064,0.876


In [3]:
# selecting features and label
X = df.iloc[:, 1:-1]  # features
y = df.iloc[:,-1]  # label
X.shape, y.shape

((13911, 7), (13911,))

In [4]:
# function to convert seconds to into hours, minutes and seconds
# this function is used to measure the time taking by the models
def convert(seconds):
    seconds = seconds % (24 * 3600)
    hour = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
    return "%d:%02d:%02d" % (hour, minutes, seconds)

In [7]:
# Loading the svr model from svm and GridSearchCV from model_selection
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import time

regr = GridSearchCV(
    XGBRegressor(),
    {
        'n_estimators': [200,300,400,500],
        'max_depth': [6,8,10,12],
        #grow_policy': [0,1],
        'learning_rate': [0.001, 0.05, 0.01,],
        'early_stopping_rounds': [3,5,],
        'tree_method': ['exact','approx'],
        #'verbosity': [2]
    }, 
    cv=10, 
    scoring=['neg_mean_squared_error', 'r2'], 
    refit='r2',
    verbose=1)

start = time.time()
# enter your code below this line to calculate time it take

#Fit the model
regr.fit(X, y, eval_set=[(X, y)], verbose=False)

# enter you code above this line  to calculate time it take   
end = time.time()

print('gridsearch_run_time:', convert(end-start), 'h:m:s')

#regr.cv_results_

Fitting 10 folds for each of 192 candidates, totalling 1920 fits
gridsearch_run_time: 1:02:28 h:m:s


In [8]:
import os

# Create the output/cnn directory if it doesn't exist
output_dir = "./output/xgboost"
os.makedirs(output_dir, exist_ok=True)

In [9]:
# converting the results to a readable format using dataFrame
df = pd.DataFrame(regr.cv_results_)
# getting all parameter combinations and their performance result
result = df[['param_max_depth', 'param_early_stopping_rounds', 'param_n_estimators',
             'param_learning_rate', 'param_tree_method',
            'mean_test_r2', 'mean_test_neg_mean_squared_error']]
result.to_csv('./output/xgboost/performance_result_for_xgboost_gridsearchCV.csv')
result

Unnamed: 0,param_max_depth,param_early_stopping_rounds,param_n_estimators,param_learning_rate,param_tree_method,mean_test_r2,mean_test_neg_mean_squared_error
0,6,3,200,0.001,exact,0.001074,-0.023062
1,6,3,200,0.001,approx,0.001213,-0.023045
2,6,3,300,0.001,exact,0.163112,-0.019598
3,6,3,300,0.001,approx,0.163390,-0.019575
4,6,3,400,0.001,exact,0.296358,-0.016725
...,...,...,...,...,...,...,...
187,12,5,300,0.01,approx,0.905895,-0.002427
188,12,5,400,0.01,exact,0.903986,-0.002488
189,12,5,400,0.01,approx,0.907586,-0.002298
190,12,5,500,0.01,exact,0.904336,-0.002449


In [10]:
# printing the best combination of parameter that perform the best
print(regr.best_params_)

# the best performing score
print(regr.best_score_)

{'early_stopping_rounds': 3, 'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 500, 'tree_method': 'approx'}
0.9144220422042201


In [12]:
pred_3 = regr.predict(X) # prediction for whole dataset

In [13]:

c = np.stack([y, pred_3], axis=1) # whole data
c = pd.DataFrame(c, columns=['y', 'y_pred'])

with pd.ExcelWriter("./output/xgboost/predicted_xgboost_model.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index
    # to store the dataframe in specified sheet

    c.to_excel(writer, sheet_name="whole_data", index=True)

In [14]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error


r2 = r2_score(y, pred_3) # r2_score for whole dataset
mae = mean_absolute_error(y, pred_3) # mean absolute error for whole dataset
mse = mean_squared_error(y, pred_3) # mean squared error for whole dataset
mape = mean_absolute_percentage_error(y, pred_3) # mean_absolute_percentage_error for whole dataset
ev = explained_variance_score(y, pred_3) 
maxE = max_error(y, pred_3)
minE = min(abs(y - pred_3))

metrics = {
'performance_metrics': ['R2', 'MAE', 'MSE', 'MAPE', 'EV', 'maxE', 'minE'],
'whole': [r2, mae, mse, mape, ev, maxE, minE]
}

performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
#performance.to_csv('./output/xgboost/performance_xgboost.csv')
performance

Unnamed: 0,0,1,2,3,4,5,6
performance_metrics,R2,MAE,MSE,MAPE,EV,maxE,minE
whole,0.998025,0.002813,0.00006,0.012823,0.998025,0.206389,0.0


In [15]:
dfA=pd.read_csv(r'USED DATA\Volvo 11 A Vsh.csv')

In [16]:
X = dfA.iloc[:, 1:-1]  # features
y = dfA.iloc[:,-1]  # label
X.shape, y.shape


((1482, 7), (1482,))

In [17]:
X

Unnamed: 0,CALI,DRHO,NPHI,PEF,RT,KLOGH,PHIF
0,8.648,0.069,0.252,6.823,1.960,0.001,0.087
1,8.648,0.070,0.254,6.810,1.914,0.001,0.088
2,8.672,0.067,0.253,6.825,1.775,0.001,0.090
3,8.672,0.062,0.248,6.841,1.766,0.001,0.092
4,8.672,0.059,0.243,6.857,1.750,0.001,0.092
...,...,...,...,...,...,...,...
1477,8.781,0.075,0.195,6.512,1.013,0.001,0.025
1478,8.781,0.077,0.188,6.500,0.982,0.001,0.028
1479,8.734,0.078,0.188,6.441,0.982,0.001,0.027
1480,8.727,0.076,0.182,6.341,1.020,0.001,0.024


In [18]:
predict2=regr.predict(X)

In [19]:
# saving training, testing and y data with their predicted data in a excel file with different sheets
a = np.stack([X.iloc[:,0],y,predict2], axis=1) # training
a = pd.DataFrame(a, columns=['Depth','vsh', 'vsh_pred'])

with pd.ExcelWriter("./output/xgboost/predicted_11A_whole.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index
    # to store the dataframe in specified sheet
    a.to_excel(writer, sheet_name="11_WHOLE", index=True)


In [20]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error

test_r2 = r2_score(y, predict2) # r2_score for testing set
test_mae = mean_absolute_error(y,predict2) # mean absolute error for testing set
test_mse = mean_squared_error(y, predict2) # mean squared error for testing set
test_mape = mean_absolute_percentage_error(y, predict2) # mean_absolute_percentage_error for testing set
test_ev = explained_variance_score(y, predict2) 
test_maxE = max_error(y, predict2)
test_minE = min(abs(y - predict2))


metrics = {
'performance_metrics': ['R2', 'MAE', 'MSE', 'MAPE', 'EV', 'maxE', 'minE'],
#'training': [train_r2, train_mae, train_mse, train_mape, train_ev, train_maxE, train_minE],
'testing': [test_r2, test_mae, test_mse, test_mape, test_ev, test_maxE, test_minE],
#'whole': [r2, mae, mse, mape, ev, maxE, minE]
}

performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
performance.to_csv('./output/xgboost/performance_11A.csv')
performance

Unnamed: 0,0,1,2,3,4,5,6
performance_metrics,R2,MAE,MSE,MAPE,EV,maxE,minE
testing,0.851826,0.042724,0.008208,0.144854,0.864456,0.446187,0.000008


In [25]:
dfA=pd.read_csv(r'USED DATA\Volvo T2 vsh.csv')

X_T2 = dfA.iloc[:, 1:-1]  # features
y = dfA.iloc[:,-1]  # label
X.shape, y.shape

((1482, 7), (1780,))

In [26]:

predict2=regr.predict(X_T2)

In [27]:
# saving training, testing and y data with their predicted data in a excel file with different sheets
a = np.stack([X_T2.iloc[:,0],y,predict2], axis=1) # training
a = pd.DataFrame(a, columns=['DEPTH','VSH', 'VSH_Pred'])



with pd.ExcelWriter("./output/xgboost/predicted__T2 WHOLE PREDICT.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index
    # to store the dataframe in specified sheet
    a.to_excel(writer, sheet_name="T2 WHOLE", index=True)


In [28]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error

test_r2 = r2_score(y, predict2) # r2_score for testing set
test_mae = mean_absolute_error(y,predict2) # mean absolute error for testing set
test_mse = mean_squared_error(y, predict2) # mean squared error for testing set
test_mape = mean_absolute_percentage_error(y, predict2) # mean_absolute_percentage_error for testing set
test_ev = explained_variance_score(y, predict2) 
test_maxE = max_error(y, predict2)
test_minE = min(abs(y - predict2))


metrics = {
'performance_metrics': ['R2', 'MAE', 'MSE', 'MAPE', 'EV', 'maxE', 'minE'],
#'training': [train_r2, train_mae, train_mse, train_mape, train_ev, train_maxE, train_minE],
'testing': [test_r2, test_mae, test_mse, test_mape, test_ev, test_maxE, test_minE],
#'whole': [r2, mae, mse, mape, ev, maxE, minE]
}

performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
performance.to_csv('./output/xgboost/performance_11T2.csv')
performance

Unnamed: 0,0,1,2,3,4,5,6
performance_metrics,R2,MAE,MSE,MAPE,EV,maxE,minE
testing,0.895858,0.04429,0.006872,0.139625,0.897321,0.408636,0.000003
