In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [2]:
df = pd.read_csv(r'Volvo 11 B Vsh.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,DEPTH,CALI,DRHO,NPHI,PEF,ROP,RT,KLOGH,PHIF,VSH
0,0,3351.6,8.682,0.061,0.277,6.865,25.117,2.314,0.001,0.055,0.869
1,1,3351.7,8.672,0.059,0.283,6.73,25.121,2.373,0.001,0.058,0.892
2,2,3351.8,8.625,0.057,0.285,6.58,25.125,2.309,0.001,0.061,0.881
3,3,3351.9,8.578,0.057,0.28,6.467,25.129,2.255,0.001,0.062,0.886
4,4,3352.0,8.601,0.056,0.267,6.4,25.11,2.309,0.001,0.064,0.876


In [3]:
# selecting features and label
X = df.iloc[:, 1:-1]  # features
y = df.iloc[:,-1]  # label
X.shape, y.shape

((13911, 9), (13911,))

In [4]:
# spliting into training and testing set for both X and y
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=27)


In [5]:
# function to convert seconds to into hours, minutes and seconds
# this function is used to measure the time taking by the models
def convert(seconds):
    seconds = seconds % (24 * 3600)
    hour = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
    return "%d:%02d:%02d" % (hour, minutes, seconds)

In [6]:
import os

# Create the output/cnn directory if it doesn't exist
output_dir = "./output/gradient_boosting_B"
os.makedirs(output_dir, exist_ok=True)

In [7]:
# Loading the svr model from svm and GridSearchCV from model_selection
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
import time

regr = GridSearchCV(
    GradientBoostingRegressor(), 
    {
        'loss' : ['squared_error','absolute_error',],
        'learning_rate': [0.3,0.5,0.7],
        'n_estimators': [5,7,10,14],
        'criterion': ['squared_error', 'friedman_mse'],
        'max_depth': [6,8,10,12]
        
    }, 
    cv=5, 
    scoring=['neg_mean_squared_error', 'r2'], 
    refit='r2',
    verbose=2)

start = time.time()
# enter your code below this line to calculate time it take

#Fit the model
regr.fit(X_train, y_train)

# enter you code above this line  to calculate time it take   
end = time.time()

print('run_time:', convert(end-start), 'h:m:s')

#regr.cv_results_

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[CV] END criterion=squared_error, learning_rate=0.3, loss=squared_error, max_depth=6, n_estimators=5; total time=   0.7s
[CV] END criterion=squared_error, learning_rate=0.3, loss=squared_error, max_depth=6, n_estimators=5; total time=   0.7s
[CV] END criterion=squared_error, learning_rate=0.3, loss=squared_error, max_depth=6, n_estimators=5; total time=   0.7s
[CV] END criterion=squared_error, learning_rate=0.3, loss=squared_error, max_depth=6, n_estimators=5; total time=   0.7s
[CV] END criterion=squared_error, learning_rate=0.3, loss=squared_error, max_depth=6, n_estimators=5; total time=   0.7s
[CV] END criterion=squared_error, learning_rate=0.3, loss=squared_error, max_depth=6, n_estimators=7; total time=   1.1s
[CV] END criterion=squared_error, learning_rate=0.3, loss=squared_error, max_depth=6, n_estimators=7; total time=   1.1s
[CV] END criterion=squared_error, learning_rate=0.3, loss=squared_error, max_depth=6, n_es

In [8]:
# converting the results to a readable format using dataFrame
df = pd.DataFrame(regr.cv_results_)

# getting all parameter combinations and their performance result
result = df[['param_n_estimators', 'param_learning_rate', 'param_loss', 
            'param_criterion', 'param_max_depth','mean_test_r2', 'mean_test_neg_mean_squared_error']]
result.to_csv('./output/gradient_boosting_B/performance_result_for_gradient_boosting_gridsearchCV.csv')
result

Unnamed: 0,param_n_estimators,param_learning_rate,param_loss,param_criterion,param_max_depth,mean_test_r2,mean_test_neg_mean_squared_error
0,5,0.3,squared_error,squared_error,6,0.962311,-0.001129
1,7,0.3,squared_error,squared_error,6,0.986307,-0.000410
2,10,0.3,squared_error,squared_error,6,0.993696,-0.000188
3,14,0.3,squared_error,squared_error,6,0.995292,-0.000139
4,5,0.3,squared_error,squared_error,8,0.965997,-0.001017
...,...,...,...,...,...,...,...
187,14,0.7,absolute_error,friedman_mse,10,0.989985,-0.000299
188,5,0.7,absolute_error,friedman_mse,12,0.984306,-0.000472
189,7,0.7,absolute_error,friedman_mse,12,0.989181,-0.000326
190,10,0.7,absolute_error,friedman_mse,12,0.990096,-0.000298


In [9]:
# printing the best combination of parameter that perform the best
print(regr.best_params_)

# the best performing score
print(regr.best_score_)

{'criterion': 'squared_error', 'learning_rate': 0.3, 'loss': 'squared_error', 'max_depth': 8, 'n_estimators': 14}
0.996133262353499


In [10]:
import time
from sklearn.ensemble import GradientBoostingRegressor

regressor = GradientBoostingRegressor(n_estimators=regr.best_params_['n_estimators'],
                            learning_rate=regr.best_params_['learning_rate'],
                            loss=regr.best_params_['loss'],
                            max_depth=regr.best_params_['max_depth'],
                            criterion=regr.best_params_['criterion'],
                      
                            verbose=1)

start = time.time() # starting of the time
regressor.fit(X_train, y_train) # fit/train the model

end = time.time() # ending of the time
print('run_time:', convert(end - start), 'h:m:s')

pred_1 = regressor.predict(X_train) # prediction for training set
pred_2 = regressor.predict(X_test) # prediction for testing set
pred_3 = regressor.predict(X) # prediction for whole dataset

      Iter       Train Loss   Remaining Time 
         1           0.0147            5.26s
         2           0.0073            6.77s
         3           0.0036            5.40s
         4           0.0018            4.37s
         5           0.0009            3.65s
         6           0.0005            3.13s
         7           0.0002            2.68s
         8           0.0001            2.22s
         9           0.0001            1.81s
        10           0.0000            1.47s
run_time: 0:00:04 h:m:s


In [11]:
# saving training, testing and y data with their predicted data in a excel file with different sheets
a = np.stack([y_train, pred_1], axis=1) # training
a = pd.DataFrame(a, columns=['y_train', 'y_train_pred'])

b = np.stack([y_test, pred_2], axis=1) # testing 
b = pd.DataFrame(b, columns=['y_test', 'y_test_pred'])

c = np.stack([y, pred_3], axis=1) # whole data
c = pd.DataFrame(c, columns=['y', 'y_pred'])

with pd.ExcelWriter("./output/gradient_boosting_B/predicted_gradient_boosting_model.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index
    # to store the dataframe in specified sheet
    a.to_excel(writer, sheet_name="training", index=True)
    b.to_excel(writer, sheet_name="testing", index=True)
    c.to_excel(writer, sheet_name="whole_data", index=True)

In [27]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error

train_r2 = r2_score(y_train,pred_1) # r2_score for training set
train_mae = mean_absolute_error(y_train, pred_1) # mean absolute error for training set
train_mse = mean_squared_error(y_train, pred_1) # mean squared error for training set
train_mape = mean_absolute_percentage_error(y_train, pred_1) # mean_absolute_percentage_error for training set
train_ev = explained_variance_score(y_train, pred_1) 
train_maxE = max_error(y_train, pred_1)
train_minE = min(abs(y_train - pred_1))

test_r2 = r2_score(y_test, pred_2) # r2_score for testing set
test_mae = mean_absolute_error(y_test, pred_2) # mean absolute error for testing set
test_mse = mean_squared_error(y_test, pred_2) # mean squared error for testing set
test_mape = mean_absolute_percentage_error(y_test, pred_2) # mean_absolute_percentage_error for testing set
test_ev = explained_variance_score(y_test, pred_2) 
test_maxE = max_error(y_test, pred_2)
test_minE = min(abs(y_test - pred_2))

r2 = r2_score(y, pred_3) # r2_score for whole dataset
mae = mean_absolute_error(y, pred_3) # mean absolute error for whole dataset
mse = mean_squared_error(y, pred_3) # mean squared error for whole dataset
mape = mean_absolute_percentage_error(y, pred_3) # mean_absolute_percentage_error for whole dataset
ev = explained_variance_score(y, pred_3) 
maxE = max_error(y, pred_3)
minE = min(abs(y - pred_3))

metrics = {
'performance_metrics': ['R2', 'MAE', 'MSE', 'MAPE', 'EV', 'maxE', 'minE'],
'training': [train_r2, train_mae, train_mse, train_mape, train_ev, train_maxE, train_minE],
'testing': [test_r2, test_mae, test_mse, test_mape, test_ev, test_maxE, test_minE],
'whole': [r2, mae, mse, mape, ev, maxE, minE]
}

performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
performance.to_csv('./output/gradient_boosting_B/performance_gradient_boosting.csv')
performance

ValueError: Found input variables with inconsistent numbers of samples: [356, 9737]

In [13]:
#from sklearn.externals import joblib
from joblib import dump, load
dump(regressor, './output/gradient_boosting_B/trained_gradient_boosting_model.joblib')

#clf = load('trained_linear_regression_model.joblib')

['./output/gradient_boosting_B/trained_gradient_boosting_model.joblib']

In [1]:
#from sklearn.externals import joblib
from joblib import dump, load

In [2]:

from joblib import Parallel, delayed 
import joblib 
reg=load(r'C:\Users\absid\Downloads\v shale\output\xgboost\trained_gradient_boosting_model.joblib')

configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.



In [3]:
df_11A=pd.read_csv('Volvo T2 vsh_B.csv')

<IPython.core.display.Javascript object>

In [24]:
df_11A

Unnamed: 0,DEPTH,CALI,DRHO,NPHI,PEF,ROP,RT,KLOGH,PHIF,VSH
0,4335.0,8.703,0.057,0.214,6.779,9.729,2.194,0.001,0.091,0.794
1,4335.1,8.687,0.057,0.216,6.800,10.004,2.261,0.001,0.088,0.785
2,4335.2,8.672,0.052,0.214,6.869,10.279,2.249,0.001,0.089,0.783
3,4335.3,8.672,0.046,0.209,6.956,10.554,2.231,0.001,0.090,0.765
4,4335.4,8.693,0.042,0.202,7.019,11.559,2.231,0.001,0.090,0.763
...,...,...,...,...,...,...,...,...,...,...
1775,4512.5,8.781,0.083,0.185,6.925,14.965,2.299,0.001,0.055,0.814
1776,4512.6,8.781,0.081,0.184,7.029,14.960,2.234,0.002,0.057,0.799
1777,4512.7,8.781,0.077,0.181,7.117,14.956,2.195,0.002,0.060,0.809
1778,4512.8,8.781,0.075,0.177,7.182,14.953,2.114,0.002,0.061,0.818


In [6]:
# selecting features and label
X_T2 = df_11A.iloc[:, :-1]  # features
y = df_11A.iloc[:,-1]  # label



In [7]:



predict2=reg.predict(X_T2)

In [8]:
# saving training, testing and y data with their predicted data in a excel file with different sheets
a = np.stack([X_T2.iloc[:,0],y,predict2], axis=1) # training
a = pd.DataFrame(a, columns=['DEPTH','VSH', 'VSH_Pred'])



with pd.ExcelWriter("./output/gradient_boosting_B/predicted__T2 WHOLE PREDICT.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index
    # to store the dataframe in specified sheet
    a.to_excel(writer, sheet_name="T2 WHOLE", index=True)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error

test_r2 = r2_score(y, predict2) # r2_score for testing set
test_mae = mean_absolute_error(y,predict2) # mean absolute error for testing set
test_mse = mean_squared_error(y, predict2) # mean squared error for testing set
test_mape = mean_absolute_percentage_error(y, predict2) # mean_absolute_percentage_error for testing set
test_ev = explained_variance_score(y, predict2) 
test_maxE = max_error(y, predict2)
test_minE = min(abs(y - predict2))


metrics = {
'performance_metrics': ['R2', 'MAE', 'MSE', 'MAPE', 'EV', 'maxE', 'minE'],
#'training': [train_r2, train_mae, train_mse, train_mape, train_ev, train_maxE, train_minE],
'testing': [test_r2, test_mae, test_mse, test_mape, test_ev, test_maxE, test_minE],
#'whole': [r2, mae, mse, mape, ev, maxE, minE]
}

performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
performance.to_csv('./output/gradient_boosting_B/performance_gradient_boosting_T2.csv')
performance

<IPython.core.display.Javascript object>

Unnamed: 0,0,1,2,3,4,5,6
performance_metrics,R2,MAE,MSE,MAPE,EV,maxE,minE
testing,0.881378,0.049379,0.007828,0.152697,0.883312,0.381251,0.000001


In [13]:
df_11A=pd.read_csv('Volvo 11 A Vsh_2.csv')
# selecting features and label
X = df_11A.iloc[:, :-1]  # features
y = df_11A.iloc[:,-1]  # label
X.shape, y.shape


predict3=reg.predict(X)

<IPython.core.display.Javascript object>

In [12]:
# saving training, testing and y data with their predicted data in a excel file with different sheets
a = np.stack([X.iloc[:,0],y,predict3], axis=1) # training
a = pd.DataFrame(a, columns=['Depth','vsh', 'vsh_pred'])

with pd.ExcelWriter("./output/gradient_boosting_B/predicted_11A_whole.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index
    # to store the dataframe in specified sheet
    a.to_excel(writer, sheet_name="11_WHOLE", index=True)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error

test_r2 = r2_score(y, predict3) # r2_score for testing set
test_mae = mean_absolute_error(y,predict3) # mean absolute error for testing set
test_mse = mean_squared_error(y, predict3) # mean squared error for testing set
test_mape = mean_absolute_percentage_error(y, predict3) # mean_absolute_percentage_error for testing set
test_ev = explained_variance_score(y, predict3) 
test_maxE = max_error(y, predict3)
test_minE = min(abs(y - predict3))


metrics = {
'performance_metrics': ['R2', 'MAE', 'MSE', 'MAPE', 'EV', 'maxE', 'minE'],
#'training': [train_r2, train_mae, train_mse, train_mape, train_ev, train_maxE, train_minE],
'testing': [test_r2, test_mae, test_mse, test_mape, test_ev, test_maxE, test_minE],
#'whole': [r2, mae, mse, mape, ev, maxE, minE]
}

performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
performance.to_csv('./output/gradient_boosting_B/performance_gradient_boosting_11A.csv')
performance

<IPython.core.display.Javascript object>

Unnamed: 0,0,1,2,3,4,5,6
performance_metrics,R2,MAE,MSE,MAPE,EV,maxE,minE
testing,0.833677,0.047304,0.009214,0.151775,0.851098,0.438083,0.0
