In [1]:
import os
import time

import numpy as np
import pandas as pd
from mlxtend.regressor import StackingCVRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [2]:
# Load the table from which the features, label and train/test split are derived.
# The path uses a forward slash so it resolves on Windows, Linux and macOS;
# the previous backslash separator was only understood on Windows.
df=pd.read_csv('USED DATA/Volvo 11 B Vsh.csv')

In [3]:
# Pairwise correlation matrix of every column in the loaded table
# (Pearson is the pandas default, spelled out here for the reader).
df.corr(method="pearson")

Unnamed: 0,DEPTH,CALI,DRHO,NPHI,PEF,RT,KLOGH,PHIF,VSH
DEPTH,1.0,0.342475,0.328821,-0.307207,-0.224029,-0.042971,-0.033118,-0.019166,-0.244497
CALI,0.342475,1.0,0.424115,0.243958,0.542193,-0.051308,-0.219168,-0.437132,0.067135
DRHO,0.328821,0.424115,1.0,-0.004932,0.204444,-0.082135,-0.245458,-0.400756,0.150819
NPHI,-0.307207,0.243958,-0.004932,1.0,0.651978,-0.049798,-0.326178,-0.408404,0.555749
PEF,-0.224029,0.542193,0.204444,0.651978,1.0,-0.027295,-0.451424,-0.742793,0.50478
RT,-0.042971,-0.051308,-0.082135,-0.049798,-0.027295,1.0,-0.065635,0.026661,0.035988
KLOGH,-0.033118,-0.219168,-0.245458,-0.326178,-0.451424,-0.065635,1.0,0.546313,-0.59816
PHIF,-0.019166,-0.437132,-0.400756,-0.408404,-0.742793,0.026661,0.546313,1.0,-0.611219
VSH,-0.244497,0.067135,0.150819,0.555749,0.50478,0.035988,-0.59816,-0.611219,1.0


In [6]:
# Peek at the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,DEPTH,CALI,DRHO,NPHI,PEF,RT,KLOGH,PHIF,VSH
0,3351.6,8.682,0.061,0.277,6.865,2.314,0.001,0.055,0.869
1,3351.7,8.672,0.059,0.283,6.73,2.373,0.001,0.058,0.892
2,3351.8,8.625,0.057,0.285,6.58,2.309,0.001,0.061,0.881
3,3351.9,8.578,0.057,0.28,6.467,2.255,0.001,0.062,0.886
4,3352.0,8.601,0.056,0.267,6.4,2.309,0.001,0.064,0.876


In [5]:
# Peek at the last five rows of the loaded table.
df.tail(n=5)

Unnamed: 0,DEPTH,CALI,DRHO,NPHI,PEF,RT,KLOGH,PHIF,VSH
13906,4744.3,8.641,0.081,0.115,4.329,1.809,0.703,0.098,0.142
13907,4744.4,8.672,0.077,0.118,4.271,1.487,0.662,0.099,0.171
13908,4744.5,8.672,0.07,0.121,4.162,1.208,0.645,0.101,0.205
13909,4744.6,8.671,0.064,0.12,4.087,1.137,0.741,0.103,0.218
13910,4744.7,8.568,0.059,0.125,4.136,1.093,0.916,0.107,0.227


In [4]:
# Positional column selection: the last column is the label, the first column
# is skipped, and everything in between is used as model input.
y = df.iloc[:, -1]
X = df.iloc[:, 1:-1]

# Hold out 30 % of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=27,
)

In [5]:
# Display the feature matrix selected above (pandas truncates the middle rows).
X

Unnamed: 0,CALI,DRHO,NPHI,PEF,RT,KLOGH,PHIF
0,8.682,0.061,0.277,6.865,2.314,0.001,0.055
1,8.672,0.059,0.283,6.730,2.373,0.001,0.058
2,8.625,0.057,0.285,6.580,2.309,0.001,0.061
3,8.578,0.057,0.280,6.467,2.255,0.001,0.062
4,8.601,0.056,0.267,6.400,2.309,0.001,0.064
...,...,...,...,...,...,...,...
13906,8.641,0.081,0.115,4.329,1.809,0.703,0.098
13907,8.672,0.077,0.118,4.271,1.487,0.662,0.099
13908,8.672,0.070,0.121,4.162,1.208,0.645,0.101
13909,8.671,0.064,0.120,4.087,1.137,0.741,0.103


In [7]:
# Row/column counts of the feature matrix and length of the label vector.
(X.shape, y.shape)

((13911, 7), (13911,))

In [8]:
def convert(seconds):
    """Format an elapsed duration given in seconds as an ``h:mm:ss`` string.

    Used to report how long a model took to train.

    Parameters
    ----------
    seconds : int or float
        Elapsed time in seconds. Fractional seconds are truncated; a
        negative value is reported as ``0:00:00``.

    Returns
    -------
    str
        ``hours:minutes:seconds`` with zero-padded minutes and seconds.
        Hours are not wrapped at 24, so a 25-hour run reads ``25:00:00``
        instead of silently losing a day.
    """
    # Whole seconds only; clamp at zero so a negative interval cannot produce
    # a nonsensical clock value.
    total_seconds = max(0, int(seconds))
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return "%d:%02d:%02d" % (hours, minutes, secs)

In [9]:
import time

In [10]:
# Base models for the stack. Both are seeded so that a Restart & Run All
# reproduces the same fitted models; 2 is the same value that is passed as
# random_state to the stacking regressor.
xgb = XGBRegressor(random_state=2)
gbr = GradientBoostingRegressor(random_state=2)

# Meta-model handed to the stacking regressor as its `meta_regressor`.
meta_model = LinearRegression()

In [11]:
# Combine the two base regressors under the linear meta-model.
# StackingCVRegressor is imported in the notebook's top import cell.
stacked_model = StackingCVRegressor(
    regressors=(xgb, gbr),
    meta_regressor=meta_model,
    cv=5,
    random_state=2,
)

# perf_counter() is a monotonic clock meant for measuring intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start = time.perf_counter()
stacked_model.fit(X_train, y_train)
end = time.perf_counter()

print('run_time:', convert(end-start), 'h:m:s')


run_time: 0:00:35 h:m:s


In [12]:
# Predictions of the fitted stack for each partition of the data.
pred_1 = stacked_model.predict(X_train)  # training rows
pred_2 = stacked_model.predict(X_test)   # held-out test rows
pred_3 = stacked_model.predict(X)        # every row (train + test)



In [13]:
import os

# Folder that receives the files exported by this notebook. It is created
# together with any missing parents; an already existing folder is fine.
output_dir = "./output/stack_1"
os.makedirs(name=output_dir, exist_ok=True)

In [15]:
# Pair the observed label with the stacked model's prediction, one frame per partition.
train_results = pd.DataFrame(
    np.column_stack((y_train, pred_1)), columns=['y_train', 'y_train_pred']
)  # training rows
test_results = pd.DataFrame(
    np.column_stack((y_test, pred_2)), columns=['y_test', 'y_test_pred']
)  # held-out test rows
whole_results = pd.DataFrame(
    np.column_stack((y, pred_3)), columns=['y', 'y_pred']
)  # all rows

# Build the workbook path from `output_dir` (the folder created earlier)
# rather than repeating the literal directory.
predictions_path = os.path.join(output_dir, "predicted_stack_1_model_GCV.xlsx")

with pd.ExcelWriter(predictions_path) as writer:
    # One sheet per partition. The index written is the positional row number
    # of each frame, not the row label of the source table.
    train_results.to_excel(writer, sheet_name="training", index=True)
    test_results.to_excel(writer, sheet_name="testing", index=True)
    whole_results.to_excel(writer, sheet_name="whole_data", index=True)

In [16]:
def _regression_metrics(y_true, y_pred):
    """Return ``[R2, MAE, MSE, MAPE, EV, maxE, minE]`` for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed label values.
    y_pred : array-like
        Predicted values, aligned element-wise with ``y_true``.

    Returns
    -------
    list
        The seven scores, in the same order as the ``performance_metrics``
        labels used in the table below.
    """
    abs_error = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    return [
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        mean_squared_error(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred),
        explained_variance_score(y_true, y_pred),
        max_error(y_true, y_pred),
        abs_error.min(),  # smallest absolute residual
    ]


# One column of scores per partition; the metric functions come from the
# notebook's top import cell.
metrics = {
    'performance_metrics': ['R2', 'MAE', 'MSE', 'MAPE', 'EV', 'maxE', 'minE'],
    'training': _regression_metrics(y_train, pred_1),
    'testing': _regression_metrics(y_test, pred_2),
    'whole': _regression_metrics(y, pred_3),
}

performance_metrics = pd.DataFrame(metrics)
# Transposed so that each partition becomes a row of the exported table.
performance = performance_metrics.transpose()
performance.to_csv(os.path.join(output_dir, 'performance_stack_1.csv'))
performance

Unnamed: 0,0,1,2,3,4,5,6
performance_metrics,R2,MAE,MSE,MAPE,EV,maxE,minE
training,0.999089,0.002747,0.000027,0.013093,0.999089,0.102122,0.0
testing,0.990637,0.005483,0.000293,0.022876,0.990647,0.447455,0.0
whole,0.996469,0.003567,0.000107,0.016029,0.996471,0.447455,0.0


In [17]:
# Score a separate CSV ("Volvo T2 vsh.csv") with the already fitted stack.
# Forward-slash path so the cell also runs outside Windows; the frame is named
# after the file it holds.
df_T2=pd.read_csv('USED DATA/Volvo T2 vsh.csv')

# Same positional selection as for the training table.
# NOTE(review): assumes this file has the same column layout as the training file — TODO confirm.
X_T2 = df_T2.iloc[:, 1:-1]  # features
# NOTE(review): `y` is rebound here to this file's label column (it previously
# held the training table's label); the following cells read it under this name.
y = df_T2.iloc[:,-1]  # label

predict2=stacked_model.predict(X_T2)



In [18]:
# saving training, testing and y data with their predicted data in a excel file with different sheets
a = np.stack([y,predict2], axis=1) # training
a = pd.DataFrame(a, columns=['VSH', 'VSH_Pred'])



with pd.ExcelWriter("./output/stack_1/predicted__T2 WHOLE PREDICT.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index
    # to store the dataframe in specified sheet
    a.to_excel(writer, sheet_name="T2 WHOLE", index=True)

In [19]:
# Scores of the stacked model on the T2 file (`y` vs. `predict2`).
# They are computed straight into the table instead of into `test_*`
# variables, so the hold-out test scores computed earlier are not overwritten.
# The metric functions come from the notebook's top import cell.
t2_abs_error = np.abs(np.asarray(y) - np.asarray(predict2))

metrics = {
    'performance_metrics': ['R2', 'MAE', 'MSE', 'MAPE', 'EV', 'maxE', 'minE'],
    # The 'testing' label is kept so the exported CSV keeps its existing layout.
    'testing': [
        r2_score(y, predict2),
        mean_absolute_error(y, predict2),
        mean_squared_error(y, predict2),
        mean_absolute_percentage_error(y, predict2),
        explained_variance_score(y, predict2),
        max_error(y, predict2),
        t2_abs_error.min(),  # smallest absolute residual
    ],
}

performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
performance.to_csv(os.path.join(output_dir, 'performance_T2.csv'))
performance

Unnamed: 0,0,1,2,3,4,5,6
performance_metrics,R2,MAE,MSE,MAPE,EV,maxE,minE
testing,0.910309,0.041247,0.005918,0.131291,0.911049,0.402095,0.000011


In [20]:
# Score a further CSV ("Volvo 11 A Vsh.csv") with the already fitted stack.
# Forward-slash path so the cell also runs outside Windows.
df_11A=pd.read_csv('USED DATA/Volvo 11 A Vsh.csv')

# Same positional selection as for the training table. The features get their
# own name so the training feature matrix `X` is not overwritten.
# NOTE(review): assumes this file has the same column layout as the training file — TODO confirm.
X_11A = df_11A.iloc[:, 1:-1]  # features
# NOTE(review): `y` is rebound to this file's label column; the following
# cells read it under this name.
y = df_11A.iloc[:,-1]  # label

predict3=stacked_model.predict(X_11A)



In [21]:
# saving training, testing and y data with their predicted data in a excel file with different sheets
a = np.stack([y,predict3], axis=1) # training
a = pd.DataFrame(a, columns=['vsh','vsh_pred'])

with pd.ExcelWriter("./output/stack_1/predicted_11A_whole.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index
    # to store the dataframe in specified sheet
    a.to_excel(writer, sheet_name="11_WHOLE", index=True)

In [22]:
# Scores of the stacked model on the 11 A file (`y` vs. `predict3`).
# They are computed straight into the table instead of into `test_*`
# variables, so the hold-out test scores computed earlier are not overwritten.
# The metric functions come from the notebook's top import cell.
abs_error_11a = np.abs(np.asarray(y) - np.asarray(predict3))

metrics = {
    'performance_metrics': ['R2', 'MAE', 'MSE', 'MAPE', 'EV', 'maxE', 'minE'],
    # The 'testing' label is kept so the exported CSV keeps its existing layout.
    'testing': [
        r2_score(y, predict3),
        mean_absolute_error(y, predict3),
        mean_squared_error(y, predict3),
        mean_absolute_percentage_error(y, predict3),
        explained_variance_score(y, predict3),
        max_error(y, predict3),
        abs_error_11a.min(),  # smallest absolute residual
    ],
}

performance_metrics = pd.DataFrame(metrics)
performance = performance_metrics.transpose()
performance.to_csv(os.path.join(output_dir, 'performance_11A.csv'))
performance

Unnamed: 0,0,1,2,3,4,5,6
performance_metrics,R2,MAE,MSE,MAPE,EV,maxE,minE
testing,0.85066,0.04301,0.008273,0.143646,0.86214,0.447321,0.000003


In [23]:
from joblib import dump, load

# Persist the fitted stacked model; the call is left as the cell's last
# expression so its return value is displayed.
dump(
    value=stacked_model,
    filename='./output/stack_1/trained_stack_1_model.joblib',
)

['./output/stack_1/trained_stack_1_model.joblib']