In [1]:
import torch
import scipy.io as io
import plotly.offline as py
import plotly.graph_objs as go
import json
import numpy as np
from sklearn.svm import SVC
from sklearn import metrics

import pickle 

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn.metrics import mean_squared_error
import math as m

# Initialise plotly's offline notebook mode (the `connected=True` flag is
# passed straight through to plotly).
py.init_notebook_mode(connected=True)


In [2]:
# Loading Matlab Struct array with all of the data:
# `mat` is whatever scipy.io.loadmat returns for the file; the data used by
# the following cells lives under the key 'A'.

mat = io.loadmat('data_Mg_GBperatom_seg_2Al_dump.mat')

# Extent of mat['A'] along its second axis.
# NOTE(review): presumably the number of struct entries; the organising loop
# below hard-codes 30 instead of using this value -- confirm they agree.
length_A = mat['A'].shape[1]

In [3]:
# Organizing data:
# For each of the first 30 entries of mat['A'], keep the rows of 'Eseg' whose
# first column is non-zero, pick the per-atom descriptor rows those entries
# point at, and stack everything into `descriptor_all` / `segE_all`.

# NOTE(review): 30 is hard-coded although `length_A` is available -- confirm
# whether every entry of mat['A'] is meant to be used.
descriptor_blocks = []
segE_blocks = []
for i in range(30):
    segE = mat['A']['Eseg'][0,i]
    # Rows whose first column is zero are treated as invalid and dropped.
    n1 = segE[:,0] != 0
    # Boolean row indexing already yields a 2-D array.  The np.squeeze that
    # used to wrap this collapsed a single surviving row to 1-D, which made
    # the `segE[:,0]` lookup on the next line raise an IndexError.
    segE = segE[n1,:]
    # The first column is converted to 0-based row indices
    # (presumably 1-based MATLAB atom IDs -- TODO confirm).
    atom_ID = segE[:,0].astype(int) - 1

    descriptor = mat['A']['peratom'][0,i][0,0]
    # Column-wise stack of all per-atom descriptor fields, in this fixed order.
    descriptor_temp = np.concatenate([descriptor['pos'],descriptor['pe'],descriptor['cna'],descriptor['centro_fnn'],
                                descriptor['centro_snn'],descriptor['coord'],descriptor['f'],descriptor['stress'],
                                descriptor['voronoi']], axis = 1)

    # Collect the pieces and concatenate once after the loop instead of
    # re-copying the growing arrays on every iteration.
    descriptor_blocks.append(descriptor_temp[atom_ID])
    segE_blocks.append(segE)

descriptor_all = np.concatenate(descriptor_blocks, axis = 0)
segE_all = np.concatenate(segE_blocks)

# Column 2 is replaced by |value - column minimum - 20|.
# NOTE(review): presumably a distance from a reference plane 20 units above
# the lowest z coordinate -- confirm the meaning of the constant.
z_column = descriptor_all[:,2]
descriptor_all[:,2] = np.abs(z_column - z_column.min() - 20)

# Mean of columns 11..13 (presumably the normal stress components, i.e. the
# hydrostatic stress -- TODO confirm column layout).
sigma_H = np.sum(descriptor_all[:,11:14], axis = 1)/3
# Euclidean norm of columns 8..10 (presumably the force vector -- TODO confirm).
f_mag = np.linalg.norm(descriptor_all[:,8:11], axis = 1, ord = 2)

# Final per-atom feature table: all descriptors plus the two derived columns.
feature = np.concatenate([descriptor_all, sigma_H[:,np.newaxis], f_mag[:,np.newaxis]], axis = 1)

# XGBOOST

In [4]:
# Extreme Gradient Boosted Decision Trees (XGBOOST)  **MODEL B**
# Builds the design matrix (selected descriptors + pairwise products), shuffles
# it with a fixed seed and cuts it into five folds for cross-validation.

y_true = segE_all[:, 1]
pos = feature[:, :3]
# Coordinate columns as (N, 1) arrays so they can be stacked with descriptors.
xpos, ypos, zpos = pos[:, 0:1], pos[:, 1:2], pos[:, 2:3]

feature1 = feature[:, 3:]
# One (N, 1) column array per descriptor column 0..17.
(f0, f1, f2, f3, f4, f5, f6, f7, f8,
 f9, f10, f11, f12, f13, f14, f15, f16, f17) = (
    feature1[:, k:k + 1] for k in range(18)
)

# From Feature Elimination: the retained descriptor columns, then x and y.
feature1 = np.hstack((f0, f1, f2, f3, f4, f8, f9, f10, f11, f13, f14, f15, f16, xpos, ypos))

n = feature1.shape[1]
print(n)

# Scale the whole matrix by its single largest entry.
feature1 = feature1 / feature1.max()

# Interaction terms: block k holds column k multiplied by every column.
n_rows = feature1.shape[0]
feature2 = np.zeros((n_rows, n * n))
for k, column in enumerate(feature1.T):
    feature2[:, k * n:(k + 1) * n] = column[:, np.newaxis] * feature1

# Layout: [column of ones | selected features | interaction terms].
feature_space = np.hstack((np.ones((n_rows, 1)), feature1, feature2))

# Seeded shuffle, applied identically to rows, coordinates and targets.
np.random.seed(10)
idx0 = np.random.permutation(np.arange(len(feature_space)))
feature_space, pos, y_true = feature_space[idx0], pos[idx0], y_true[idx0]

# Fold boundaries at 20/40/60/80 % of the rows; the last fold takes the rest.
n_samples = len(feature_space)
fold_edges = [int(n_samples * fraction) for fraction in (.2, .4, .6, .8)]

X_fold1, X_fold2, X_fold3, X_fold4, X_fold5 = np.split(feature_space, fold_edges)
X_folds = (X_fold1, X_fold2, X_fold3, X_fold4, X_fold5)

# Per-fold column statistics.  They are computed but not applied: the
# per-fold standardisation (X_fold - X_mean) / X_std is currently disabled.
X_std1, X_std2, X_std3, X_std4, X_std5 = (np.std(fold, axis=0) for fold in X_folds)
X_mean1, X_mean2, X_mean3, X_mean4, X_mean5 = (np.mean(fold, axis=0) for fold in X_folds)

y_fold1, y_fold2, y_fold3, y_fold4, y_fold5 = np.split(y_true, fold_edges)
y_folds = (y_fold1, y_fold2, y_fold3, y_fold4, y_fold5)

# Report the fold shapes (a blank line follows each group).
for fold in X_folds[:-1]:
    print(fold.shape)
print(X_fold5.shape,'\n')

for fold in y_folds[:-1]:
    print(fold.shape)
print(y_fold5.shape, '\n')

# Round r trains on four folds (kept in their original order) and tests on
# the remaining one: round 1 holds out fold 5, round 2 fold 4, ..., round 5 fold 1.
held_out_per_round = (4, 3, 2, 1, 0)

X_train1, X_train2, X_train3, X_train4, X_train5 = (
    np.concatenate([fold for k, fold in enumerate(X_folds) if k != held_out], axis=0)
    for held_out in held_out_per_round
)
y_train1, y_train2, y_train3, y_train4, y_train5 = (
    np.concatenate([fold for k, fold in enumerate(y_folds) if k != held_out], axis=0)
    for held_out in held_out_per_round
)
X_test1, X_test2, X_test3, X_test4, X_test5 = (X_folds[held_out] for held_out in held_out_per_round)
y_test1, y_test2, y_test3, y_test4, y_test5 = (y_folds[held_out] for held_out in held_out_per_round)



15
(2846, 241)
(2846, 241)
(2846, 241)
(2846, 241)
(2847, 241) 

(2846,)
(2846,)
(2846,)
(2846,)
(2847,) 



In [10]:
# Accumulators for the XGBoost cross-validation R^2 scores; the scoring cells
# below append one value per round to each list.
R_squared_train_xgb_cv_list, R_squared_val_xgb_cv_list = [], []


In [11]:
# Accumulator for the XGBoost validation RMSE of each cross-validation round.
rmse_xgb_list = list()

In [12]:
# CV round 1: select the round-1 training / held-out split.
X_trainCV, Y_trainCV = X_train1, y_train1
X_testCV, Y_testCV = X_test1, y_test1

# Gradient-boosted tree regressor with fixed hyper-parameters.
xgboostbcv1 = xgb.XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    learning_rate=0.07729127514917492,
    max_depth=19,
    min_child_weight=0.4,
    max_delta_step=0.6,
    subsample=0.55,
    colsample_bytree=0.8,
    reg_alpha=1e-5,
    reg_lambda=0.25,
)

# Training/Fitting:
modelxgbbcv1 = xgboostbcv1.fit(X_trainCV, Y_trainCV)



In [13]:
# R^2 of the round-1 XGBoost model on its training data and held-out fold.
R_squared_train = modelxgbbcv1.score(X_trainCV, Y_trainCV)
R_squared_val = modelxgbbcv1.score(X_testCV, Y_testCV)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val)):
    print(label, value)

# Record this round's scores.
for history, value in ((R_squared_train_xgb_cv_list, R_squared_train),
                       (R_squared_val_xgb_cv_list, R_squared_val)):
    history.append(value)

R_squared train:  0.9994362074950945
R_squared val:  0.973051104585609


In [14]:
# Re-select the round-1 split (same assignment as in the fit cell).
X_trainCV, Y_trainCV = X_train1, y_train1
X_testCV, Y_testCV = X_test1, y_test1

# Root-mean-square error of the round-1 model on its held-out fold.
y_test_pred_xgb = modelxgbbcv1.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_xgb)
rmse = np.sqrt(mse)
print('RMSE:',rmse)

rmse_xgb_list.append(rmse)


RMSE: 0.007835020271543959


In [9]:
# # Saving Model

# modelxgbbcv1_pkl_filename = 'modelxgbbcv1.pkl'
# modelxgbbcv1_pkl = open(modelxgbbcv1_pkl_filename, 'wb')
# pickle.dump(modelxgbbcv1, modelxgbbcv1_pkl)
# modelxgbbcv1_pkl.close()

### Round 2 XGB:

In [15]:
# CV round 2: select the round-2 training / held-out split.
X_trainCV, Y_trainCV = X_train2, y_train2
X_testCV, Y_testCV = X_test2, y_test2

# Gradient-boosted tree regressor with fixed hyper-parameters.
xgboostbcv2 = xgb.XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    learning_rate=0.07729127514917492,
    max_depth=19,
    min_child_weight=0.4,
    max_delta_step=0.6,
    subsample=0.55,
    colsample_bytree=0.8,
    reg_alpha=1e-5,
    reg_lambda=0.25,
)

# Training/Fitting:
modelxgbbcv2 = xgboostbcv2.fit(X_trainCV, Y_trainCV)



In [16]:
# R^2 of the round-2 XGBoost model on its training data and held-out fold.
R_squared_train = modelxgbbcv2.score(X_trainCV, Y_trainCV)
R_squared_val = modelxgbbcv2.score(X_testCV, Y_testCV)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val)):
    print(label, value)

# Record this round's scores.
for history, value in ((R_squared_train_xgb_cv_list, R_squared_train),
                       (R_squared_val_xgb_cv_list, R_squared_val)):
    history.append(value)

R_squared train:  0.9994382541553422
R_squared val:  0.9732455805041803


In [17]:
# Re-select the round-2 split (same assignment as in the fit cell).
X_trainCV, Y_trainCV = X_train2, y_train2
X_testCV, Y_testCV = X_test2, y_test2

# Root-mean-square error of the round-2 model on its held-out fold.
y_test_pred_xgb = modelxgbbcv2.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_xgb)
rmse = np.sqrt(mse)
print('RMSE:',rmse)

rmse_xgb_list.append(rmse)


RMSE: 0.007292880510033242


In [18]:
# # Saving Model

# modelxgbbcv2_pkl_filename = 'modelxgbbcv2.pkl'
# modelxgbbcv2_pkl = open(modelxgbbcv2_pkl_filename, 'wb')
# pickle.dump(modelxgbbcv2, modelxgbbcv2_pkl)
# modelxgbbcv2_pkl.close()

### Round 3 XGB:

In [19]:
# CV round 3: select the round-3 training / held-out split.
X_trainCV, Y_trainCV = X_train3, y_train3
X_testCV, Y_testCV = X_test3, y_test3

# Gradient-boosted tree regressor with fixed hyper-parameters.
xgboostbcv3 = xgb.XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    learning_rate=0.07729127514917492,
    max_depth=19,
    min_child_weight=0.4,
    max_delta_step=0.6,
    subsample=0.55,
    colsample_bytree=0.8,
    reg_alpha=1e-5,
    reg_lambda=0.25,
)

# Training/Fitting:
modelxgbbcv3 = xgboostbcv3.fit(X_trainCV, Y_trainCV)



In [20]:
# R^2 of the round-3 XGBoost model on its training data and held-out fold.
R_squared_train = modelxgbbcv3.score(X_trainCV, Y_trainCV)
R_squared_val = modelxgbbcv3.score(X_testCV, Y_testCV)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val)):
    print(label, value)

# Record this round's scores.
for history, value in ((R_squared_train_xgb_cv_list, R_squared_train),
                       (R_squared_val_xgb_cv_list, R_squared_val)):
    history.append(value)

R_squared train:  0.9994133662531399
R_squared val:  0.9701867577567458


In [21]:
# Re-select the round-3 split (same assignment as in the fit cell).
X_trainCV, Y_trainCV = X_train3, y_train3
X_testCV, Y_testCV = X_test3, y_test3

# Root-mean-square error of the round-3 model on its held-out fold.
y_test_pred_xgb = modelxgbbcv3.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_xgb)
rmse = np.sqrt(mse)
print('RMSE:',rmse)

rmse_xgb_list.append(rmse)


RMSE: 0.007732840386569825


In [22]:
# # Saving Model

# modelxgbbcv3_pkl_filename = 'modelxgbbcv3.pkl'
# modelxgbbcv3_pkl = open(modelxgbbcv3_pkl_filename, 'wb')
# pickle.dump(modelxgbbcv3, modelxgbbcv3_pkl)
# modelxgbbcv3_pkl.close()

### Round 4 XGB:

In [23]:
# CV round 4: select the round-4 training / held-out split.
X_trainCV, Y_trainCV = X_train4, y_train4
X_testCV, Y_testCV = X_test4, y_test4

# Gradient-boosted tree regressor with fixed hyper-parameters.
xgboostbcv4 = xgb.XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    learning_rate=0.07729127514917492,
    max_depth=19,
    min_child_weight=0.4,
    max_delta_step=0.6,
    subsample=0.55,
    colsample_bytree=0.8,
    reg_alpha=1e-5,
    reg_lambda=0.25,
)

# Training/Fitting:
modelxgbbcv4 = xgboostbcv4.fit(X_trainCV, Y_trainCV)



In [24]:
# R^2 of the round-4 XGBoost model on its training data and held-out fold.
R_squared_train = modelxgbbcv4.score(X_trainCV, Y_trainCV)
R_squared_val = modelxgbbcv4.score(X_testCV, Y_testCV)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val)):
    print(label, value)

# Record this round's scores.
for history, value in ((R_squared_train_xgb_cv_list, R_squared_train),
                       (R_squared_val_xgb_cv_list, R_squared_val)):
    history.append(value)

R_squared train:  0.9994678308994275
R_squared val:  0.9717185928353148


In [25]:
# Re-select the round-4 split (same assignment as in the fit cell).
X_trainCV, Y_trainCV = X_train4, y_train4
X_testCV, Y_testCV = X_test4, y_test4

# Root-mean-square error of the round-4 model on its held-out fold.
y_test_pred_xgb = modelxgbbcv4.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_xgb)
rmse = np.sqrt(mse)
print('RMSE:',rmse)

rmse_xgb_list.append(rmse)


RMSE: 0.007416170414426282


In [26]:
# # Saving Model

# modelxgbbcv4_pkl_filename = 'modelxgbbcv4.pkl'
# modelxgbbcv4_pkl = open(modelxgbbcv4_pkl_filename, 'wb')
# pickle.dump(modelxgbbcv4, modelxgbbcv4_pkl)
# modelxgbbcv4_pkl.close()

### Round 5 XGB:

In [27]:
# CV round 5: select the round-5 training / held-out split.
X_trainCV, Y_trainCV = X_train5, y_train5
X_testCV, Y_testCV = X_test5, y_test5

# Gradient-boosted tree regressor with fixed hyper-parameters.
xgboostbcv5 = xgb.XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    learning_rate=0.07729127514917492,
    max_depth=19,
    min_child_weight=0.4,
    max_delta_step=0.6,
    subsample=0.55,
    colsample_bytree=0.8,
    reg_alpha=1e-5,
    reg_lambda=0.25,
)

# Training/Fitting:
modelxgbbcv5 = xgboostbcv5.fit(X_trainCV, Y_trainCV)



In [28]:
# R^2 of the round-5 XGBoost model on its training data and held-out fold.
R_squared_train = modelxgbbcv5.score(X_trainCV, Y_trainCV)
R_squared_val = modelxgbbcv5.score(X_testCV, Y_testCV)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val)):
    print(label, value)

# Record this round's scores.
for history, value in ((R_squared_train_xgb_cv_list, R_squared_train),
                       (R_squared_val_xgb_cv_list, R_squared_val)):
    history.append(value)

R_squared train:  0.9993801839025819
R_squared val:  0.9717256871587105


In [29]:
# Re-select the round-5 split (same assignment as in the fit cell).
X_trainCV, Y_trainCV = X_train5, y_train5
X_testCV, Y_testCV = X_test5, y_test5

# Root-mean-square error of the round-5 model on its held-out fold.
y_test_pred_xgb = modelxgbbcv5.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_xgb)
rmse = np.sqrt(mse)
print('RMSE:',rmse)

rmse_xgb_list.append(rmse)


RMSE: 0.0077563271788740834


In [30]:
# # Saving Model

# modelxgbbcv5_pkl_filename = 'modelxgbbcv5.pkl'
# modelxgbbcv5_pkl = open(modelxgbbcv5_pkl_filename, 'wb')
# pickle.dump(modelxgbbcv5, modelxgbbcv5_pkl)
# modelxgbbcv5_pkl.close()

## XGB Results:

In [31]:
# Per-round XGBoost results: training R^2, validation R^2, validation RMSE.
for cv_results in (R_squared_train_xgb_cv_list, R_squared_val_xgb_cv_list, rmse_xgb_list):
    print(cv_results)

[0.9994362074950945, 0.9994382541553422, 0.9994133662531399, 0.9994678308994275, 0.9993801839025819]
[0.973051104585609, 0.9732455805041803, 0.9701867577567458, 0.9717185928353148, 0.9717256871587105]
[0.007835020271543959, 0.007292880510033242, 0.007732840386569825, 0.007416170414426282, 0.0077563271788740834]


In [67]:
# Spread across the CV rounds: standard deviation of the validation RMSE and
# of the validation R^2.  `rmse` is rebound to the list of per-round RMSEs.
rmse = rmse_xgb_list
for cv_metric in (rmse, R_squared_val_xgb_cv_list):
    print(np.std(cv_metric))

0.0002122328993311271
0.0011043126192053514


In [33]:
# Mean validation RMSE over the XGBoost CV rounds.
# Read the accumulator directly: `rmse` is also rebound to a scalar by every
# per-round RMSE cell, so relying on it here silently depended on which cell
# happened to run last.
print(np.mean(rmse_xgb_list))

0.007606647752289479


# Extra Trees

In [34]:
# EXTRA-TREES:
# Builds the design matrix (selected descriptors + pairwise products), shuffles
# it with a fixed seed and cuts it into five folds for cross-validation.

y_true = segE_all[:, 1]
pos = feature[:, :3]
# Coordinate columns as (N, 1) arrays so they can be stacked with descriptors.
xpos, ypos, zpos = pos[:, 0:1], pos[:, 1:2], pos[:, 2:3]

feature1 = feature[:, 3:]
# One (N, 1) column array per descriptor column 0..17.
(f0, f1, f2, f3, f4, f5, f6, f7, f8,
 f9, f10, f11, f12, f13, f14, f15, f16, f17) = (
    feature1[:, k:k + 1] for k in range(18)
)

# Retained descriptor columns, then the x and y coordinates.
feature1 = np.hstack((f0, f1, f2, f3, f4, f8, f9, f10, f11, f13, f14, f15, f16, xpos, ypos))

n = feature1.shape[1]

# Interaction terms: block k holds column k multiplied by every column.
n_rows = feature1.shape[0]
feature2 = np.zeros((n_rows, n * n))
for k, column in enumerate(feature1.T):
    feature2[:, k * n:(k + 1) * n] = column[:, np.newaxis] * feature1

# Layout: [column of ones | selected features | interaction terms].
feature_space = np.hstack((np.ones((n_rows, 1)), feature1, feature2))

# Seeded shuffle, applied identically to rows, coordinates and targets.
np.random.seed(10)
idx0 = np.random.permutation(np.arange(len(feature_space)))
feature_space, pos, y_true = feature_space[idx0], pos[idx0], y_true[idx0]

# Fold boundaries at 20/40/60/80 % of the rows; the last fold takes the rest.
n_samples = len(feature_space)
fold_edges = [int(n_samples * fraction) for fraction in (.2, .4, .6, .8)]

X_fold1, X_fold2, X_fold3, X_fold4, X_fold5 = np.split(feature_space, fold_edges)
X_folds = (X_fold1, X_fold2, X_fold3, X_fold4, X_fold5)

# Per-fold column statistics.  They are computed but not applied: the
# per-fold standardisation (X_fold - X_mean) / X_std is currently disabled.
X_std1, X_std2, X_std3, X_std4, X_std5 = (np.std(fold, axis=0) for fold in X_folds)
X_mean1, X_mean2, X_mean3, X_mean4, X_mean5 = (np.mean(fold, axis=0) for fold in X_folds)

y_fold1, y_fold2, y_fold3, y_fold4, y_fold5 = np.split(y_true, fold_edges)
y_folds = (y_fold1, y_fold2, y_fold3, y_fold4, y_fold5)

# Report the fold shapes (a blank line follows each group).
for fold in X_folds[:-1]:
    print(fold.shape)
print(X_fold5.shape,'\n')

for fold in y_folds[:-1]:
    print(fold.shape)
print(y_fold5.shape, '\n')

# Round r trains on four folds (kept in their original order) and tests on
# the remaining one: round 1 holds out fold 5, round 2 fold 4, ..., round 5 fold 1.
held_out_per_round = (4, 3, 2, 1, 0)

X_train1, X_train2, X_train3, X_train4, X_train5 = (
    np.concatenate([fold for k, fold in enumerate(X_folds) if k != held_out], axis=0)
    for held_out in held_out_per_round
)
y_train1, y_train2, y_train3, y_train4, y_train5 = (
    np.concatenate([fold for k, fold in enumerate(y_folds) if k != held_out], axis=0)
    for held_out in held_out_per_round
)
X_test1, X_test2, X_test3, X_test4, X_test5 = (X_folds[held_out] for held_out in held_out_per_round)
y_test1, y_test2, y_test3, y_test4, y_test5 = (y_folds[held_out] for held_out in held_out_per_round)



(2846, 241)
(2846, 241)
(2846, 241)
(2846, 241)
(2847, 241) 

(2846,)
(2846,)
(2846,)
(2846,)
(2847,) 



In [35]:
# Accumulators for the Extra-Trees cross-validation metrics; the metric cells
# below append one value per round to each list.
R_squared_train_xtree_cv_list, R_squared_val_xtree_cv_list, rmse_xtree_list = [], [], []

In [36]:
# CV round 1: select the round-1 training / held-out split.
X_trainCV = X_train1
Y_trainCV = y_train1
X_testCV = X_test1
Y_testCV = y_test1

# Extra-Trees ensemble of 330 trees; max_features=1.0 lets every split
# consider all features.  `criterion` is left at its default, which is the
# mean-squared-error criterion that the former 'mse' string selected: the
# literal 'mse' was deprecated in scikit-learn 1.0 and is rejected from 1.2 on.
xtreebcv1 = ExtraTreesRegressor(n_estimators = 330, max_features = 1.0, min_samples_split=2)

modelxtreebcv1 = xtreebcv1.fit(X_trainCV, Y_trainCV)



In [37]:
# Round-1 Extra-Trees metrics: R^2 on the training data and on the held-out
# fold, plus the RMSE on the held-out fold.
R_squared_train = modelxtreebcv1.score(X_trainCV, Y_trainCV)
R_squared_val = modelxtreebcv1.score(X_testCV, Y_testCV)

y_test_pred_xtree = modelxtreebcv1.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_xtree)
rmse = np.sqrt(mse)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val),
                     ('RMSE:', rmse)):
    print(label, value)

# Record this round's metrics.
for history, value in ((R_squared_train_xtree_cv_list, R_squared_train),
                       (R_squared_val_xtree_cv_list, R_squared_val),
                       (rmse_xtree_list, rmse)):
    history.append(value)

R_squared train:  0.9999979576066643
R_squared val:  0.9731556342007376
RMSE: 0.007819810234863282


In [38]:
# # Saving Model

# modelxtreebcv1_pkl_filename = 'modelxtreebcv1.pkl'
# modelxtreebcv1_pkl = open(modelxtreebcv1_pkl_filename, 'wb')
# pickle.dump(modelxtreebcv1, modelxtreebcv1_pkl)
# modelxtreebcv1_pkl.close()

### Round 2 Xtree

In [39]:
# CV round 2: select the round-2 training / held-out split.
X_trainCV = X_train2
Y_trainCV = y_train2
X_testCV = X_test2
Y_testCV = y_test2

# Extra-Trees ensemble of 330 trees; max_features=1.0 lets every split
# consider all features.  `criterion` is left at its default, which is the
# mean-squared-error criterion that the former 'mse' string selected: the
# literal 'mse' was deprecated in scikit-learn 1.0 and is rejected from 1.2 on.
xtreebcv2 = ExtraTreesRegressor(n_estimators = 330, max_features = 1.0, min_samples_split=2)

modelxtreebcv2 = xtreebcv2.fit(X_trainCV, Y_trainCV)



In [40]:
# Round-2 Extra-Trees metrics: R^2 on the training data and on the held-out
# fold, plus the RMSE on the held-out fold.
R_squared_train = modelxtreebcv2.score(X_trainCV, Y_trainCV)
R_squared_val = modelxtreebcv2.score(X_testCV, Y_testCV)

y_test_pred_xtree = modelxtreebcv2.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_xtree)
rmse = np.sqrt(mse)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val),
                     ('RMSE:', rmse)):
    print(label, value)

# Record this round's metrics.
for history, value in ((R_squared_train_xtree_cv_list, R_squared_train),
                       (R_squared_val_xtree_cv_list, R_squared_val),
                       (rmse_xtree_list, rmse)):
    history.append(value)

R_squared train:  0.9999980232565081
R_squared val:  0.9732858395502967
RMSE: 0.007287391417899801


In [33]:
# # Saving Model

# modelxtreebcv2_pkl_filename = 'modelxtreebcv2.pkl'
# modelxtreebcv2_pkl = open(modelxtreebcv2_pkl_filename, 'wb')
# pickle.dump(modelxtreebcv2, modelxtreebcv2_pkl)
# modelxtreebcv2_pkl.close()

### Round 3 Xtree

In [41]:
# CV round 3: select the round-3 training / held-out split.
X_trainCV = X_train3
Y_trainCV = y_train3
X_testCV = X_test3
Y_testCV = y_test3

# Extra-Trees ensemble of 330 trees; max_features=1.0 lets every split
# consider all features.  `criterion` is left at its default, which is the
# mean-squared-error criterion that the former 'mse' string selected: the
# literal 'mse' was deprecated in scikit-learn 1.0 and is rejected from 1.2 on.
xtreebcv3 = ExtraTreesRegressor(n_estimators = 330, max_features = 1.0, min_samples_split=2)

modelxtreebcv3 = xtreebcv3.fit(X_trainCV, Y_trainCV)



In [42]:
# Round-3 Extra-Trees metrics: R^2 on the training data and on the held-out
# fold, plus the RMSE on the held-out fold.
R_squared_train = modelxtreebcv3.score(X_trainCV, Y_trainCV)
R_squared_val = modelxtreebcv3.score(X_testCV, Y_testCV)

y_test_pred_xtree = modelxtreebcv3.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_xtree)
rmse = np.sqrt(mse)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val),
                     ('RMSE:', rmse)):
    print(label, value)

# Record this round's metrics.
for history, value in ((R_squared_train_xtree_cv_list, R_squared_train),
                       (R_squared_val_xtree_cv_list, R_squared_val),
                       (rmse_xtree_list, rmse)):
    history.append(value)

R_squared train:  0.9999980394803554
R_squared val:  0.9694402076802744
RMSE: 0.007829060349801934


In [43]:
# # Saving Model

# modelxtreebcv3_pkl_filename = 'modelxtreebcv3.pkl'
# modelxtreebcv3_pkl = open(modelxtreebcv3_pkl_filename, 'wb')
# pickle.dump(modelxtreebcv3, modelxtreebcv3_pkl)
# modelxtreebcv3_pkl.close()

### Round 4 Xtree

In [44]:
# CV round 4: select the round-4 training / held-out split.
X_trainCV = X_train4
Y_trainCV = y_train4
X_testCV = X_test4
Y_testCV = y_test4

# Extra-Trees ensemble of 330 trees; max_features=1.0 lets every split
# consider all features.  `criterion` is left at its default, which is the
# mean-squared-error criterion that the former 'mse' string selected: the
# literal 'mse' was deprecated in scikit-learn 1.0 and is rejected from 1.2 on.
xtreebcv4 = ExtraTreesRegressor(n_estimators = 330, max_features = 1.0, min_samples_split=2)

modelxtreebcv4 = xtreebcv4.fit(X_trainCV, Y_trainCV)



In [45]:
# Round-4 Extra-Trees metrics: R^2 on the training data and on the held-out
# fold, plus the RMSE on the held-out fold.
R_squared_train = modelxtreebcv4.score(X_trainCV, Y_trainCV)
R_squared_val = modelxtreebcv4.score(X_testCV, Y_testCV)

y_test_pred_xtree = modelxtreebcv4.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_xtree)
rmse = np.sqrt(mse)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val),
                     ('RMSE:', rmse)):
    print(label, value)

# Record this round's metrics.
for history, value in ((R_squared_train_xtree_cv_list, R_squared_train),
                       (R_squared_val_xtree_cv_list, R_squared_val),
                       (rmse_xtree_list, rmse)):
    history.append(value)

R_squared train:  0.9999980832660245
R_squared val:  0.972441164239135
RMSE: 0.007320818282651853


In [46]:
# # Saving Model

# modelxtreebcv4_pkl_filename = 'modelxtreebcv4.pkl'
# modelxtreebcv4_pkl = open(modelxtreebcv4_pkl_filename, 'wb')
# pickle.dump(modelxtreebcv4, modelxtreebcv4_pkl)
# modelxtreebcv4_pkl.close()

### Round 5 Xtree

In [47]:
# CV round 5: select the round-5 training / held-out split.
X_trainCV = X_train5
Y_trainCV = y_train5
X_testCV = X_test5
Y_testCV = y_test5

# Extra-Trees ensemble of 330 trees; max_features=1.0 lets every split
# consider all features.  `criterion` is left at its default, which is the
# mean-squared-error criterion that the former 'mse' string selected: the
# literal 'mse' was deprecated in scikit-learn 1.0 and is rejected from 1.2 on.
xtreebcv5 = ExtraTreesRegressor(n_estimators = 330, max_features = 1.0, min_samples_split=2)

modelxtreebcv5 = xtreebcv5.fit(X_trainCV, Y_trainCV)



In [48]:
# Round-5 Extra-Trees metrics: R^2 on the training data and on the held-out
# fold, plus the RMSE on the held-out fold.
R_squared_train = modelxtreebcv5.score(X_trainCV, Y_trainCV)
R_squared_val = modelxtreebcv5.score(X_testCV, Y_testCV)

y_test_pred_xtree = modelxtreebcv5.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_xtree)
rmse = np.sqrt(mse)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val),
                     ('RMSE:', rmse)):
    print(label, value)

# Record this round's metrics.
for history, value in ((R_squared_train_xtree_cv_list, R_squared_train),
                       (R_squared_val_xtree_cv_list, R_squared_val),
                       (rmse_xtree_list, rmse)):
    history.append(value)

R_squared train:  0.9999980435857009
R_squared val:  0.970974189612624
RMSE: 0.00785872818413047


In [42]:
# # Saving Model

# modelxtreebcv5_pkl_filename = 'modelxtreebcv5.pkl'
# modelxtreebcv5_pkl = open(modelxtreebcv5_pkl_filename, 'wb')
# pickle.dump(modelxtreebcv5, modelxtreebcv5_pkl)
# modelxtreebcv5_pkl.close()

### XTREE Results

In [43]:
# Per-round Extra-Trees results: training R^2, validation R^2, validation RMSE.
for cv_results in (R_squared_train_xtree_cv_list, R_squared_val_xtree_cv_list, rmse_xtree_list):
    print(cv_results)

[1.0, 1.0, 1.0, 1.0, 1.0]
[0.9732088416575441, 0.9734740206525311, 0.969061142741724, 0.9728166221412193, 0.971146530520792]
[0.0078120566787868865, 0.007261678969888768, 0.007877466699391034, 0.0072707783379404286, 0.007835362828692714]


In [49]:
# Summary over the CV rounds: standard deviation then mean, first of the
# validation RMSE and then of the validation R^2.
# `rmse` is rebound to the list of per-round RMSEs.
rmse = rmse_xtree_list
for cv_metric in (rmse, R_squared_val_xtree_cv_list):
    for statistic in (np.std, np.mean):
        print(statistic(cv_metric))

0.00026104010919976586
0.007623161693869468
0.0014621553565361898
0.9718594070566136


# Random Forests (RF)

In [50]:
# Random Forest:
# Builds the design matrix (selected descriptors + pairwise products), shuffles
# it with a fixed seed and cuts it into five folds for cross-validation.

y_true = segE_all[:, 1]
pos = feature[:, :3]
# Coordinate columns as (N, 1) arrays so they can be stacked with descriptors.
xpos, ypos, zpos = pos[:, 0:1], pos[:, 1:2], pos[:, 2:3]

feature1 = feature[:, 3:]
# One (N, 1) column array per descriptor column 0..17.
(f0, f1, f2, f3, f4, f5, f6, f7, f8,
 f9, f10, f11, f12, f13, f14, f15, f16, f17) = (
    feature1[:, k:k + 1] for k in range(18)
)

# Retained descriptor columns, then the x and y coordinates.
feature1 = np.hstack((f0, f1, f2, f3, f4, f8, f9, f10, f11, f13, f14, f15, f16, xpos, ypos))

# No global rescaling of `feature1` is applied in this section.
n = feature1.shape[1]

# Interaction terms: block k holds column k multiplied by every column.
n_rows = feature1.shape[0]
feature2 = np.zeros((n_rows, n * n))
for k, column in enumerate(feature1.T):
    feature2[:, k * n:(k + 1) * n] = column[:, np.newaxis] * feature1

# Layout: [column of ones | selected features | interaction terms].
feature_space = np.hstack((np.ones((n_rows, 1)), feature1, feature2))

# Seeded shuffle, applied identically to rows, coordinates and targets.
np.random.seed(10)
idx0 = np.random.permutation(np.arange(len(feature_space)))
feature_space, pos, y_true = feature_space[idx0], pos[idx0], y_true[idx0]

# Fold boundaries at 20/40/60/80 % of the rows; the last fold takes the rest.
n_samples = len(feature_space)
fold_edges = [int(n_samples * fraction) for fraction in (.2, .4, .6, .8)]

X_fold1, X_fold2, X_fold3, X_fold4, X_fold5 = np.split(feature_space, fold_edges)
X_folds = (X_fold1, X_fold2, X_fold3, X_fold4, X_fold5)

# Per-fold column statistics.  They are computed but not applied: the
# per-fold standardisation (X_fold - X_mean) / X_std is currently disabled.
X_std1, X_std2, X_std3, X_std4, X_std5 = (np.std(fold, axis=0) for fold in X_folds)
X_mean1, X_mean2, X_mean3, X_mean4, X_mean5 = (np.mean(fold, axis=0) for fold in X_folds)

y_fold1, y_fold2, y_fold3, y_fold4, y_fold5 = np.split(y_true, fold_edges)
y_folds = (y_fold1, y_fold2, y_fold3, y_fold4, y_fold5)

# Report the fold shapes (a blank line follows each group).
for fold in X_folds[:-1]:
    print(fold.shape)
print(X_fold5.shape,'\n')

for fold in y_folds[:-1]:
    print(fold.shape)
print(y_fold5.shape, '\n')

# Round r trains on four folds (kept in their original order) and tests on
# the remaining one: round 1 holds out fold 5, round 2 fold 4, ..., round 5 fold 1.
held_out_per_round = (4, 3, 2, 1, 0)

X_train1, X_train2, X_train3, X_train4, X_train5 = (
    np.concatenate([fold for k, fold in enumerate(X_folds) if k != held_out], axis=0)
    for held_out in held_out_per_round
)
y_train1, y_train2, y_train3, y_train4, y_train5 = (
    np.concatenate([fold for k, fold in enumerate(y_folds) if k != held_out], axis=0)
    for held_out in held_out_per_round
)
X_test1, X_test2, X_test3, X_test4, X_test5 = (X_folds[held_out] for held_out in held_out_per_round)
y_test1, y_test2, y_test3, y_test4, y_test5 = (y_folds[held_out] for held_out in held_out_per_round)



(2846, 241)
(2846, 241)
(2846, 241)
(2846, 241)
(2847, 241) 

(2846,)
(2846,)
(2846,)
(2846,)
(2847,) 



In [51]:
# Accumulators for the Random-Forest cross-validation metrics; the metric
# cells below append one value per round to each list.
R_squared_train_rf_cv_list, R_squared_val_rf_cv_list, rmse_rf_list = [], [], []

In [52]:
# CV round 1: select the round-1 training / held-out split.
X_trainCV = X_train1
Y_trainCV = y_train1
X_testCV = X_test1
Y_testCV = y_test1

# Random forest of 60 trees; each split considers 48 % of the features.
# `criterion` is left at its default, which is the mean-squared-error
# criterion that the former 'mse' string selected: the literal 'mse' was
# deprecated in scikit-learn 1.0 and is rejected from 1.2 on.
rfbcv1 = RandomForestRegressor(n_estimators= 60,
                          min_samples_split=2, min_samples_leaf=1,
                           min_weight_fraction_leaf=0.0, max_features = 0.48)

modelrfbcv1 = rfbcv1.fit(X_trainCV, Y_trainCV)



In [53]:
# Round-1 Random-Forest metrics: R^2 on the training data and on the held-out
# fold, plus the RMSE on the held-out fold.
R_squared_train = modelrfbcv1.score(X_trainCV, Y_trainCV)
R_squared_val = modelrfbcv1.score(X_testCV, Y_testCV)

y_test_pred_rf = modelrfbcv1.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_rf)
rmse = np.sqrt(mse)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val),
                     ('RMSE:', rmse)):
    print(label, value)

# Record this round's metrics.
for history, value in ((R_squared_train_rf_cv_list, R_squared_train),
                       (R_squared_val_rf_cv_list, R_squared_val),
                       (rmse_rf_list, rmse)):
    history.append(value)

R_squared train:  0.9949537210172115
R_squared val:  0.9676771891684466
RMSE: 0.008580729278890722


In [54]:
# # Saving Model

# modelrfbcv1_pkl_filename = 'modelrfbcv1.pkl'
# modelrfbcv1_pkl = open(modelrfbcv1_pkl_filename, 'wb')
# pickle.dump(modelrfbcv1, modelrfbcv1_pkl)
# modelrfbcv1_pkl.close()

### Round 2 RF

In [55]:
# CV round 2: select the round-2 training / held-out split.
X_trainCV = X_train2
Y_trainCV = y_train2
X_testCV = X_test2
Y_testCV = y_test2

# Random forest of 60 trees; each split considers 48 % of the features.
# `criterion` is left at its default, which is the mean-squared-error
# criterion that the former 'mse' string selected: the literal 'mse' was
# deprecated in scikit-learn 1.0 and is rejected from 1.2 on.
rfbcv2 = RandomForestRegressor(n_estimators= 60,
                          min_samples_split=2, min_samples_leaf=1,
                           min_weight_fraction_leaf=0.0, max_features = 0.48)

modelrfbcv2 = rfbcv2.fit(X_trainCV, Y_trainCV)



In [56]:
# Round-2 Random-Forest metrics: R^2 on the training data and on the held-out
# fold, plus the RMSE on the held-out fold.
R_squared_train = modelrfbcv2.score(X_trainCV, Y_trainCV)
R_squared_val = modelrfbcv2.score(X_testCV, Y_testCV)

y_test_pred_rf = modelrfbcv2.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_rf)
rmse = np.sqrt(mse)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val),
                     ('RMSE:', rmse)):
    print(label, value)

# Record this round's metrics.
for history, value in ((R_squared_train_rf_cv_list, R_squared_train),
                       (R_squared_val_rf_cv_list, R_squared_val),
                       (rmse_rf_list, rmse)):
    history.append(value)

R_squared train:  0.9950086397933595
R_squared val:  0.9652251064668169
RMSE: 0.00831446359906025


In [50]:
# # Saving Model

# modelrfbcv2_pkl_filename = 'modelrfbcv2.pkl'
# modelrfbcv2_pkl = open(modelrfbcv2_pkl_filename, 'wb')
# pickle.dump(modelrfbcv2, modelrfbcv2_pkl)
# modelrfbcv2_pkl.close()

### Round 3 RF

In [57]:
# CV round 3: select the round-3 training / held-out split.
X_trainCV = X_train3
Y_trainCV = y_train3
X_testCV = X_test3
Y_testCV = y_test3

# Random forest of 60 trees; each split considers 48 % of the features.
# `criterion` is left at its default, which is the mean-squared-error
# criterion that the former 'mse' string selected: the literal 'mse' was
# deprecated in scikit-learn 1.0 and is rejected from 1.2 on.
rfbcv3 = RandomForestRegressor(n_estimators= 60,
                          min_samples_split=2, min_samples_leaf=1,
                           min_weight_fraction_leaf=0.0, max_features = 0.48)

modelrfbcv3 = rfbcv3.fit(X_trainCV, Y_trainCV)



In [58]:
# Round-3 Random-Forest metrics: R^2 on the training data and on the held-out
# fold, plus the RMSE on the held-out fold.
R_squared_train = modelrfbcv3.score(X_trainCV, Y_trainCV)
R_squared_val = modelrfbcv3.score(X_testCV, Y_testCV)

y_test_pred_rf = modelrfbcv3.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_rf)
rmse = np.sqrt(mse)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val),
                     ('RMSE:', rmse)):
    print(label, value)

# Record this round's metrics.
for history, value in ((R_squared_train_rf_cv_list, R_squared_train),
                       (R_squared_val_rf_cv_list, R_squared_val),
                       (rmse_rf_list, rmse)):
    history.append(value)

R_squared train:  0.9951752412183953
R_squared val:  0.9632506138162356
RMSE: 0.00858537928794307


In [53]:
# # Saving Model

# modelrfbcv3_pkl_filename = 'modelrfbcv3.pkl'
# modelrfbcv3_pkl = open(modelrfbcv3_pkl_filename, 'wb')
# pickle.dump(modelrfbcv3, modelrfbcv3_pkl)
# modelrfbcv3_pkl.close()

### Round 4 RF

In [59]:
# CV round 4: select the round-4 training / held-out split.
X_trainCV = X_train4
Y_trainCV = y_train4
X_testCV = X_test4
Y_testCV = y_test4

# Random forest of 60 trees; each split considers 48 % of the features.
# `criterion` is left at its default, which is the mean-squared-error
# criterion that the former 'mse' string selected: the literal 'mse' was
# deprecated in scikit-learn 1.0 and is rejected from 1.2 on.
rfbcv4 = RandomForestRegressor(n_estimators= 60,
                          min_samples_split=2, min_samples_leaf=1,
                           min_weight_fraction_leaf=0.0, max_features = 0.48)

modelrfbcv4 = rfbcv4.fit(X_trainCV, Y_trainCV)



In [60]:
# Round-4 Random-Forest metrics: R^2 on the training data and on the held-out
# fold, plus the RMSE on the held-out fold.
R_squared_train = modelrfbcv4.score(X_trainCV, Y_trainCV)
R_squared_val = modelrfbcv4.score(X_testCV, Y_testCV)

y_test_pred_rf = modelrfbcv4.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_rf)
rmse = np.sqrt(mse)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val),
                     ('RMSE:', rmse)):
    print(label, value)

# Record this round's metrics.
for history, value in ((R_squared_train_rf_cv_list, R_squared_train),
                       (R_squared_val_rf_cv_list, R_squared_val),
                       (rmse_rf_list, rmse)):
    history.append(value)

R_squared train:  0.9951070050914994
R_squared val:  0.9665657432661077
RMSE: 0.008063526733344501


In [61]:
# # Saving Model

# modelrfbcv4_pkl_filename = 'modelrfbcv4.pkl'
# modelrfbcv4_pkl = open(modelrfbcv4_pkl_filename, 'wb')
# pickle.dump(modelrfbcv4, modelrfbcv4_pkl)
# modelrfbcv4_pkl.close()

### Round 5 RF

In [62]:
# CV round 5: select the round-5 training / held-out split.
X_trainCV = X_train5
Y_trainCV = y_train5
X_testCV = X_test5
Y_testCV = y_test5

# Random forest of 60 trees; each split considers 48 % of the features.
# `criterion` is left at its default, which is the mean-squared-error
# criterion that the former 'mse' string selected: the literal 'mse' was
# deprecated in scikit-learn 1.0 and is rejected from 1.2 on.
rfbcv5 = RandomForestRegressor(n_estimators= 60,
                          min_samples_split=2, min_samples_leaf=1,
                           min_weight_fraction_leaf=0.0, max_features = 0.48)

modelrfbcv5 = rfbcv5.fit(X_trainCV, Y_trainCV)



In [63]:
# Round-5 Random-Forest metrics: R^2 on the training data and on the held-out
# fold, plus the RMSE on the held-out fold.
R_squared_train = modelrfbcv5.score(X_trainCV, Y_trainCV)
R_squared_val = modelrfbcv5.score(X_testCV, Y_testCV)

y_test_pred_rf = modelrfbcv5.predict(X_testCV)
mse = mean_squared_error(Y_testCV, y_test_pred_rf)
rmse = np.sqrt(mse)

for label, value in (('R_squared train: ', R_squared_train),
                     ('R_squared val: ', R_squared_val),
                     ('RMSE:', rmse)):
    print(label, value)

# Record this round's metrics.
for history, value in ((R_squared_train_rf_cv_list, R_squared_train),
                       (R_squared_val_rf_cv_list, R_squared_val),
                       (rmse_rf_list, rmse)):
    history.append(value)

R_squared train:  0.9950745851108085
R_squared val:  0.9651086450135405
RMSE: 0.008616264224899355


In [64]:
# # Saving Model

# modelrfbcv5_pkl_filename = 'modelrfbcv5.pkl'
# modelrfbcv5_pkl = open(modelrfbcv5_pkl_filename, 'wb')
# pickle.dump(modelrfbcv5, modelrfbcv5_pkl)
# modelrfbcv5_pkl.close()

### RF Results

In [65]:
# Per-round Random-Forest results: training R^2, validation R^2, validation RMSE.
for cv_results in (R_squared_train_rf_cv_list, R_squared_val_rf_cv_list, rmse_rf_list):
    print(cv_results)

[0.9949537210172115, 0.9950086397933595, 0.9951752412183953, 0.9951070050914994, 0.9950745851108085]
[0.9676771891684466, 0.9652251064668169, 0.9632506138162356, 0.9665657432661077, 0.9651086450135405]
[0.008580729278890722, 0.00831446359906025, 0.00858537928794307, 0.008063526733344501, 0.008616264224899355]


In [66]:
# Summary over the CV rounds: standard deviation then mean, first of the
# validation RMSE and then of the validation R^2.
# `rmse` is rebound to the list of per-round RMSEs.
rmse = rmse_rf_list
for cv_metric in (rmse, R_squared_val_rf_cv_list):
    for statistic in (np.std, np.mean):
        print(statistic(cv_metric))

0.00021409635044103534
0.008432072624827579
0.0014928496245245238
0.9655654595462293
