Data preprocessing :
<br>
Two data sources will be imported <br>
-"AIS_weather_h_rename_copy.csv" will be used to replace the information for true current direction and true wind direction <br>
-The resulting dataframe will be merged with the dataframe from the "AIS_weather_H_ok2_copy.csv" <br>
-The data of the journey between Ronne and Sassnitz will be omitted <br>
-The threshold for the ship's manoeuvring speed will be 5 kt (Abebe) <br>

In [None]:
import pandas as pd
from matplotlib import pyplot
import numpy as np
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math
import datetime
import pickle
import joblib

# Typeset all figure text with LaTeX, using a serif font family
plt.rcParams["text.usetex"] = True
plt.rcParams["font.family"] = "serif"

# Read both sources the same way: "Time" is parsed as datetime and only the
# rows whose latitude exceeds 55.04 are kept.
dfmain = pd.read_csv(
    "AIS_weather_H_ok2_copy.csv", parse_dates=["Time"]
).loc[lambda frame: frame['LAT'] > 55.04]

dfpre = pd.read_csv(
    "AIS_weather_h_rename_copy.csv", parse_dates=["Time"]
).loc[lambda frame: frame['LAT'] > 55.04]



In [None]:
# Summarise the raw frame (columns, dtypes, non-null counts) before any column is dropped
dfpre.info()

In [None]:
# Discard the unused columns (selected by position), then give the remaining
# wind/current components and ship-motion columns short lower-case names.
dfpre = dfpre.drop(
    columns=dfpre.columns[[0,1,2,5,6,7,8,9,10,11,12,13,14,15,16,17,18,21,22,23,26]]
)
dfpre = dfpre.rename(columns={
    'Eastward wind': 'eastwind',
    'Northward wind': 'northwind',
    'Eastward current': 'eastcurrent',
    'Northward current': 'northcurrent',
    'SOG': 'sog',
    'COG': 'cog',
    'Heading': 'heading',
})
dfpre.head(5)

In [None]:
# Imputing the information for the missing east and north current data

from sklearn.impute import KNNImputer

# Build and fit the 50-neighbour imputer in one step (fit returns the
# estimator); the trailing expression keeps it as the cell output.
imputer = KNNImputer(n_neighbors=50).fit(dfpre)
imputer

In [None]:
# Apply the fitted imputer to dfpre; the result is kept separately and wrapped back into a DataFrame in the next cell
x_pre = imputer.transform(dfpre)

In [None]:
# Rebuild a DataFrame from the imputed values, reusing dfpre's column labels and row index
dfpre = pd.DataFrame(x_pre, columns=dfpre.columns, index=dfpre.index)

In [None]:
# Speed magnitudes: Euclidean norm of the east and north components
dfpre = dfpre.assign(**{
    "Current Speed": np.sqrt(dfpre["eastcurrent"]**2 + dfpre["northcurrent"]**2),
    "Wind Speed": np.sqrt(dfpre["eastwind"]**2 + dfpre["northwind"]**2),
})

In [None]:
# Calculate the True North wind direction
#
# The result is the bearing the wind blows FROM, in degrees clockwise from
# true north, in [0, 360).  arctan2(east, north) is the bearing the air is
# moving towards; adding 180 degrees turns it into the bearing of origin.
#
# This replaces a quadrant table built on |arctan(east/north)|.  Its branch
# for east > 0 and north > 0 returned 270 - theta, but theta is measured from
# the north axis, so the correct value is 180 + theta (the two agree only at
# theta = 45 degrees).  arctan2 also handles north == 0 without a division.
wind_east, wind_north = dfpre["eastwind"], dfpre["northwind"]
wind_bearing = (np.rad2deg(np.arctan2(wind_east, wind_north)) + 180.0) % 360.0

# With both components exactly zero there is no direction; keep it as NaN,
# which is what the former 0/0 division produced.
dfpre["True North Wind Direction"] = wind_bearing.where((wind_east != 0) | (wind_north != 0))

In [None]:
# Calculate the True North current direction
#
# The result is the bearing the current flows TOWARDS, in degrees clockwise
# from true north, in [0, 360): arctan2(east, north) wrapped to be
# non-negative.
#
# This replaces a quadrant table built on |arctan(east/north)|.  Its branch
# for east < 0 and north < 0 returned 270 - theta, but theta is measured from
# the north axis, so the correct value is 180 + theta (the two agree only at
# theta = 45 degrees).  arctan2 also handles north == 0 without a division.
current_east, current_north = dfpre["eastcurrent"], dfpre["northcurrent"]
current_bearing = np.rad2deg(np.arctan2(current_east, current_north)) % 360.0

# With both components exactly zero there is no direction; keep it as NaN,
# which is what the former 0/0 division produced.
dfpre["True North Current Direction"] = current_bearing.where(
    (current_east != 0) | (current_north != 0)
)

In [None]:
# Summary statistics, now including the derived speed and direction columns
dfpre.describe()

In [None]:
# The vector components and the duplicated motion columns are no longer needed.
# NOTE(review): 'heading' is renamed earlier but not dropped here, so if it is
# present it is carried into the later concat with dfmain - confirm intended.
dfpre = dfpre.drop(
    columns=['eastwind', 'northwind', 'eastcurrent', 'northcurrent', 'sog', 'cog']
)

In [None]:
# The wind and current speeds shipped with the main dataset contain outliers,
# so they are removed here.
dfmain = dfmain.drop(columns=['Wind Speed', 'Current Speed'])

In [None]:
# Place the two frames side by side; rows are matched on their index labels
df = pd.concat((dfmain, dfpre), axis="columns")
df.head(5)

In [None]:
# Keep only rows sailing faster than the threshold of 5 (Abebe)
df = df.loc[df['SOG'] > 5]

In [None]:
# Set June 2021 aside (dfdate6) so the model's forecasting performance can be
# checked on it; every other month stays in df.
is_june_2021 = df['Time'].dt.strftime('%Y-%m') == '2021-06'
dfdate6 = df[is_june_2021]
df = df[~is_june_2021]

In [None]:
# Temporary export
# df.to_csv("AIS_sog_threshold.csv")

In [None]:
# Temporary export
# df.to_csv("AIS_impute_check.csv")

Data Analysis before modelling 

In [None]:
# Remove identifier, position, time and ship-dimension columns that are not
# used in the analysis.
df_ship = df.drop(columns=[
    'Unnamed: 0', 'Time', 'LON', 'LAT',
    'Air density above oceans', 'Surface pressure',
    'Width', 'Length',
])
df_ship.head(5)

In [None]:
# Short lower-case aliases for every column used from here on
df_ship2 = df_ship.rename(columns={
    'Max wave height': 'waveheight',
    'Draught': 'draught',
    'SOG': 'sog',
    'Wind Speed': 'windspeed',
    'True Wind Direction': 'truewinddir',
    'Temperature above oceans': 'oceantemperature',
    'COG': 'cog',
    'Current Speed': 'curspeed',
    'True Wave Direction': 'truewavedir',
    'Swell period': 'swellperiod',
    'Wind wave period': 'windwaveperiod',
    'Sea surface temperature': 'surftemp',
    'Combined wind waves and swell height': 'windwaveswellheight',
    'Swell height': 'swellheight',
    'Wind wave height': 'windwaveheight',
    'Heading': 'heading',
    'True Current Direction': 'truecurrentdir',
    'True Swell Direction': 'trueswelldir',
    'True Wind Wave Direction': 'truewindwavedir',
    'Wave period': 'waveperiod',
    'True North Wind Direction': 'truenorthwinddir',
    'True North Current Direction': 'truenorthcurrentdir',
})

In [None]:
# Fix the column order; a later cell drops columns by position, so this order matters
df_ship2 = df_ship2.loc[:, [
    'oceantemperature', 'waveheight', 'swellperiod', 'windwaveperiod', 'waveperiod',
    'surftemp', 'windwaveswellheight', 'swellheight', 'windwaveheight', 'draught',
    'sog', 'cog', 'heading', 'windspeed', 'curspeed',
    'truewinddir', 'truecurrentdir', 'trueswelldir', 'truewindwavedir', 'truewavedir',
    'truenorthwinddir', 'truenorthcurrentdir',
]]

In [None]:
# # Scaling script if necessary

# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# df_ship2 = pd.DataFrame(scaler.fit_transform(df_ship2),columns=df_ship2.columns)


In [None]:
import matplotlib.pyplot as plt
# Restore matplotlib's default rcParams, replacing the LaTeX/serif settings applied at the top of the notebook
plt.rcParams.update(plt.rcParamsDefault) # Alter default value 
# One 50-bin histogram per column of df_ship2
df_ship2.hist(bins=50,figsize=(20,25))

plt.show()

Feature Selection

In [None]:
# Pairwise correlation matrix of all columns; only the correlations against 'sog' are printed
corr_matrix = df_ship2.corr()
print(corr_matrix["sog"])

In [None]:
# Reduced view for inspection: columns are removed by their position in df_ship2
dfship_corr = df_ship2.drop(
    columns=df_ship2.columns[[0,5,15,16,17,18,19,20,21]]
)
dfship_corr.head(5)

In [None]:
def correlation_heatmap(df,fontsize):
    """Draw an annotated heatmap of the pairwise correlations of ``df``.

    Parameters
    ----------
    df : DataFrame whose ``.corr()`` matrix is plotted.
    fontsize : value passed to seaborn as ``font_scale``.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.set(font_scale=fontsize)
    sns.heatmap(
        df.corr(),
        ax=ax,
        vmax=1.0,
        center=0,
        square=True,
        annot=True,
        fmt='.2f',
        linewidths=.5,
        cbar_kws={"shrink": .70},
    )
    plt.show()

correlation_heatmap(df_ship2, 0.5)

Scientific Justification of HCF:

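-According to Bitner Gregersen, the combined wind wave and swell height is equal to the square root of the sum of the squares of the swell height and the wind wave height, $H_{s} = \sqrt{H_{swell}^2 + H_{windwave}^2}$. Therefore the swell height (SWH) and the wind wave height (WWH) are to be dropped.<br>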
-According to Mori, through wave spectrum, the maximum wave height can be approximated, therefore max wave height can be dropped as well <br>
-According to Torsethaugen, the type of dominating wave (wind wave or swell) can be predicted through an equation which includes the significant wave height and a certain threshold. Therefore, it is decided that the wind wave period, the swell period and consequently the true wind wave direction and true swell direction are to be dropped.<br>
-Drop the True Wind Direction and the True Current Direction and replace them with the True North wind direction and the True North current direction

In [None]:
# Remove the features ruled out in the justification above, then redraw the
# correlation heatmap for the remaining ones.
df_ship2 = df_ship2.drop(columns=[
    'waveheight', 'swellheight', 'windwaveheight',
    'windwaveperiod', 'swellperiod',
    'truewindwavedir', 'trueswelldir',
    'truecurrentdir', 'truewinddir',
])
correlation_heatmap(df_ship2, .6)

In [None]:
import matplotlib.pyplot as plt
# Restore matplotlib's default rcParams before plotting
plt.rcParams.update(plt.rcParamsDefault)
# One 50-bin histogram per remaining column
df_ship2.hist(bins=50,figsize=(15,10))
plt.show()

In [None]:
# Summary statistics of the selected features (still un-imputed at this point)
stats = df_ship2.describe()
print(stats)

Imputing using KNN so that the Random Forest Regressor may function <br>
RFR cannot accept nan values <br>
Imputed data is then transformed

In [None]:
import numpy as np
from sklearn.impute import KNNImputer

# Build and fit the 50-neighbour imputer on the selected features in one step
# (fit returns the estimator); the trailing expression keeps it as cell output.
imputer = KNNImputer(n_neighbors=50).fit(df_ship2)
imputer


In [None]:
# Apply the fitted imputer to df_ship2; the result is wrapped back into a DataFrame in the next cell
X = imputer.transform(df_ship2)

In [None]:
# Imputed values as a DataFrame, carrying over df_ship2's column labels and row index
df_ship2tr = pd.DataFrame(X, columns=df_ship2.columns, index=df_ship2.index)

In [None]:
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
df_ship2tr.info()

Data Split for training and test data

In [None]:
# Implement the Random forest, but first, data is to be split into training and validation data

from sklearn.model_selection import train_test_split
import time 

# 80/20 split with a fixed seed: 'sog' is the target, every other column is a
# feature.  Extra features can be removed in the drop() below to (possibly)
# improve the model's performance.
x_train, x_test, y_train, y_test = train_test_split(
    df_ship2tr.drop(columns=['sog']),
    df_ship2tr['sog'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Number of rows in the training features
print("Amount of the x training dataset is:",len(x_train))

In [None]:
# Number of rows in the held-out test features
print("Amount of the test dataset is",len(x_test))

Modelling using Random Forest Regressor 

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Baseline random forest: default hyperparameters, fixed seed
model_rfr_ftr = RandomForestRegressor(random_state=42)
# Wall-clock time of the fit only
start_rf = time.time()
model_rfr_ftr.fit(x_train,y_train)
end_rf = time.time()
print(f"Training time: {end_rf-start_rf:0.4}s ")

In [None]:
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): same configuration, seed and training data as model_rfr_ftr in
# the previous cell, so this trains an equivalent second forest - confirm both
# are needed.
rf = RandomForestRegressor(random_state=42)
start_rf = time.time()
# the value returned by fit() is kept under the name model_rf
model_rf = rf.fit(x_train,y_train)
end_rf = time.time()
print(f"Training time: {end_rf-start_rf:0.4}s ")

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# model_rfr_ftr_hpo = RandomForestRegressor(n_estimators=300,
#                                 #   max_features=9,
#                                   random_state=42, 
#                                   )
# model_rfr_ftr_hpo.fit(x_train,y_train)

HPO RFR for validation data

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# model_rfr_ftr_hpov = RandomForestRegressor(n_estimators = 400,
#                                             min_samples_split = 3,
#                                             min_samples_leaf = 1,
#                                             # max_features = 9,
#                                             max_depth=170,
#                                             # bootstrap=False,
#                                             random_state=42)

# start_rfo = time.time()
# model_rfr_ftr_hpov.fit(x_train,y_train)
# end_rfo = time.time()
# print(f"Training time: {end_rfo-start_rfo:0.4}s ")

from sklearn.ensemble import RandomForestRegressor
# Random forest with explicitly chosen hyperparameters (presumably the outcome
# of the hyperparameter optimisation named in the section title - confirm);
# max_features and bootstrap are left at their defaults.
model_rfr_ftr_hpov = RandomForestRegressor(n_estimators = 900,
                                            min_samples_split = 2,
                                            min_samples_leaf = 2,
                                            # max_features = 9,
                                            max_depth=200,
                                            # bootstrap=False,
                                            random_state=42)

# Wall-clock time of the fit only
start_rfo = time.time()
model_rfr_ftr_hpov.fit(x_train,y_train)
end_rfo = time.time()
print(f"Training time: {end_rfo-start_rfo:0.4}s ")

Modelling using ETR

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Baseline extra-trees ensemble: default hyperparameters, fixed seed
model_etr = ExtraTreesRegressor(random_state=42)
# Wall-clock time of the fit only
start_et = time.time()
model_etr.fit(x_train,y_train)
end_et = time.time()
print(f"Training time: {end_et-start_et:0.4}s ")

HPO ETR for validation data

In [None]:
# from sklearn.ensemble import ExtraTreesRegressor

# model_etr_hpov = ExtraTreesRegressor(random_state=42,
#                                 n_estimators=300,
#                                 max_depth=120,
#                                 min_samples_split=4,
#                                 )
# start_eto = time.time()
# model_etr_hpov.fit(x_train,y_train)
# end_eto = time.time()
# print(f"Training time: {end_eto-start_eto:0.4}s")

from sklearn.ensemble import ExtraTreesRegressor

# Extra-trees ensemble with explicitly chosen size and depth (presumably the
# outcome of the hyperparameter optimisation named in the section title -
# confirm)
model_etr_hpov = ExtraTreesRegressor(random_state=42,
                                n_estimators=600,
                                max_depth=10,
                                )
# Wall-clock time of the fit only
start_eto = time.time()
model_etr_hpov.fit(x_train,y_train)
end_eto = time.time()
print(f"Training time: {end_eto-start_eto:0.4}s")

Modelling using Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Baseline decision tree with default hyperparameters.  The tree is seeded
# with the same random_state=42 as the forest models so that re-running the
# notebook rebuilds the same tree; without a seed, the result could differ
# between runs.
model_dtr = DecisionTreeRegressor(random_state=42)
# Wall-clock time of the fit only
start_dtr = time.time()
model_dtr.fit(x_train,y_train)
end_dtr = time.time()
print(f"Training time: {end_dtr-start_dtr:0.4}s")

HPO DTR for validation data

In [None]:
# from sklearn.tree import DecisionTreeRegressor
# model_dtr_hpov = DecisionTreeRegressor(min_samples_split=3,
#                                   min_samples_leaf=10,
#                                   max_features=12,
#                                   max_depth=80)
# start_dtro = time.time()
# model_dtr_hpov.fit(x_train,y_train)
# end_dtro = time.time()
# print(f"Training time: {end_dtro-start_dtro:0.4}s")

from sklearn.tree import DecisionTreeRegressor
# Decision tree with explicitly chosen hyperparameters (presumably the outcome
# of the hyperparameter optimisation named in the section title - confirm).
# It is seeded with the same random_state=42 as the forest models so that
# re-running the notebook rebuilds the same tree.
model_dtr_hpov = DecisionTreeRegressor(min_samples_split=2,
                                  min_samples_leaf=3,
                                  max_features=12,
                                  max_depth=10,
                                  random_state=42)
# Wall-clock time of the fit only
start_dtro = time.time()
model_dtr_hpov.fit(x_train,y_train)
end_dtro = time.time()
print(f"Training time: {end_dtro-start_dtro:0.4}s")

Modelling using Multiple Linear Regression

In [None]:
from sklearn import linear_model
# Ordinary least-squares linear regression on all features
model_mlr = linear_model.LinearRegression()
# Wall-clock time of the fit only
start_mlr = time.time()
model_mlr.fit(x_train,y_train)
end_mlr = time.time()
print(f"Training time: {end_mlr-start_mlr:0.4}s")

Model Evaluation

In [None]:
# Plot tree using graphviz

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz

# Export the tree at index 5 of the baseline forest as DOT text, truncated to
# depth 3 and labelled with the training feature names.
dot_data = tree.export_graphviz(
    model_rfr_ftr.estimators_[5],
    out_file=None,
    max_depth=3,
    feature_names=x_train.columns.values.tolist(),
    filled=True,
    rounded=True,
    special_characters=True,
)

# Render the DOT source to a PNG; the base file name is "rf_tree_it5"
graph = graphviz.Source(dot_data, format="png")
graph.render("rf_tree_it5")

In [None]:
def model_importance(model_type):
    """Print the feature importances of a fitted model as a one-column table.

    ``model_type`` must expose ``feature_importances_``; the rows are labelled
    with the column names of the notebook-level ``x_train``.
    """
    importance_table = pd.DataFrame(
        model_type.feature_importances_,
        index=x_train.columns,
        columns=["Importance"],
    )
    print(importance_table)

model_importance(model_rfr_ftr)

Evaluation of RFR

In [None]:
def evaluate(model, features_x, labels_y):
    """Cross-validate ``model`` (10 folds) and report five regression metrics.

    The cross-validation fits fresh copies of ``model`` on folds of the data
    passed in; the fitted state of ``model`` itself is not what is scored.
    All five metrics are computed in a single ``cross_validate`` run, so they
    come from the same fitted models.

    Parameters
    ----------
    model : scikit-learn regressor.
    features_x : feature matrix to cross-validate on.
    labels_y : target values matching ``features_x``.

    Returns
    -------
    tuple of five arrays with one score per fold, in the order
    ``(r2, explained_variance, MAE, RMSE, MAD)``.  This is the order in which
    the metrics are printed and in which the notebook unpacks the result
    (``r2_x, expVar_x, MAE_x, RMSE_x, MAD_x``).  Previously RMSE and MAD were
    returned the other way round, so ``RMSE_x`` actually held the median
    absolute errors.  The three error metrics are returned as scikit-learn
    produces them, i.e. negated.
    """
    from sklearn.model_selection import cross_validate

    cv_results = cross_validate(
        model, features_x, labels_y, cv=10,
        scoring=('r2', 'explained_variance', 'neg_mean_absolute_error',
                 'neg_median_absolute_error', 'neg_root_mean_squared_error'),
    )
    score_r2 = cv_results['test_r2']
    score_expVar = cv_results['test_explained_variance']
    score_MAE = cv_results['test_neg_mean_absolute_error']
    score_MAD = cv_results['test_neg_median_absolute_error']
    score_RMSE = cv_results['test_neg_root_mean_squared_error']

    # Fold averages; the error scorers are negated, so flip their sign
    rsquared = score_r2.mean()
    expVar = score_expVar.mean()
    MAE = -score_MAE.mean()
    MAD = -score_MAD.mean()
    RMSE = -score_RMSE.mean()

    print(f"Model Performance of {model}")
    print(f"R^2: {rsquared:0.4f}")
    print(f"explained Variance = {expVar:0.4f}")
    print(f"MAE = {MAE:0.4f}")
    print(f"RMSE = {RMSE:0.4f}")
    print(f"MAD = {MAD:0.4f}\n")

    return score_r2,score_expVar,score_MAE,score_RMSE,score_MAD

In [None]:
# Cross-validated metrics for the baseline random forest; the per-fold arrays feed the box plots further down
r2_rf,expVar_rf,MAE_rf,RMSE_rf,MAD_rf = evaluate(model_rfr_ftr,x_test,y_test)

Evaluation of ETR

In [None]:
# Cross-validated metrics for the baseline extra-trees model; the per-fold arrays feed the box plots further down
r2_et,expVar_et,MAE_et,RMSE_et,MAD_et = evaluate(model_etr,x_test,y_test)

Evaluation of DTR

In [None]:
# Cross-validated metrics for the baseline decision tree; the per-fold arrays feed the box plots further down
r2_dt,expVar_dt,MAE_dt,RMSE_dt,MAD_dt = evaluate(model_dtr,x_test,y_test)

Evaluation of MLR 

In [None]:
# Cross-validated metrics for the linear regression; the per-fold arrays feed the box plots further down
r2_mlr,expVar_mlr,MAE_mlr,RMSE_mlr,MAD_mlr = evaluate(model_mlr,x_test,y_test)

In [None]:
# One fitted linear-regression coefficient per feature
cdf = pd.DataFrame(
    data=model_mlr.coef_,
    index=x_test.columns,
    columns=['Coefficients'],
)
print(cdf)

Prediction vs Actual

In [None]:
# Switch figure text back to LaTeX rendering with a serif font
plt.rcParams["text.usetex"] = True
plt.rcParams["font.family"] = "serif"

In [None]:
# fit a y = mx + c for the predicted vs actual  

from numpy import arange
from scipy.optimize import curve_fit

# plt.figure(figsize=(5,5))

# straight-line model that is fitted to the predicted-vs-actual scatter
def objective(x, a, b ):
    """Evaluate the line ``a * x + b`` (slope ``a``, intercept ``b``)."""
    slope_term = a * x
    return slope_term + b

def label_predict(model,test_features):
    """Return ``model.predict(test_features)`` unchanged."""
    return model.predict(test_features)

def pred_plot(model,test_feature,test_label):
    """Scatter predicted against actual SOG and overlay a fitted straight line.

    Prints the R-squared of the predictions and the equation of the
    least-squares line ``y = a * x + b`` fitted through the scatter, then shows
    the figure.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    test_feature : features passed to ``model.predict``.
    test_label : actual SOG values for ``test_feature``.
    """
    from sklearn.metrics import r2_score

    plt.rcParams.update({
        "text.usetex": True,
        "font.family": "serif",
    })
    plt.figure(figsize=(5,5))
    predict = label_predict(model,test_feature)

    print(r2_score(test_label, predict))

    # x axis: actual values, y axis: model predictions
    x, y = test_label, predict

    # least-squares fit of the straight line through the scatter
    popt, _ = curve_fit(objective, x, y)
    a, b = popt
    print('y = %.5f * x + %.5f' % (a, b ))

    plt.scatter(x, y,edgecolors='black')

    # evaluate the fitted line between the smallest and largest actual value
    x_line = arange(min(x), max(x), 1)
    y_line = objective(x_line, a, b)
    plt.plot(x_line, y_line, color='red')

    # The labels follow the data on each axis (they used to be swapped).
    # The unit is knots: the notebook applies its 5 kt threshold directly to
    # SOG, so the former m/s label did not match the data.
    plt.xlabel(r'Actual SOG [kn]', fontsize=10)
    plt.ylabel(r'Predicted SOG [kn]', fontsize=10)
    plt.grid(linestyle = '--', linewidth = 0.5)
    plt.xlim(0,25)
    plt.ylim(0,25)
    plt.show()

pred_plot(model_dtr,x_test,y_test)

In [None]:
# Same predicted-vs-actual plot for the baseline random forest
pred_plot(model_rfr_ftr,x_test,y_test)

In [None]:
# Same predicted-vs-actual plot for the baseline extra-trees model
pred_plot(model_etr,x_test,y_test)

In [None]:
# Same predicted-vs-actual plot for the linear regression
pred_plot(model_mlr,x_test,y_test)

Understanding the effects of different hyperparameter optimisation on model's performance

Function to plot effect of number of features on RMSE

In [None]:
# explore the effect of max_features on the cross-validated RMSE
def feature_curve(x,y,regressor):
    """Plot the mean cross-validated RMSE for ``max_features`` = 1 ... 12.

    For every setting a fresh model is scored with shuffled 10-fold
    cross-validation (fixed split seed); the mean and standard deviation of
    the fold RMSEs are printed and the means are plotted with the minimum
    marked.

    Parameters
    ----------
    x, y : features and target passed to ``cross_val_score``.
    regressor : {'dt', 'rf', 'et'}
        Decision tree, random forest or extra trees.

    Raises
    ------
    ValueError
        If ``regressor`` is not one of the three codes above.  Previously an
        unknown code produced no models and failed later inside NumPy.
    """
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import KFold

    if regressor not in ('dt', 'rf', 'et'):
        raise ValueError(f"regressor must be 'dt', 'rf' or 'et', got {regressor!r}")

    plt.rcParams.update({
        "text.usetex": True,
        "font.family": "serif",
    })

    def build_model(n_features):
        # one unfitted estimator of the requested family
        if regressor == 'dt':
            return DecisionTreeRegressor(max_features=n_features)
        if regressor == 'rf':
            return RandomForestRegressor(max_features=n_features)
        return ExtraTreesRegressor(max_features=n_features)

    # the seeded splitter yields the same folds for every setting
    cv = KFold(n_splits=10, shuffle=True, random_state=1)

    results_ftr, names_ftr = [], []
    for n in range(1, 13):
        # sklearn reports the negated RMSE; flip the sign back
        scores_ftr = -cross_val_score(build_model(n), x, y,
                                      scoring='neg_root_mean_squared_error',
                                      cv=cv, n_jobs=-1)
        results_ftr.append(scores_ftr)
        names_ftr.append(str(n))
        # summarize the performance along the way
        print('>%s %.3f (%.3f)' % (names_ftr[-1], np.mean(scores_ftr), np.std(scores_ftr)))

    mean_ftr = np.mean(results_ftr,axis=1)
    min_error_ftr = np.min(mean_ftr)
    # position of the best setting on the categorical x axis (0-based)
    best_position = np.argmin(mean_ftr)

    print(f"The minimum RMSE obtained is {min_error_ftr:.3f}")

    plt.plot(names_ftr,mean_ftr,"b.-")
    # dashed guides and a dot marking the minimum
    plt.plot([best_position, best_position], [0, min_error_ftr], "k--",linewidth=1)
    plt.plot([-1, 12], [min_error_ftr, min_error_ftr], "k--",linewidth=1)
    plt.plot(best_position, min_error_ftr, "ko",linewidth = 1)
    plt.text(best_position, min_error_ftr*1.1, "Minimum", ha="center", fontsize=12)
    plt.grid(linestyle = '--', linewidth = 0.5)
    plt.xlim(0,12)
    plt.ylim(0,3)
    plt.xlabel("Number of features")
    plt.ylabel("RMSE")
    plt.title("Validation error", fontsize=13)
    plt.text(4,2.5, r'def_param : \tt{max_features = n_features}', bbox={'facecolor' : 'white','alpha':0.5})
    plt.show()

In [None]:
# NOTE(review): the curve is cross-validated on the held-out test split (x_test, y_test) - confirm this is intended
feature_curve(x_test,y_test,regressor='dt')

In [None]:
# NOTE(review): the curve is cross-validated on the held-out test split (x_test, y_test) - confirm this is intended
feature_curve(x_test,y_test,regressor='rf')

In [None]:
# NOTE(review): the curve is cross-validated on the held-out test split (x_test, y_test) - confirm this is intended
feature_curve(x_test,y_test,regressor='et')

Explore effect of number of trees on RMSE

This is not relevant for the decision tree, as we are looking into the number of trees in a forest; a decision tree consists of only a single tree.

In [None]:
# explore random forest and extra tree number of trees effect on performance
def trees_curve(x,y,regressor):
    """Plot the mean cross-validated RMSE against the number of trees.

    For each ensemble size a fresh model is scored with 10-fold
    cross-validation; the mean and standard deviation of the fold RMSEs are
    printed and the means are plotted with the minimum marked.

    Parameters
    ----------
    x, y : features and target passed to ``cross_val_score``.
    regressor : {'rf', 'et'}
        Random forest or extra trees.

    Raises
    ------
    ValueError
        If ``regressor`` is not one of the two codes above.  Previously an
        unknown code produced no models and failed later inside NumPy.
    """
    from sklearn.model_selection import cross_val_score

    if regressor not in ('rf', 'et'):
        raise ValueError(f"regressor must be 'rf' or 'et', got {regressor!r}")

    # ensemble sizes to compare
    n_trees = [1,10,100,200,300,400,500,600,700,800,900,1000]

    def build_model(n_estimators):
        # one unfitted ensemble of the requested family
        if regressor == 'rf':
            return RandomForestRegressor(n_estimators = n_estimators)
        return ExtraTreesRegressor(n_estimators = n_estimators)

    def evaluate_model(model_tree, x, y):
        # Score the estimator that is passed in.  The earlier version ignored
        # this argument and read the enclosing loop variable instead.
        scores = cross_val_score(model_tree, x, y, scoring='neg_root_mean_squared_error', cv=10, n_jobs=-1)
        # sklearn reports the negated RMSE; flip the sign back
        return -scores

    results_tree, names_tree = [], []
    for n in n_trees:
        scores_tree = evaluate_model(build_model(n), x, y)
        results_tree.append(scores_tree)
        names_tree.append(str(n))
        # summarize the performance along the way
        print('>%s %.3f (%.3f)' % (names_tree[-1], np.mean(scores_tree), np.std(scores_tree)))

    mean_tree = np.mean(results_tree,axis=1)
    min_error_tree = np.min(mean_tree)
    print(min_error_tree)
    # position of the best setting on the categorical x axis (0-based)
    best_position = np.argmin(mean_tree)
    print(f"The minimum RMSE obtained is {min_error_tree:.3f}")

    plt.plot(names_tree,mean_tree,"b.-")
    # dashed guides and a dot marking the minimum
    plt.plot([best_position, best_position], [0, min_error_tree], "k--",linewidth=1)
    plt.plot([-1, 12-1], [min_error_tree, min_error_tree], "k--",linewidth=1)
    plt.plot(best_position, min_error_tree, "ko",linewidth = 1)
    plt.text(best_position, min_error_tree*1.2, "Minimum", ha="center", fontsize=12)
    plt.grid(linestyle = '--', linewidth = 0.5)
    plt.xlim(0,12-1)
    plt.ylim(0,3)
    plt.xlabel("Number of Trees")
    plt.ylabel("RMSE")
    plt.title("Validation error", fontsize=13)
    plt.text(4,2.5, r'def_param : \tt{n_estimators = 100}', bbox={'facecolor' : 'white','alpha':0.5})
    plt.show()

In [None]:
# NOTE(review): the curve is cross-validated on the held-out test split (x_test, y_test) - confirm this is intended
trees_curve(x_test,y_test,regressor='rf')

In [None]:
# NOTE(review): the curve is cross-validated on the held-out test split (x_test, y_test) - confirm this is intended
trees_curve(x_test,y_test,regressor='et')

Explore effect of tree depth on RMSE

In [None]:
# explore the effect of tree depth on performance for the tree-based models
def depth_curve(x,y,regressor):
    """Plot the mean cross-validated RMSE against ``max_depth``.

    Depths 1 to 10, 100 and ``None`` (no limit) are compared.  For every depth
    a fresh model is scored with shuffled 10-fold cross-validation (fixed
    split seed); the mean and standard deviation of the fold RMSEs are printed
    and the means are plotted with the minimum marked.

    Parameters
    ----------
    x, y : features and target passed to ``cross_val_score``.
    regressor : {'dt', 'rf', 'et'}
        Decision tree, random forest or extra trees.

    Raises
    ------
    ValueError
        If ``regressor`` is not one of the three codes above.  Previously an
        unknown code produced no models and failed later inside NumPy.
    """
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import KFold

    if regressor not in ('dt', 'rf', 'et'):
        raise ValueError(f"regressor must be 'dt', 'rf' or 'et', got {regressor!r}")

    # depths to compare; None leaves the depth unlimited
    depths = [1,2,3,4,5,6,7,8,9,10,100] + [None]

    def build_model(depth):
        # one unfitted estimator of the requested family
        if regressor == 'dt':
            return DecisionTreeRegressor(max_depth=depth)
        if regressor == 'rf':
            return RandomForestRegressor(max_depth=depth)
        return ExtraTreesRegressor(max_depth=depth)

    # the seeded splitter yields the same folds for every depth
    cv = KFold(n_splits=10, shuffle=True, random_state=1)

    results_dp, names_dp = [], []
    for n in depths:
        # sklearn reports the negated RMSE; flip the sign back
        scores_dp = -cross_val_score(build_model(n), x, y,
                                     scoring='neg_root_mean_squared_error',
                                     cv=cv, n_jobs=-1)
        results_dp.append(scores_dp)
        names_dp.append(str(n))
        # summarize the performance along the way
        print('>%s %.3f (%.3f)' % (names_dp[-1], np.mean(scores_dp), np.std(scores_dp)))

    mean_dp = np.mean(results_dp,axis=1)
    min_error_dp = np.min(mean_dp)
    print(min_error_dp)
    # position of the best setting on the categorical x axis (0-based)
    best_position = np.argmin(mean_dp)
    print(f"The minimum RMSE obtained is {min_error_dp:.3f}")

    plt.plot(names_dp,mean_dp,"b.-")
    # dashed guides and a dot marking the minimum
    plt.plot([best_position, best_position], [0, min_error_dp], "k--",linewidth=1)
    plt.plot([-1, 12-1], [min_error_dp, min_error_dp], "k--",linewidth=1)
    plt.plot(best_position, min_error_dp, "ko",linewidth = 1)
    plt.text(best_position, min_error_dp*1.2, "Minimum", ha="center", fontsize=12)
    plt.grid(linestyle = '--', linewidth = 0.5)
    plt.xlim(0,12-1)
    plt.ylim(0,3)
    plt.xlabel("Tree Depth")
    plt.ylabel("RMSE")
    plt.title("Validation error", fontsize=13)
    plt.text(4,2.5, r'def_param : \tt{max_depth = None}', bbox={'facecolor' : 'white','alpha':0.5})
    plt.show()

In [None]:
# NOTE(review): the curve is cross-validated on the held-out test split (x_test, y_test) - confirm this is intended
depth_curve(x_test,y_test,regressor='dt')

In [None]:
# NOTE(review): the curve is cross-validated on the held-out test split (x_test, y_test) - confirm this is intended
depth_curve(x_test,y_test,regressor='rf')

In [None]:
# NOTE(review): the curve is cross-validated on the held-out test split (x_test, y_test) - confirm this is intended
depth_curve(x_test,y_test,regressor='et')

Learning Curves

In [None]:
# Functions for RMSE and Rsquared plots
from sklearn.model_selection import LearningCurveDisplay, ShuffleSplit ,learning_curve

def learn_plotrmse(model,x_data,y_data,model_name):
    """Plot the RMSE learning curve of ``model``.

    ``learning_curve`` is run with 5-fold cross-validation on 20 training-set
    sizes between 10 % and 100 % of the data.  Mean training and validation
    RMSE are drawn with a band of one standard deviation either side.

    Parameters
    ----------
    model : estimator to evaluate.
    x_data, y_data : features and target passed to ``learning_curve``.
    model_name : text inserted into the figure title.
    """
    train_sizes, train_scores, test_scores = learning_curve(estimator = model, X=x_data, y=y_data,
                                                        cv=5, train_sizes=np.linspace(0.1, 1.0, 20),
                                                        n_jobs=-1,scoring="neg_root_mean_squared_error")
    plt.rcParams.update({
        "text.usetex": True,
        "font.family": "serif",
    })

    # The scorer returns the negated RMSE, so only the means change sign.
    # A standard deviation is non-negative and is not negated; the former
    # negated spread drew the same symmetric band but was wrong as a value.
    train_mean = -np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = -np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot the learning curve
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label=r'Training Error')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean, color='green', marker='d', markersize=5, linestyle='--', label=r'Validation Error')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
    plt.title(fr'Learning Curve of {model_name}')
    plt.xlabel(r'Training Data Size')
    plt.ylabel(r'RMSE')
    plt.ylim(0,2)
    plt.grid(linestyle = '--', linewidth = 0.5)
    plt.legend(loc='lower right')
    plt.show()
def learn_plotr2(model,x_data,y_data,model_name):
    """Plot the R-squared learning curve of ``model``.

    ``learning_curve`` is run with 5-fold cross-validation on 20 training-set
    sizes between 10 % and 100 % of the data.  Mean training and validation
    scores are drawn with a band of one standard deviation either side.

    Parameters
    ----------
    model : estimator to evaluate.
    x_data, y_data : features and target passed to ``learning_curve``.
    model_name : text inserted into the figure title.
    """
    sizes, fit_scores, val_scores = learning_curve(
        estimator=model, X=x_data, y=y_data,
        cv=5, train_sizes=np.linspace(0.1, 1.0, 20),
        n_jobs=-1, scoring="r2",
    )
    plt.rcParams.update({"text.usetex": True, "font.family": "serif"})

    # (fold scores, colour, line options) - training curve first, then validation
    curves = (
        (fit_scores, 'blue', dict(marker='o', label=r'Training Score')),
        (val_scores, 'green', dict(marker='d', linestyle='--', label=r'Validation Score')),
    )
    for fold_scores, colour, line_options in curves:
        centre = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        plt.plot(sizes, centre, color=colour, markersize=5, **line_options)
        plt.fill_between(sizes, centre + spread, centre - spread, alpha=0.15, color=colour)

    plt.title(fr'Learning Curve of {model_name}')
    plt.xlabel(r'Training Data Size')
    plt.ylabel(r'Rsquared')
    plt.ylim(0.5,1)
    plt.grid(linestyle = '--', linewidth = 0.5)
    plt.legend(loc='lower right')
    plt.show()


Learning Curve DTR

In [None]:
# RMSE and R-squared learning curves of the baseline decision tree, computed on the training split
learn_plotrmse(model_dtr,x_train,y_train,'DTR')
learn_plotr2(model_dtr,x_train,y_train,'DTR')

Learning Curve HPO DTR

In [None]:
# RMSE and R-squared learning curves of the HPO decision tree, computed on the training split
learn_plotrmse(model_dtr_hpov,x_train,y_train,'DTR + HPO')
learn_plotr2(model_dtr_hpov,x_train,y_train,'DTR + HPO')

Learning Curve RFR

In [None]:
# RMSE and R-squared learning curves of the baseline random forest, computed on the training split
learn_plotrmse(model_rfr_ftr,x_train,y_train,'RFR')
learn_plotr2(model_rfr_ftr,x_train,y_train,'RFR')

Learning Curve RFR + HPO

In [None]:
# RMSE and R-squared learning curves of the HPO random forest, computed on the training split
learn_plotrmse(model_rfr_ftr_hpov,x_train,y_train,'RFR + HPO')
learn_plotr2(model_rfr_ftr_hpov,x_train,y_train,'RFR + HPO')

Learning Curve ETR

In [None]:
# RMSE and R-squared learning curves of the baseline extra-trees model, computed on the training split
learn_plotrmse(model_etr,x_train,y_train,'ETR')
learn_plotr2(model_etr,x_train,y_train,'ETR')

Learning Curve ETR + HPO

In [None]:
# RMSE and R-squared learning curves of the HPO extra-trees model, computed on the training split
learn_plotrmse(model_etr_hpov,x_train,y_train,'ETR + HPO')
learn_plotr2(model_etr_hpov,x_train,y_train,'ETR + HPO')

Box plot for evaluations

In [None]:
# Box plot of the per-fold R-squared scores of the four baseline models
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",
})
scores = [r2_rf,r2_et,r2_dt,r2_mlr]

# Create the figure once, at the intended size.  The earlier
# plt.figure(figsize=(5,3)) followed by plt.subplots() left an empty 5x3
# figure behind and drew the box plot on a second, default-sized one.
fig, ax = plt.subplots(figsize=(5,3))
ax.set_title("Test Data Model Performance")
ax.set_ylabel("R-squared")
ax.set_xlabel("Model")
ax.set_yscale('linear')
ax.set_ylim(0.2,1.0)
ax.boxplot(scores,showmeans=True)
ax.grid(axis='y',linestyle = '--', linewidth = 0.5)
# one tick label per entry of `scores`, in the same order
ax.set_xticklabels(['RFR', 'ETR',
                    'DTR', 'MLR'])
plt.show()

In [None]:
# Box plot of the per-fold error scores held in the RMSE_* arrays
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",
})
# the scores come from negated scorers, so take their magnitude
scores_rmse = [np.abs(RMSE_rf),np.abs(RMSE_et),np.abs(RMSE_dt),np.abs(RMSE_mlr)]

# Create the figure once, at the intended size.  The earlier
# plt.figure(figsize=(5,3)) followed by plt.subplots() left an empty 5x3
# figure behind and drew the box plot on a second, default-sized one.
fig, ax = plt.subplots(figsize=(5,3))
ax.set_title("Test Data Model Performance")
ax.set_ylabel("RMSE")
ax.set_xlabel("Model")
ax.set_ylim(0,1.2)
ax.set_yscale('linear')
ax.boxplot(scores_rmse,showmeans=True)
ax.grid(axis='y',linestyle = '--', linewidth = 0.5)
# one tick label per entry of `scores_rmse`, in the same order
ax.set_xticklabels(['RFR', 'ETR',
                    'DTR', 'MLR'])
plt.show()

Saving model to local directory

In [None]:
# # Saving the model to local directory

# filename = 'savemodel_rfr_ftr.sav'
# joblib.dump(model_rfr_ftr,filename)

# filename = 'savemodel_rf.sav'
# joblib.dump(model_rf,filename)

# # filename = 'savemodel_rfr_ftr_hpo.sav'
# # joblib.dump(model_rfr_ftr_hpo,filename)

# filename = 'savemodel_rfr_ftr_hpov.sav'
# joblib.dump(model_rfr_ftr_hpov,filename)

# filename = 'savemodel_etr.sav'
# joblib.dump(model_etr,filename)

# filename = 'savemodel_etr_hpov.sav'
# joblib.dump(model_etr_hpov,filename)

# filename = 'savemodel_dtr_ftr.sav'
# joblib.dump(model_dtr,filename)

# filename = 'savemodel_dtr_hpov.sav'
# joblib.dump(model_dtr_hpov,filename)

# filename = 'savemodel_mlr_ftr.sav'
# joblib.dump(model_mlr,filename)