# Machine learning models

This notebook contains sample code for the machine learning models (XGBoost, random forest and ANN) created for the Crest office. In a similar way, the dataset of another office was split into 3-, 6-, 9- and 12-month subsets to see how the models perform on datasets of different sizes. Three algorithms were used to create temperature, CO2 and TVOC models. The scikit-learn library is used for the models (XGBoost is used through its scikit-learn compatible interface).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import sklearn as sk
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, r2_score,mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression

In [None]:
import os
os.getcwd()

In [None]:
#change working directory"
os.chdir("D:\\OneDrive - Loughborough University\\PhD\\Case study\\Crest\\data_ai_model")

# XGBOOST Model - AQ 1 temperature

First is the XGBoost model. XGBoost is a gradient-boosted tree model, which is a type of ensemble learning. It works by sequentially adding weak models, each aiming to correct the errors made by the previous models. On tabular data it is often faster to train than deep learning models and frequently at least as accurate.

In [None]:
#Import the data
data = pd.read_csv("crestdataformodel_2.csv")

In [None]:
#Copy the dataset so we dont make any changes to the original data
df_aq1temp = data.copy(deep=True)

In [None]:
#drop the unrelevant columns
df_aq1temp.drop(columns=['timestamp','co2_aq1','co2_aq2','co2_aq3','co2_aq4','humi_aq1','humi_aq2','humi_aq3','humi_aq4','tvoc_aq1','tvoc_aq2','tvoc_aq3','tvoc_aq4','temp_aq2','temp_aq3'], inplace=True)

In [None]:
df_aq1temp.head()

In [None]:
#Check the shape of the dataset
df_aq1temp.shape

In [None]:
#Check the null values in the dataset
df_aq1temp.isna().sum()

In [None]:
#Drop the NaN values
df_aq1temp = df_aq1temp[df_aq1temp['temp_aq1'].notna()]
df_aq1temp = df_aq1temp[df_aq1temp['temp_aq4'].notna()]

In [None]:
#Verify whether the NaN values are dropped
df_aq1temp.isna().sum()

In [None]:
#Shape of the dataset
df_aq1temp.shape

## Train - Test split

In [None]:
#Assign x and y variables

# Target: the AQ1 temperature, kept as a one-column DataFrame because later
# cells read it back as `y_val.temp_aq1` / `y_test.temp_aq1`.
# Predictors: every other column that survived the column drop above.
# NOTE(review): the original selections were empty (`df_aq1temp[[]]`), which
# gives the models zero features and no target. Confirm that all remaining
# columns are the intended predictors.
target_column = 'temp_aq1'
x = df_aq1temp.drop(columns=[target_column])
y = df_aq1temp[[target_column]]

In [None]:
#Split train test set

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15,random_state=42)

In [None]:
#Split training set to train and validation set

x_train_oly, x_val, y_train_oly, y_val = train_test_split(x_train, y_train, test_size=0.15, random_state=42)

In [None]:
x_train_oly.shape

In [None]:
x_val.shape

In [None]:
x_test.shape

## Fit the model

In [None]:
dtrain = xgb.DMatrix(x_train_oly, label=y_train_oly)

In [None]:
dtrain

In [None]:
reg_aq1_temp = xgb.XGBRegressor(n_estimators = 100, max_depth = 10, eval_metric = mean_squared_error, random_state = 42)

In [None]:
#Fit the Xgboost model
reg_aq1_temp.fit(x_train_oly, y_train_oly, eval_set = [(x_val, y_val)])

In [None]:
#Parameters of the model
reg_aq1_temp.score

In [None]:
#check r2 value of validation set
r2_score(reg_aq1_temp.predict(x_val), y_val)

In [None]:
#Check the mse of validation set
mean_squared_error(reg_aq1_temp.predict(x_val), y_val)

In [None]:
#Check the mape of validation set
mean_absolute_percentage_error(reg_aq1_temp.predict(x_val), y_val)

## Hyperparameter optimisation

In [None]:
# Scoring used by the search: MSE wrapped so that "greater is better" holds
# (the scorer negates the error). The parameter grid follows below.
negated_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
scoring = {'mse': negated_mse_scorer}
# Candidate hyperparameter values sampled by the randomized search for XGBoost
param_grid = {
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.3, 1],
    "n_estimators": range(50, 400, 10),
    "max_depth": range(2, 20),
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    "reg_lambda": [1e-5, 1e-2, 0.1, 1, 10, 100],
}

In [None]:
#assign the cv
cv = ShuffleSplit(n_splits=5, test_size=0.2,random_state=42)

In [None]:
random_search_aq1_temp = RandomizedSearchCV(estimator=reg_aq1_temp, 
                           param_distributions=param_grid, 
                           n_iter=500,
                           scoring=scoring, 
                           refit='mse',
                           n_jobs=-1, 
                           cv=cv, 
                           verbose=2)

In [None]:
#Hyperparameter optimisation for find the best possible hyperparameter for the model
%%time
random_search_aq1_temp.fit(x_train_oly, y_train_oly)

In [None]:
#Display the best hyperparameters from the optimisation
random_search_aq1_temp.best_params_

In [None]:
#Check the mse, mape and r2 for validation and test set using optimised model

# Predict once per model/data set and reuse the result for all three metrics.
pred_val_tuned_xgb = random_search_aq1_temp.predict(x_val)
pred_test_tuned_xgb = random_search_aq1_temp.predict(x_test)
pred_test_base_xgb = reg_aq1_temp.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order; R2 and MAPE give
# different values when the two arguments are swapped.
mse_val_random_search_aq1_temp = mean_squared_error(y_val, pred_val_tuned_xgb)

mape_val_random_search_aq1_temp = mean_absolute_percentage_error(y_val, pred_val_tuned_xgb)

r2_val_random_search_aq1_temp = r2_score(y_val, pred_val_tuned_xgb)

mse_test_random_search_aq1_temp = mean_squared_error(y_test, pred_test_tuned_xgb)

mape_test_random_search_aq1_temp = mean_absolute_percentage_error(y_test, pred_test_tuned_xgb)

r2_test_random_search_aq1_temp = r2_score(y_test, pred_test_tuned_xgb)

# NOTE: despite the "_val_" in their names, the three baseline values below
# are computed on the TEST set (names kept because the print cell uses them).
mse_val_reg_aq1_temp = mean_squared_error(y_test, pred_test_base_xgb)

mape_val_reg_aq1_temp = mean_absolute_percentage_error(y_test, pred_test_base_xgb)

r2_val_reg_aq1_temp = r2_score(y_test, pred_test_base_xgb)

In [None]:
# Print the XGBoost metrics one per line: tuned model on the validation set,
# tuned model on the test set, then the baseline model (MSE, MAPE, R2 each).
xgb_metric_values = (
    mse_val_random_search_aq1_temp,
    mape_val_random_search_aq1_temp,
    r2_val_random_search_aq1_temp,
    mse_test_random_search_aq1_temp,
    mape_test_random_search_aq1_temp,
    r2_test_random_search_aq1_temp,
    mse_val_reg_aq1_temp,
    mape_val_reg_aq1_temp,
    r2_val_reg_aq1_temp,
)
for metric_value in xgb_metric_values:
    print(metric_value)

In [None]:
#Plot prediction vs validation data
plot_df = pd.DataFrame([random_search_aq1_temp.predict(x_val), y_val.temp_aq1.values]).T
plot_df.rename(columns={0:'prediction', 1:'actual'}, inplace=True)
plot_df

In [None]:
#plot prediction vs test data
plot_df_1 = pd.DataFrame([random_search_aq1_temp.predict(x_test), y_test.temp_aq1.values]).T
plot_df_1.rename(columns={0:'prediction', 1:'actual'}, inplace=True)
plot_df_1

In [None]:
# Persist the fitted XGBoost search object (with its refitted best model) as a pickle file
import pickle

xgb_model_path = 'xgb_temp_aq1.pkl'
with open(xgb_model_path, 'wb') as model_file:
    pickle.dump(random_search_aq1_temp, model_file)

In [None]:
#Import the model
import pickle
with open('xgb_temp_aq9_9month.pkl', 'rb') as f:
    regwww_temp9 = pickle.load(f)

In [None]:
from xgboost import plot_tree
import matplotlib.pyplot as plt

# Plot the first tree
plot_tree(regwww_temp9, num_trees=0)
plt.show()


# RANDOM FOREST Model FOR AQ 1 Temperature

Random forest is an ensemble of decision trees. It is easy to use and less prone to overfitting than a single decision tree.

In [None]:
reg_aq1_temp1 = RandomForestRegressor(n_estimators = 100, max_depth = 10, random_state = 42)

In [None]:
#fit the randomforest model
reg_aq1_temp1.fit(x_train_oly, y_train_oly)

In [None]:
#Check the parameters
reg_aq1_temp1.score

In [None]:
#check r2 value of validation set
r2_score(reg_aq1_temp1.predict(x_val), y_val)

In [None]:
#check meansquarederror value of validation set
mean_squared_error(reg_aq1_temp1.predict(x_val), y_val)

In [None]:
#check mape value of validation set
mean_absolute_percentage_error(reg_aq1_temp1.predict(x_val), y_val)

In [None]:
from scipy.stats import randint

# Search space for the random forest. Lists are sampled uniformly;
# randint(low, high) draws integers from [low, high).
param_grid = {
    'bootstrap': [True, False],
    'max_depth': randint(1, 100),
    # 'auto' was removed from scikit-learn (1.3); for a regressor it meant
    # "use all features", which is spelled 1.0 (float = fraction of features).
    # The integers 2 and 3 are absolute feature counts.
    'max_features': [1.0, 'sqrt', 'log2', 2, 3],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 4, 8, 10],
    'n_estimators': randint(50, 400),
}

In [None]:
scoring = {'mse': make_scorer(mean_squared_error,greater_is_better=False)}

In [None]:
cv = ShuffleSplit(n_splits=5, test_size=0.2,random_state=42)

In [None]:
random_search_aq1_temp1 = RandomizedSearchCV(estimator=reg_aq1_temp1, 
                           param_distributions=param_grid, 
                           n_iter=100,
                           scoring=scoring, 
                           refit='mse',
                           n_jobs=-1, 
                           cv=cv, 
                           verbose=2)

In [None]:
%%time
random_search_aq1_temp1.fit(x_train_oly, y_train_oly)

In [None]:
#best hyperparameter from the optimised model
random_search_aq1_temp1.best_params_

In [None]:
#Check the mse, mape and r2 for validation and test set using optimised model

# Predict once per model/data set and reuse the result for all three metrics.
pred_val_tuned_rf = random_search_aq1_temp1.predict(x_val)
pred_test_tuned_rf = random_search_aq1_temp1.predict(x_test)
pred_test_base_rf = reg_aq1_temp1.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order; R2 and MAPE give
# different values when the two arguments are swapped.
mse_val_random_search_aq1_temp1 = mean_squared_error(y_val, pred_val_tuned_rf)

mape_val_random_search_aq1_temp1 = mean_absolute_percentage_error(y_val, pred_val_tuned_rf)

r2_val_random_search_aq1_temp1 = r2_score(y_val, pred_val_tuned_rf)

mse_test_random_search_aq1_temp1 = mean_squared_error(y_test, pred_test_tuned_rf)

mape_test_random_search_aq1_temp1 = mean_absolute_percentage_error(y_test, pred_test_tuned_rf)

r2_test_random_search_aq1_temp1 = r2_score(y_test, pred_test_tuned_rf)

# NOTE: despite the "_val_" in their names, the three baseline values below
# are computed on the TEST set (names kept because the print cell uses them).
mse_val_reg_aq1_temp1 = mean_squared_error(y_test, pred_test_base_rf)

mape_val_reg_aq1_temp1 = mean_absolute_percentage_error(y_test, pred_test_base_rf)

r2_val_reg_aq1_temp1 = r2_score(y_test, pred_test_base_rf)

In [None]:
# Print the random forest metrics one per line: tuned model on the validation
# set, tuned model on the test set, then the baseline model (MSE, MAPE, R2 each).
rf_metric_values = (
    mse_val_random_search_aq1_temp1,
    mape_val_random_search_aq1_temp1,
    r2_val_random_search_aq1_temp1,
    mse_test_random_search_aq1_temp1,
    mape_test_random_search_aq1_temp1,
    r2_test_random_search_aq1_temp1,
    mse_val_reg_aq1_temp1,
    mape_val_reg_aq1_temp1,
    r2_val_reg_aq1_temp1,
)
for metric_value in rf_metric_values:
    print(metric_value)

In [None]:
plot_df1 = pd.DataFrame([random_search_aq1_temp1.predict(x_val), y_val.temp_aq1.values]).T
plot_df1.rename(columns={0:'prediction', 1:'actual'}, inplace=True)
plot_df1

In [None]:
plot_df1_1 = pd.DataFrame([random_search_aq1_temp1.predict(x_test), y_test.temp_aq1.values]).T
plot_df1_1.rename(columns={0:'prediction', 1:'actual'}, inplace=True)
plot_df1_1

In [None]:
# Persist the fitted random forest search object as a pickle file
import pickle

rf_model_path = 'rf_temp_aq1.pkl'
with open(rf_model_path, 'wb') as model_file:
    pickle.dump(random_search_aq1_temp1, model_file)

# ANN Model FOR AQ 1 Temperature

An artificial neural network (ANN) is a deep learning model built from multiple layers of neurons.

## Fit the model

In [None]:
#Fit the ANN model
reg_aq1_temp2 = MLPRegressor(random_state=42, max_iter=500, hidden_layer_sizes=(10,), activation='relu')
reg_aq1_temp2.fit(x_train_oly, np.ravel(y_train_oly))

In [None]:
r2_score(reg_aq1_temp2.predict(x_val), y_val)

In [None]:
mean_squared_error(reg_aq1_temp2.predict(x_val), y_val)

In [None]:
mean_absolute_percentage_error(reg_aq1_temp2.predict(x_val), y_val)

## Hyperparameter Optimisation

In [None]:
# Negated-MSE scorer so that a larger score means a better model
negated_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
scoring = {'mse': negated_mse_scorer}
# Search space for the MLP. Each architecture appears once: the original list
# contained (100,100,100) twice, which over-weights that layout and spends
# fits on an identical configuration.
param_grid = {
    'hidden_layer_sizes': [
        (10,), (8,), (15,), (10, 10), (5,),
        (10, 10, 10), (50, 50, 50), (100, 100, 100),
    ],
    'activation': ['relu', 'tanh', 'logistic'],
    'alpha': [0.0001, 0.001, 0.01, 0.1, 0.05],
    # per the scikit-learn docs the learning-rate schedule is only used when solver='sgd'
    'learning_rate': ['constant', 'adaptive'],
    'solver': ['adam', 'sgd'],
}

In [None]:
# Exhaustive grid search over the ANN parameter grid.
# The estimator was `reg_aq9_temp2`, which is not defined in this notebook
# (NameError); the AQ1 baseline network fitted above is used instead.
# NOTE: this search object is only constructed here - it is never fitted in
# this notebook.
grid_search_aq9_temp2 = GridSearchCV(
    estimator=reg_aq1_temp2,
    param_grid=param_grid,
    scoring=scoring,
    refit='mse',
    n_jobs=-1,
    cv=5,
    verbose=2,
)

In [None]:
cv = ShuffleSplit(n_splits=5, test_size=0.2,random_state=42)

In [None]:
random_search_aq1_temp2 = RandomizedSearchCV(estimator=reg_aq1_temp2, 
                           param_distributions=param_grid, 
                           n_iter=100,
                           scoring=scoring, 
                           refit='mse',
                           n_jobs=-1, 
                           cv=cv, 
                           verbose=2)

In [None]:
%%time
random_search_aq1_temp2.fit(x_train_oly, np.ravel(y_train_oly))

In [None]:
print("Best parameters found: ", grid_search_aq9_temp2.best_params_)
print("Best score: ", grid_search_aq9_temp2.best_score_)

In [None]:
random_search_aq1_temp2.best_params_

In [None]:
#Check the mse, mape and r2 for validation and test set using optimised model

# Predict once per model/data set and reuse the result for all three metrics.
pred_val_tuned_ann = random_search_aq1_temp2.predict(x_val)
pred_test_tuned_ann = random_search_aq1_temp2.predict(x_test)
pred_test_base_ann = reg_aq1_temp2.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order; R2 and MAPE give
# different values when the two arguments are swapped.
mse_val_random_search_aq1_temp2 = mean_squared_error(y_val, pred_val_tuned_ann)

mape_val_random_search_aq1_temp2 = mean_absolute_percentage_error(y_val, pred_val_tuned_ann)

r2_val_random_search_aq1_temp2 = r2_score(y_val, pred_val_tuned_ann)

mse_test_random_search_aq1_temp2 = mean_squared_error(y_test, pred_test_tuned_ann)

mape_test_random_search_aq1_temp2 = mean_absolute_percentage_error(y_test, pred_test_tuned_ann)

r2_test_random_search_aq1_temp2 = r2_score(y_test, pred_test_tuned_ann)

# NOTE: despite the "_val_" in their names, the three baseline values below
# are computed on the TEST set (names kept because the print cell uses them).
mse_val_reg_aq1_temp2 = mean_squared_error(y_test, pred_test_base_ann)

mape_val_reg_aq1_temp2 = mean_absolute_percentage_error(y_test, pred_test_base_ann)

r2_val_reg_aq1_temp2 = r2_score(y_test, pred_test_base_ann)

In [None]:
# Print the ANN metrics one per line: tuned model on the validation set,
# tuned model on the test set, then the baseline model (MSE, MAPE, R2 each).
ann_metric_values = (
    mse_val_random_search_aq1_temp2,
    mape_val_random_search_aq1_temp2,
    r2_val_random_search_aq1_temp2,
    mse_test_random_search_aq1_temp2,
    mape_test_random_search_aq1_temp2,
    r2_test_random_search_aq1_temp2,
    mse_val_reg_aq1_temp2,
    mape_val_reg_aq1_temp2,
    r2_val_reg_aq1_temp2,
)
for metric_value in ann_metric_values:
    print(metric_value)

In [None]:
plot_df2 = pd.DataFrame([random_search_aq1_temp2.predict(x_val), y_val.temp_aq1.values]).T
plot_df2.rename(columns={0:'prediction', 1:'actual'}, inplace=True)
plot_df2

In [None]:
plot_df2_1 = pd.DataFrame([random_search_aq1_temp2.predict(x_test), y_test.temp_aq1.values]).T
plot_df2_1.rename(columns={0:'prediction', 1:'actual'}, inplace=True)
plot_df2_1 

In [None]:
# Persist the fitted ANN search object as a pickle file
import pickle

ann_model_path = 'mlp_temp_aq1.pkl'
with open(ann_model_path, 'wb') as model_file:
    pickle.dump(random_search_aq1_temp2, model_file)