# CPE 695 - Applied A.I. Final Project Code

# Setting up the Variables & Reading Data

In [1]:
# Importing the libraries

# Turning off warnings
# https://queirozf.com/entries/suppressing-ignoring-warnings-in-python-reference-and-examples
# at the top of the file, before other imports
import warnings
warnings.filterwarnings('ignore')
# no warnings will be printed from now on

import numpy as np
import matplotlib.pyplot as plt
import pandas as pandas
from sklearn.metrics import mean_squared_error # does the mean squared error calculations
from numpy.linalg import inv # Takes the inverse of a matrix
from sklearn.model_selection import train_test_split # Splits the Training Data and Test Data
np.seterr(all='ignore') # overflow encountered in square python. Ignores errors 
from sklearn.linear_model import LinearRegression # linear regression algorithm class
from sklearn.tree import DecisionTreeClassifier # can use decision trees with this for Classification
from sklearn.tree import DecisionTreeRegressor # can use decision trees with this, for regression
from sklearn.model_selection import cross_val_score
from sklearn.tree import export_graphviz # plots trees
from sklearn import tree # trees
from sklearn.tree import export_graphviz # used for graphing trees
import graphviz # used for graphing trees
import pydotplus # used for graphing dot files from graphviz into png
import io # used saving dot text into a string text input stream
from sklearn.model_selection import GridSearchCV # for pruning trees
from sklearn.metrics import classification_report # for generating classification reports for accuracy
from sklearn.ensemble import RandomForestClassifier # random forest trees classifier
from sklearn.ensemble import RandomForestRegressor # random forest trees regression
from sklearn.metrics import confusion_matrix # computes confusion_matrix
from sklearn.neural_network import MLPClassifier  # ANNs Classification
from sklearn.neural_network import MLPRegressor # ANN Regression
from sklearn.datasets import make_classification  # ANNs
from sklearn.metrics import accuracy_score # calculates accuracy scores
from sklearn import preprocessing
from sklearn.ensemble import BaggingRegressor
from sklearn import preprocessing 
import sklearn.metrics as sm # for calculating R2 score

# style of the plot grid
plt.style.use('seaborn-whitegrid')

# Read CSV File - researched online on how to use proper methods for reading CSV files
# https://www.kite.com/python/answers/how-to-set-column-names-when-importing-a-csv-into-a-pandas-dataframe-in-python
# Headers list: Date, Rented Bike Count, Hour, Temperature(°C), Humidity(%), Wind speed (m/s), Visibility (10m), 
#               Dew point temperature(°C), Solar Radiation (MJ/m2), Rainfall(mm), Snowfall (cm), Seasons, 
#               Holiday, Functioning Day
# Step 1: set up headers information
headers_list = ["Date", "Rented Bike Count", "Hour", "Temperature(°C)", "Humidity(%)", 
                "Wind speed (m/s)", "Visibility (10m)", "Dew point temperature(°C)", 
                "Solar Radiation (MJ/m2)", "Rainfall(mm)", "Snowfall (cm)", "Seasons", 
                "Holiday", "Functioning Day"
               ]

# Step 2: use Pandas read_csv function and populate a variable with the csv data. 
# https://stackoverflow.com/questions/54304551/python-csv-file-reading-turning-the-first-row-into-column-headers-nextreader
# Skip the first row, since it's a header
seoulbike_file_prime = pandas.read_csv('SeoulBikeData.csv', names=headers_list, skiprows=[0])
print('Finished reading the "SeoulBikeData.csv" file!')

seoulbike_file_prime.head(10) # prints the table's first 10 rows as a sample in a nice format. 

Finished reading the "SeoulBikeData.csv" file!


Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes
5,01/12/2017,100,5,-6.4,37,1.5,2000,-18.7,0.0,0.0,0.0,Winter,No Holiday,Yes
6,01/12/2017,181,6,-6.6,35,1.3,2000,-19.5,0.0,0.0,0.0,Winter,No Holiday,Yes
7,01/12/2017,460,7,-7.4,38,0.9,2000,-19.3,0.0,0.0,0.0,Winter,No Holiday,Yes
8,01/12/2017,930,8,-7.6,37,1.1,2000,-19.8,0.01,0.0,0.0,Winter,No Holiday,Yes
9,01/12/2017,490,9,-6.5,27,0.5,1928,-22.4,0.23,0.0,0.0,Winter,No Holiday,Yes


# STEP 0: Functions and Regression Model Set-Up

In [2]:
# define a function to automatically get the predictions of the Linear Regression
def get_Linear_Reg_Predictions(X_train_def, X_test_def, y_train_def, bagging_switch):
    #https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
        
    # Linear Regressor 
    Linear_Regressor = LinearRegression()
    
    Bagged_Linear_Regressor = BaggingRegressor(base_estimator=Linear_Regressor, n_estimators=10, random_state=0)
        
    if (bagging_switch == True):
        # Fit the X and Y training data
        Linear_Regressor_Train = Bagged_Linear_Regressor.fit(X_train_def, y_train_def)
    elif (bagging_switch == False):
        # Fit the X and Y training data
        Linear_Regressor_Train = Linear_Regressor.fit(X_train_def, y_train_def)
    
    # Predict the Results
    Linear_Regressor_Predictions = Linear_Regressor_Train.predict(X_test_def)
    
    return Linear_Regressor_Predictions

# define a function to automatically get the GridSearchCV Parameters for Trees
def get_tree_params(X_train_def, X_test_def, y_train_def):
    
     # GridSearchCV Parameter Estimation
    params = {'max_leaf_nodes': list(range(2, 100)), 'max_depth': list(range(2, 100))}
    grid_search_cv = GridSearchCV(DecisionTreeRegressor(random_state=42), params, verbose=1, n_jobs=-1, cv=3)
    
    # fit the training data using GridSearchCV()
    grid_search_cv.fit(X_train_def, y_train_def)
    # finding the max parameter for max_leaf_nodes
    grid_search_cv_best_estimator = grid_search_cv.best_estimator_
    
    # extracting max_leaf_nodes out of the variable
    max_leaf_nodes_out = grid_search_cv_best_estimator.get_params(deep=True)['max_leaf_nodes']
    max_depth_out = grid_search_cv_best_estimator.get_params(deep=True)['max_depth']
    
    return max_leaf_nodes_out, max_depth_out

# define a function to automatically get the predictions of the Decision Tree
def get_Decision_Tree_Predictions(X_train_def, X_test_def, y_train_def, max_leaf_nodes, max_depth, bagging_switch):
    #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
    
    # Create a Decision Tree  
    Decision_Tree_Regressor = DecisionTreeRegressor(
                                    max_leaf_nodes = max_leaf_nodes, 
                                    max_depth = max_depth,
                                    random_state = 0
                                    )
    Bagged_Decision_Tree_Regressor = BaggingRegressor(base_estimator=Decision_Tree_Regressor, n_estimators=10, random_state=0)
    
    if (bagging_switch == True):
        # Fit the X and Y training data
        Decision_Tree_Regressor_Train = Bagged_Decision_Tree_Regressor.fit(X_train_def, y_train_def)
    elif (bagging_switch == False):
        # Fit the X and Y training data
        Decision_Tree_Regressor_Train = Decision_Tree_Regressor.fit(X_train_def, y_train_def)
    
    # Predict the Results
    Decision_Tree_Regressor_Predictions = Decision_Tree_Regressor_Train.predict(X_test_def)
    
    return Decision_Tree_Regressor_Predictions

# define a function to automatically get the predictions of the Random Forests
def get_Random_Forests_Predictions(X_train_def, X_test_def, y_train_def, max_leaf_nodes, max_depth, bagging_switch):
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
    
    # Create a Random Forests
    Random_Forest_Regressor = RandomForestRegressor(
                                    max_leaf_nodes = max_leaf_nodes, 
                                    max_depth = max_depth,
                                    random_state = 0
                                    )
    Bagged_Random_Forests_Regressor = BaggingRegressor(base_estimator=Random_Forest_Regressor, n_estimators=10, random_state=0)

    if (bagging_switch == True):
        # Fit the X and Y training data
        Random_Forest_Regressor_Train = Bagged_Random_Forests_Regressor.fit(X_train_def, y_train_def)
    elif (bagging_switch == False):
        # Fit the X and Y training data
        Random_Forest_Regressor_Train = Random_Forest_Regressor.fit(X_train_def, y_train_def)
    
    # Predict the Results
    Random_Forest_Regressor_Predictions = Random_Forest_Regressor_Train.predict(X_test_def)
    
    return Random_Forest_Regressor_Predictions    

# define a function to automatically get the GridSearchCV Parameters for ANN
def get_ANN_params(X_train_def, X_test_def, y_train_def):
    
     # GridSearchCV Parameter Estimation
    params = {'hidden_neurons': list(range(0, 10)), 
              'alpha': list(range(0.0, 1.0)),
              'learning_rate_init': list(range(0.0, 0.5)),
              'momentum': list(range(0.0, 1.0)),
             }
    grid_search_cv = GridSearchCV(MLPRegressor(random_state=42), params, verbose=1, n_jobs=-1, cv=3)
    
    # fit the training data using GridSearchCV()
    grid_search_cv.fit(X_train_def, y_train_def)
    # finding the max parameter for max_leaf_nodes
    grid_search_cv_best_estimator = grid_search_cv.best_estimator_
    
    # extracting max_leaf_nodes out of the variable
    hidden_neurons_out = grid_search_cv_best_estimator.get_params(deep=True)['hidden_neurons']
    alpha_out = grid_search_cv_best_estimator.get_params(deep=True)['alpha']
    learning_rate_init_out = grid_search_cv_best_estimator.get_params(deep=True)['learning_rate_init']
    momentum_out = grid_search_cv_best_estimator.get_params(deep=True)['momentum']
    
    return hidden_neurons_out, alpha_out, learning_rate_init_out, momentum_out

# define a function to automatically get the predictions of the ANN's MLPs
def get_ANN_Predictions(hidden_neurons, X_train_def, X_test_def, y_train_def, bagging_switch):
    # Resource links:
    # https://scikit-learn.org/stable/modules/neural_networks_supervised.html
    # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
    # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    # ANN - MLP Settings
    Hidden_Layers = (hidden_neurons, hidden_neurons) # Needs to be a tuple for MLP Class - use 2 layers for this project
    activation_setting = 'logistic' # the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)).
    #solver_setting = 'adam' # refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba
    solver_setting = 'sgd' # refers to stochastic gradient descent.
    alpha_setting = 0.0000001 # L2 penalty (regularization term) parameter.
    learning_rate_setting = 'constant' # 'constant' is a constant learning rate given by 'learning_rate_init'.
    learning_rate_init_setting = 0.1 # The initial learning rate used. It controls the step-size in updating the weights.
    momentum_setting = 0.5 # Momentum for gradient descent update. Should be between 0 and 1. Only used when solver=’sgd’.
    #early_stopping_setting = True # f set to true, it will automatically set aside 10% of training data as validation 
                                  # and terminate training when validation score is not improving by at least tol for 
                                  # n_iter_no_change consecutive epochs.

    # Create a Multi-Layer Perceptron Regressor 
    MLP_Regressor = MLPRegressor(hidden_layer_sizes = Hidden_Layers, 
                                   #activation = activation_setting, # gives same result if this is turned on. Turned off
                                   #solver = solver_setting,         # gives same result if this is turned on. Turned off
                                   alpha = alpha_setting, 
                                   learning_rate = learning_rate_setting, 
                                   learning_rate_init = learning_rate_init_setting,
                                   momentum = momentum_setting,
                                   #early_stopping = early_stopping_setting,
                                   random_state=1, max_iter=2500
                                  )
    Bagged_MLP_Regressor = BaggingRegressor(base_estimator=MLP_Regressor, n_estimators=10, random_state=0)

    if (bagging_switch == True):
        # Fit the X and Y training data
        MLP_Regressor_Train = Bagged_MLP_Regressor.fit(X_train_def, y_train_def)
    elif (bagging_switch == False):
        # Fit the X and Y training data
        MLP_Regressor_Train = MLP_Regressor.fit(X_train_def, y_train_def)
    
    # Predict the Results
    MLP_Regressor_Predictions = MLP_Regressor_Train.predict(X_test_def)
    #print("MLP_Regressor_Predictions:",MLP_Regressor_Predictions)
    
    return MLP_Regressor_Predictions

# STEP 1: Clean Up Data (Encoding + Delete Unnecessary Rows)

In [3]:
# STEP 1: Clean up Data
# Ignore Date. It is unimportant

# Convert strings to discrete values
le = preprocessing.LabelEncoder()

le.fit(seoulbike_file_prime['Seasons'])
seoulbike_file_prime['Seasons'] = le.transform(seoulbike_file_prime['Seasons'])

le.fit(seoulbike_file_prime['Holiday'])
seoulbike_file_prime['Holiday'] = le.transform(seoulbike_file_prime['Holiday'])

le.fit(seoulbike_file_prime['Functioning Day'])
seoulbike_file_prime['Functioning Day'] = le.transform(seoulbike_file_prime['Functioning Day'])

# initialize the seoulbike data - used as a way to prevent overwrites on existing copies
seoulbike_file = seoulbike_file_prime

# This section chooses the most impactful columns. Trims down the excess noise
# Tier List of Most Impactful Columns
# 1. Functioning Day
# 2. Season
# 3. Dew point temperature(°C)
# 4. Temperature(°C)
# 5. Humidity(%)
# 6. Rainfall(mm)
refined_seoulbike_columns = ["Rented Bike Count", 
                             "Functioning Day", 
                             "Seasons", "Dew point temperature(°C)",
                             "Temperature(°C)", "Humidity(%)", "Rainfall(mm)", 
                             
                             "Visibility (10m)", "Solar Radiation (MJ/m2)", "Hour", 
                             "Wind speed (m/s)", "Holiday", "Snowfall (cm)"
                            ]
refined_seoulbike = seoulbike_file[refined_seoulbike_columns]
before_shape = refined_seoulbike.shape[0]
print("Before removal of unnecessary data:",before_shape)



# REMOVING OUTLIERS/UNNECESSARY DATA
# Remove rows with 0 rented bikes. Remove the usage of Functioning Day
refined_seoulbike = refined_seoulbike[refined_seoulbike["Rented Bike Count"]>0]



after_shape = refined_seoulbike.shape[0]
print("After removal of unnecessary data:",refined_seoulbike.shape[0])
print("Removed",(before_shape-after_shape), "rows!")

refined_seoulbike.head(10) # prints the table's first 10 rows as a sample in a nice format.

Before removal of unnecessary data: 8760
After removal of unnecessary data: 8465
Removed 295 rows!


Unnamed: 0,Rented Bike Count,Functioning Day,Seasons,Dew point temperature(°C),Temperature(°C),Humidity(%),Rainfall(mm),Visibility (10m),Solar Radiation (MJ/m2),Hour,Wind speed (m/s),Holiday,Snowfall (cm)
0,254,1,3,-17.6,-5.2,37,0.0,2000,0.0,0,2.2,1,0.0
1,204,1,3,-17.6,-5.5,38,0.0,2000,0.0,1,0.8,1,0.0
2,173,1,3,-17.7,-6.0,39,0.0,2000,0.0,2,1.0,1,0.0
3,107,1,3,-17.6,-6.2,40,0.0,2000,0.0,3,0.9,1,0.0
4,78,1,3,-18.6,-6.0,36,0.0,2000,0.0,4,2.3,1,0.0
5,100,1,3,-18.7,-6.4,37,0.0,2000,0.0,5,1.5,1,0.0
6,181,1,3,-19.5,-6.6,35,0.0,2000,0.0,6,1.3,1,0.0
7,460,1,3,-19.3,-7.4,38,0.0,2000,0.0,7,0.9,1,0.0
8,930,1,3,-19.8,-7.6,37,0.0,2000,0.01,8,1.1,1,0.0
9,490,1,3,-22.4,-6.5,27,0.0,1928,0.23,9,0.5,1,0.0


# STEP 2: Splitting Data into Training/ Testing Dataset + NORMALIZATION

In [23]:
# STEP 2: Creating Independent and Dependent Data
# Create a subset of the table with indepdendent variables and dependent variables
subset_list_independent = [#"Functioning Day",
                           "Seasons", "Dew point temperature(°C)",
                           "Temperature(°C)", "Humidity(%)", "Rainfall(mm)", 
                           
                           "Visibility (10m)", "Solar Radiation (MJ/m2)", 
                           "Hour",
                           "Wind speed (m/s)", 
                           "Holiday", 
                           "Snowfall (cm)"
                          ]
subset_list_dependent = ['Rented Bike Count']
independent_seoulbike = refined_seoulbike[subset_list_independent]
dependent_seoulbike = refined_seoulbike[subset_list_dependent]

# STEP 2.1: # Take a random 80% samples for training and the rest 20% for test.
X_train, X_test, y_train, y_test = train_test_split(independent_seoulbike, dependent_seoulbike, test_size=0.2)

# STEP 2.2 NORMALIZE TRAINING AND TESTING DATA INDIVIDUALLY
# https://stackoverflow.com/questions/49444262/normalize-data-before-or-after-split-of-training-and-testing-data

bool_MEAN_NORMALIZATION = False # This makes it so that the data is normalized by mean value
bool_MIN_MAX_NORMALIZATION = True # This makes it so that the data is normalized by min-max values

if (bool_MEAN_NORMALIZATION == True):
    print("Data is normalized by the use of Mean Normalization!")
    X_train=(X_train-X_train.mean())/X_train.std()
    X_test=(X_test-X_test.mean())/X_test.std()
elif (bool_MIN_MAX_NORMALIZATION == True):
    print("Data is normalized by the use of Min-Max Normalization!")
    X_train=(X_train-X_train.min())/(X_train.max()-X_train.min())
    X_test=(X_test-X_test.min())/(X_test.max()-X_test.min())
else:
    print("Data is not normalized! Warning! If intentional, proceed forward!")

# STEP 2.3: Print Split Data
# Print all the details of the new datasets formed
print("\nSize of X_train:",len(X_train), "\tPercentage:", round( len(X_train)/len(independent_seoulbike), 2)  )
print("Size of X_test :" ,len(X_test) ,"\tPercentage:", round( len(X_test)/len(independent_seoulbike), 2)  )
print("Size of total independent_seoulbike(X):",len(independent_seoulbike))
print("")
print("Size of y_train:",len(y_train), "\tPercentage:", round( len(y_train)/len(dependent_seoulbike), 2)  )
print("Size of y_test :" ,len(y_test) ,"\tPercentage:", round( len(y_test)/len(dependent_seoulbike), 2)  )
print("Size of total dependent_seoulbike(Y):",len(dependent_seoulbike))

print("\nThe data is split close to how we want it. Close to 80% for Training, and 20% for Testing\n")

print("Sample data of first 3 rows of TRAINING DATASET. Just a preview of how it looks:")
print('=================================')
print("First 3 samples of X_train:\n",X_train.head(3))
print("\nFirst 3 samples of y_train:\n",y_train.head(3))
print('=================================\n')
print("Sample data of first 3 rows of TESTING DATASET. Just a preview of how it looks:")
print('=================================')
print("First 3 samples of X_test:\n",X_test.head(3))
print("\nFirst 3 samples of y_test:\n",y_test.head(3))
print('=================================')

Data is normalized by the use of Min-Max Normalization!

Size of X_train: 6772 	Percentage: 0.8
Size of X_test : 1693 	Percentage: 0.2
Size of total independent_seoulbike(X): 8465

Size of y_train: 6772 	Percentage: 0.8
Size of y_test : 1693 	Percentage: 0.2
Size of total dependent_seoulbike(Y): 8465

The data is split close to how we want it. Close to 80% for Training, and 20% for Testing

Sample data of first 3 rows of TRAINING DATASET. Just a preview of how it looks:
First 3 samples of X_train:
        Seasons  Dew point temperature(°C)  Temperature(°C)  Humidity(%)  \
5564  0.666667                   0.872822         0.857394     0.520408   
1467  1.000000                   0.451220         0.274648     0.846939   
2229  0.333333                   0.547038         0.494718     0.530612   

      Rainfall(mm)  Visibility (10m)  Solar Radiation (MJ/m2)      Hour  \
5564           0.0          1.000000                 0.019886  0.869565   
1467           0.0          0.481500         

# STEP 3: Train Regression Models

### Step 3.1 Linear Regression

In [24]:
print_switch = True
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

# Unbagged Linear Regressor
Linear_Regressor_Predictions = np.abs(get_Linear_Reg_Predictions(X_train, X_test, y_train, False))

# Bagged Linear Regressor
Bagged_Linear_Regressor_Predictions = np.abs(get_Linear_Reg_Predictions(X_train, X_test, y_train, True))

if (print_switch):
    print("Linear_Regressor_Predictions")
    print("Index\tPrediction")
    for i in range (5):
        print(i,"\t",Linear_Regressor_Predictions[i][0])
    print("")
    print("Bagged Linear_Regressor_Predictions")
    print("Index\tPrediction")
    for i in range (5):
        print(i,"\t",Bagged_Linear_Regressor_Predictions[i])

MSE_Linear_Regressor = mean_squared_error(y_true=y_test, y_pred=Linear_Regressor_Predictions)
print("Mean Squared Error of Linear Regressor:",MSE_Linear_Regressor)
Bagged_MSE_Linear_Regressor = mean_squared_error(y_true=y_test, y_pred=Bagged_Linear_Regressor_Predictions)
print("Mean Squared Error of Bagged Linear Regressor:",Bagged_MSE_Linear_Regressor)

Linear_Regressor_Predictions
Index	Prediction
0 	 1289.6846834242515
1 	 616.1775829545924
2 	 1230.2978020600744
3 	 663.8826135140354
4 	 724.3531577223788

Bagged Linear_Regressor_Predictions
Index	Prediction
0 	 1287.4660777725735
1 	 606.8502894906035
2 	 1215.290231807764
3 	 664.7950408909444
4 	 719.616751090122
Mean Squared Error of Linear Regressor: 187350.9506241019
Mean Squared Error of Bagged Linear Regressor: 188696.30419694906


### Step 3.2 Decision Tree Regressor

In [25]:
print_switch = True

# Get Paramters using GridSearchCV - Use this for Decision Tree and Random Forests
max_leaf_nodes_out, max_depth_out = get_tree_params(X_train, X_test, y_train)
if (print_switch):
    print("GridSearchCV Parameter Estimations using Decision Classifier")
    print("This code takes a lot of time, please wait...")
    print("\tmax_leaf_nodes_out:",max_leaf_nodes_out)
    print("\tmax_depth_out:",max_depth_out)
    print("")

# https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html

# Decision Tree Regressor
Decision_Tree_Regressor_Predictions = np.abs(get_Decision_Tree_Predictions(X_train, X_test, y_train, max_leaf_nodes_out, max_depth_out, False))

# Bagged Decision Tree Regressor
Bagged_Decision_Tree_Regressor_Predictions = np.abs(get_Decision_Tree_Predictions(X_train, X_test, y_train, max_leaf_nodes_out, max_depth_out, True))

if (print_switch):
    print("Decision_Tree_Regressor_Predictions")
    print("Index\tPrediction")
    for i in range (5):
        print(i,"\t",Decision_Tree_Regressor_Predictions[i])
    print("")
    print("Bagged Decision_Tree_Regressor_Predictions")
    print("Index\tPrediction")
    for i in range (5):
        print(i,"\t",Bagged_Decision_Tree_Regressor_Predictions[i])
        
MSE_Decision_Tree_Regressor = mean_squared_error(y_true=y_test, y_pred=Decision_Tree_Regressor_Predictions)
print("Mean Squared Error of Decision Tree Regressor:",MSE_Decision_Tree_Regressor)
Bagged_MSE_Decision_Tree_Regressor = mean_squared_error(y_true=y_test, y_pred=Bagged_Decision_Tree_Regressor_Predictions)
print("Mean Squared Error of Bagged Decision Tree Regressor:",Bagged_MSE_Decision_Tree_Regressor)

Fitting 3 folds for each of 9604 candidates, totalling 28812 fits
GridSearchCV Parameter Estimations using Decision Classifier
This code takes a lot of time, please wait...
	max_leaf_nodes_out: 80
	max_depth_out: 10

Decision_Tree_Regressor_Predictions
Index	Prediction
0 	 1428.1785714285713
1 	 255.18199608610567
2 	 1480.4202898550725
3 	 601.8720930232558
4 	 977.1967213114754

Bagged Decision_Tree_Regressor_Predictions
Index	Prediction
0 	 1461.0769604487741
1 	 269.5003038045108
2 	 1691.8685188820546
3 	 635.849009534161
4 	 975.2468103562971
Mean Squared Error of Decision Tree Regressor: 84448.63677685935
Mean Squared Error of Bagged Decision Tree Regressor: 69566.01340604988


### Step 3.3 Random Forests Regressor

In [26]:
print_switch = True
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

# Random Forest Regressor
Random_Forest_Regressor_Predictions = np.abs(get_Random_Forests_Predictions(X_train, X_test, y_train, max_leaf_nodes_out, max_depth_out, False))

# Bagged Random Forest Regressor
Bagged_Random_Forest_Regressor_Predictions = np.abs(get_Random_Forests_Predictions(X_train, X_test, y_train, max_leaf_nodes_out, max_depth_out, True))

if (print_switch):
    print("Using same parameters as DecisionTreeRegressor\n")
    print("Random_Forest_Regressor_Predictions")
    print("Index\tPrediction")
    for i in range (5):
        print(i,"\t",Random_Forest_Regressor_Predictions[i])
    print("")
    print("Bagged Random_Forest_Regressor_Predictions")
    print("Index\tPrediction")
    for i in range (5):
        print(i,"\t",Bagged_Random_Forest_Regressor_Predictions[i])

MSE_Random_Forest_Regressor = mean_squared_error(y_true=y_test, y_pred=Random_Forest_Regressor_Predictions)
print("Mean Squared Error of Random Forest Regressor:",MSE_Random_Forest_Regressor)
Bagged_MSE_Random_Forest_Regressor = mean_squared_error(y_true=y_test, y_pred=Bagged_Random_Forest_Regressor_Predictions)
print("Mean Squared Error of Bagged Random Forest Regressor:",Bagged_MSE_Random_Forest_Regressor)

Using same parameters as DecisionTreeRegressor

Random_Forest_Regressor_Predictions
Index	Prediction
0 	 1396.5332365061302
1 	 263.16010439237004
2 	 1630.5524379275757
3 	 610.5209257124841
4 	 985.0278434191003

Bagged Random_Forest_Regressor_Predictions
Index	Prediction
0 	 1380.1201406328917
1 	 274.29637358431563
2 	 1637.7167116698706
3 	 591.0101322849046
4 	 996.6043666642884
Mean Squared Error of Random Forest Regressor: 68243.9963392076
Mean Squared Error of Bagged Random Forest Regressor: 65897.9893130154


### Step 3.4 Artificial Neural Networks Regressor

In [27]:
print_switch = True
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

# MLP Regressor
MLP_Regressor_Predictions = np.abs(get_ANN_Predictions(10, X_train, X_test, y_train, False))

# Bagged MLP Regressor
Bagged_MLP_Regressor_Predictions = np.abs(get_ANN_Predictions(10, X_train, X_test, y_train, True))

if (print_switch):
    print("MLP_Regressor_Predictions")
    print("Index\tPrediction")
    for i in range (5):
        print(i,"\t",MLP_Regressor_Predictions[i])
    print("")
    print("Bagged MLP_Regressor_Predictions")
    print("Index\tPrediction")
    for i in range (5):
        print(i,"\t",Bagged_MLP_Regressor_Predictions[i])

MSE_MLP_Regressor = mean_squared_error(y_true=y_test, y_pred=MLP_Regressor_Predictions)
print("Mean Squared Error of MLP Regressor:",MSE_MLP_Regressor)
Bagged_MSE_MLP_Regressor = mean_squared_error(y_true=y_test, y_pred=Bagged_MLP_Regressor_Predictions)
print("Mean Squared Error of Bagged MLP Regressor:",Bagged_MSE_MLP_Regressor)

MLP_Regressor_Predictions
Index	Prediction
0 	 1313.0839519667709
1 	 313.28750690177264
2 	 1447.4840384113518
3 	 569.8426484804546
4 	 816.1318674861949

Bagged MLP_Regressor_Predictions
Index	Prediction
0 	 1307.8367983533167
1 	 318.5160700771663
2 	 1444.321541725831
3 	 554.267114627613
4 	 842.3907314695264
Mean Squared Error of MLP Regressor: 161488.62129703877
Mean Squared Error of Bagged MLP Regressor: 160252.64662896984


# STEP 4: Combine Regression Models with Ensemble Learning

In [28]:
print_switch = True

# Bagging/Pasting - Implement in future
# https://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/

print("Regular MSE:")
print("Mean Squared Error of Linear Regressor:",MSE_Linear_Regressor)
print("Mean Squared Error of Decision Tree Regressor:",MSE_Decision_Tree_Regressor)
print("Mean Squared Error of Random Forest Regressor:",MSE_Random_Forest_Regressor)
print("Mean Squared Error of MLP Regressor:",MSE_MLP_Regressor)
print("Bagged MSE:")
print("Mean Squared Error of Bagged Linear Regressor:",Bagged_MSE_Linear_Regressor)
print("Mean Squared Error of Bagged Decision Tree Regressor:",Bagged_MSE_Decision_Tree_Regressor)
print("Mean Squared Error of Bagged Random Forest Regressor:",Bagged_MSE_Random_Forest_Regressor)
print("Mean Squared Error of Bagged MLP Regressor:",Bagged_MSE_MLP_Regressor)
print("\n")
Total_Number_of_Regressors = 4 # useful for calculating averages

# Ensemble Fusion Methods

# Fusion - Average of Regressions - Regular
average_regression = np.zeros(y_test.size)
for i in range (y_test.size):
    lin_r = Linear_Regressor_Predictions[i][0]
    tree_r = Decision_Tree_Regressor_Predictions[i]
    rand_tree_r = Random_Forest_Regressor_Predictions[i]
    mlp_r = MLP_Regressor_Predictions[i]
    
    sum_reg =  (lin_r + tree_r + rand_tree_r + mlp_r)
    average_regression[i] = round(sum_reg/Total_Number_of_Regressors)

# Fusion - Average of Regressions - Bagged
bagged_average_regression = np.zeros(y_test.size)
for i in range (y_test.size):
    lin_r = Bagged_Linear_Regressor_Predictions[i]
    tree_r = Bagged_Decision_Tree_Regressor_Predictions[i]
    rand_tree_r = Bagged_Random_Forest_Regressor_Predictions[i]
    mlp_r = Bagged_MLP_Regressor_Predictions[i]
    
    sum_reg =  (lin_r + tree_r + rand_tree_r + mlp_r)
    bagged_average_regression[i] = round(sum_reg/Total_Number_of_Regressors)

if (print_switch):
    print("Fusion: Average of Predictions")
    print("Index\tPrediction")
    for i in range (5):
        print(i,"\t",average_regression[i])
    print("")
    print("Fusion: Bagged Average of Predictions")
    print("Index\tPrediction")
    for i in range (5):
        print(i,"\t",bagged_average_regression[i])
        
print("")
MSE_average_reg = mean_squared_error(y_true=y_test, y_pred=average_regression)
print("Mean Squared Error of Averaged Regression:",round(MSE_average_reg))
Bagged_MSE_average_reg = mean_squared_error(y_true=y_test, y_pred=bagged_average_regression)
print("Mean Squared Error of Bagged Averaged Regression:", round(Bagged_MSE_average_reg))
print("")

# Accuracy Report
# https://subscription.packtpub.com/book/data/9781789808452/1/ch01lvl1sec12/computing-regression-accuracy
#average_reg_accuracy_score = accuracy_score(y_true=y_test, y_pred=average_regression)
average_reg_accuracy_score = sm.r2_score(y_true=y_test, y_pred=average_regression)
print("Regression Score of Averaged Regression:", round(average_reg_accuracy_score*100, 2),"%")
#bagged_average_reg_accuracy_score = accuracy_score(y_true=y_test, y_pred=bagged_average_regression)
bagged_average_reg_accuracy_score = sm.r2_score(y_true=y_test, y_pred=bagged_average_regression)
print("Regression Score of Bagged Averaged Regression:", round(bagged_average_reg_accuracy_score*100, 2),"%")

Regular MSE:
Mean Squared Error of Linear Regressor: 187350.9506241019
Mean Squared Error of Decision Tree Regressor: 84448.63677685935
Mean Squared Error of Random Forest Regressor: 68243.9963392076
Mean Squared Error of MLP Regressor: 161488.62129703877
Bagged MSE:
Mean Squared Error of Bagged Linear Regressor: 188696.30419694906
Mean Squared Error of Bagged Decision Tree Regressor: 69566.01340604988
Mean Squared Error of Bagged Random Forest Regressor: 65897.9893130154
Mean Squared Error of Bagged MLP Regressor: 160252.64662896984


Fusion: Average of Predictions
Index	Prediction
0 	 1357.0
1 	 362.0
2 	 1447.0
3 	 612.0
4 	 876.0

Fusion: Bagged Average of Predictions
Index	Prediction
0 	 1359.0
1 	 367.0
2 	 1497.0
3 	 611.0
4 	 883.0

Mean Squared Error of Averaged Regression: 95568
Mean Squared Error of Bagged Averaged Regression: 93729

Regression Score of Averaged Regression: 77.01 %
Regression Score of Bagged Averaged Regression: 77.45 %


# STEP 5: Post-Ensemble Analysis + Tweaking

In [32]:
# See how the model performs without Linear Regression. Keep Bagging
print_switch = True

print("Bagged MSE without Linear Regressor:")
print("Mean Squared Error of Bagged Decision Tree Regressor:",Bagged_MSE_Decision_Tree_Regressor)
print("Mean Squared Error of Bagged Random Forest Regressor:",Bagged_MSE_Random_Forest_Regressor)
print("Mean Squared Error of Bagged MLP Regressor:",Bagged_MSE_MLP_Regressor)
print("\n")
Total_Number_of_Regressors = 3 # useful for calculating averages

# Fusion - Average of Regressions - Bagged
bagged_average_regression = np.zeros(y_test.size)
for i in range (y_test.size):
    tree_r = Bagged_Decision_Tree_Regressor_Predictions[i]
    rand_tree_r = Bagged_Random_Forest_Regressor_Predictions[i]
    mlp_r = Bagged_MLP_Regressor_Predictions[i]
    
    sum_reg =  (tree_r + rand_tree_r + mlp_r)
    bagged_average_regression[i] = round(sum_reg/Total_Number_of_Regressors)

if (print_switch):
    print("Fusion: Bagged Average of Predictions")
    print("Index\tPrediction")
    for i in range (5):
        print(i,"\t",bagged_average_regression[i])

print("")
print("Bagged Averaged Regression without Linear Regression:")
Bagged_MSE_average_reg = mean_squared_error(y_true=y_test, y_pred=bagged_average_regression)
print("\tMean Squared Error:",round(Bagged_MSE_average_reg))
bagged_average_reg_accuracy_score = sm.r2_score(y_true=y_test, y_pred=bagged_average_regression)
print("\tRegression Score:", round(bagged_average_reg_accuracy_score*100, 2),"%")

Bagged MSE without Linear Regressor:
Mean Squared Error of Bagged Decision Tree Regressor: 69566.01340604988
Mean Squared Error of Bagged Random Forest Regressor: 65897.9893130154
Mean Squared Error of Bagged MLP Regressor: 160252.64662896984


Fusion: Bagged Average of Predictions
Index	Prediction
0 	 1383.0
1 	 287.0
2 	 1591.0
3 	 594.0
4 	 938.0

Bagged Averaged Regression without Linear Regression:
	Mean Squared Error: 79176
	Regression Score: 80.95 %


# EXTRA: Potential Exploration Code

In [30]:
# Add SARIMAX testing?
# add other extra tests?