In [1]:
#Import the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import statsmodels.api as sm

In [2]:
# meanSquared is a metric that to reach machine learning models result. 
# Mean Squared Error: It is a result like (actual value - predict value)^2
def meanSquared(y_test, yPredict):
    meanSquared = mean_squared_error(y_test, yPredict)
    print('Mean Squared Result: ', meanSquared)

# meanAbsolute is a metric that to reach machine learning models result.
# Mean Absolute Error: It is a result like actual value - predict value
def meanAbsolute(y_test, yPredict):
    meanAbsolute = mean_absolute_error(y_test, yPredict)
    print('Mean Absolute Result: ', meanAbsolute)

In [43]:
# Invoking dataset with pandas library in here
projectDataset = pd.read_excel('ProjectDataset.xlsx')

In [44]:
projectDataset.head()

Unnamed: 0,SampleNo,x1,x2,x3,x4,x5,x6,Y
0,1,19,30,9,24,5,2,1415.924
1,2,33,50,6,53,20,63,-40.44
2,3,22,49,0,16,-6,53,42.548
3,4,38,6,22,50,12,70,-23.64
4,5,11,40,28,-9,-20,98,1233.837


In [5]:
# SampleNo is an index of the dataset's rows so we can drop it.
projectDataset = projectDataset.drop(labels='SampleNo',axis = 1)

In [6]:
# x1,x2,x3,x4,x5,x6 is a features and Y is a label in this algorithm so we can split it this values for model.
allXValues = projectDataset[['x1','x2','x3','x4','x5','x6']].values
yValues = projectDataset['Y'].values
xValues = allXValues[:99,:]
yValues = yValues[:99]

In [7]:
# train_test_split is a function that split our features and labels, in here we split datas as %20 test and %80 train.
x_train,x_test,y_train,y_test = train_test_split(xValues,yValues,test_size = 0.2,random_state = 0)

In [8]:
# OLS is a function for checking model's fitting.
# In here we can say that our x4 values's P values is 0.818 so the rule is if variable's P-value is higher than 0.05 than we
# should remove it and fit the model again.
model = sm.OLS(y_train,x_train).fit()
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.755
Model:                            OLS   Adj. R-squared (uncentered):              0.739
Method:                 Least Squares   F-statistic:                              45.69
Date:                Sun, 29 May 2022   Prob (F-statistic):                    2.83e-21
Time:                        22:56:24   Log-Likelihood:                         -713.06
No. Observations:                  79   AIC:                                      1436.
Df Residuals:                      74   BIC:                                      1448.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [9]:
# For instance over there, we use "xValues[:,[0,1,2,4,5]]" for drop x4 values and resplit values after that, we can fit
# the model again
# As you can see in the new result x5 value's P-value is equal to 0.149 so we can remove this column again and re-run the model.
XValues = xValues[:,[0,1,2,4,5]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

model = sm.OLS(y_train,x_train).fit()
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.755
Model:                            OLS   Adj. R-squared (uncentered):              0.739
Method:                 Least Squares   F-statistic:                              45.69
Date:                Sun, 29 May 2022   Prob (F-statistic):                    2.83e-21
Time:                        22:56:24   Log-Likelihood:                         -713.06
No. Observations:                  79   AIC:                                      1436.
Df Residuals:                      74   BIC:                                      1448.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [10]:
# In this new model result our all P-value are 0 and our model's R-squared is 0.75 so we can say that our model is working good.
XValues = xValues[:,[0,1,2,4]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

model = sm.OLS(y_train,x_train).fit()
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.748
Model:                            OLS   Adj. R-squared (uncentered):              0.735
Method:                 Least Squares   F-statistic:                              55.74
Date:                Sun, 29 May 2022   Prob (F-statistic):                    9.94e-22
Time:                        22:56:24   Log-Likelihood:                         -714.18
No. Observations:                  79   AIC:                                      1436.
Df Residuals:                      75   BIC:                                      1446.
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [11]:
# Let us use another machine learning tool which is Linear Regression, in fact linear regression really popular regression model
# in supervised machine learning area.
linearRegression = LinearRegression()
linearRegressionPredict = linearRegression.fit(x_train,y_train).predict(x_test)

In [12]:
# In our model's P-value are looking good and our R-sqaured value is approximately 0.84 so this model is predicting better than
# before one.
linearRegressionModel = sm.OLS(linearRegressionPredict,x_test)
print(linearRegressionModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.839
Model:                            OLS   Adj. R-squared (uncentered):              0.799
Method:                 Least Squares   F-statistic:                              20.92
Date:                Sun, 29 May 2022   Prob (F-statistic):                    3.40e-06
Time:                        22:56:24   Log-Likelihood:                         -161.54
No. Observations:                  20   AIC:                                      331.1
Df Residuals:                      16   BIC:                                      335.1
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [13]:
meanSquared(y_test,linearRegressionPredict)
meanAbsolute(y_test,linearRegressionPredict)

Mean Squared Result:  3727028.1165636294
Mean Absolute Result:  1591.6301943835106


In [14]:
""" Another supervised learning algorithm is gradient boosting, gradient boosting is one of the most popular machine learning 
algorithms for tabular datasets. It is powerful enough to find any nonlinear relationship between your model target and
features and has great usability that can deal with missing values, outliers, and high cardinality categorical values
on your features without any special treatment."""
x_train,x_test,y_train,y_test = train_test_split(xValues,yValues,test_size = 0.2,random_state = 0)

gradientBoosted = XGBClassifier()
gradientBoostedPredict = gradientBoosted.fit(x_train,y_train).predict(x_test)





In [15]:
gradientBoostedModel = sm.OLS(gradientBoostedPredict,x_test)
print(gradientBoostedModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.397
Model:                            OLS   Adj. R-squared (uncentered):              0.196
Method:                 Least Squares   F-statistic:                              1.974
Date:                Sun, 29 May 2022   Prob (F-statistic):                       0.141
Time:                        22:56:26   Log-Likelihood:                         -135.34
No. Observations:                  20   AIC:                                      280.7
Df Residuals:                      15   BIC:                                      285.7
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [16]:
XValues = xValues[:,[1,2,3,4,5]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

gradientBoosted = XGBClassifier()
gradientBoostedPredict = gradientBoosted.fit(x_train,y_train).predict(x_test)

model = sm.OLS(y_train,x_train).fit()
print(model.summary())





                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.755
Model:                            OLS   Adj. R-squared (uncentered):              0.739
Method:                 Least Squares   F-statistic:                              45.69
Date:                Sun, 29 May 2022   Prob (F-statistic):                    2.83e-21
Time:                        22:56:27   Log-Likelihood:                         -713.06
No. Observations:                  79   AIC:                                      1436.
Df Residuals:                      74   BIC:                                      1448.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [17]:
# In here when we drop some columns and rerun it, we can reach the 0.75 R-squared result in our algorithm.
XValues = xValues[:,[1,2,3,4]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

gradientBoosted = XGBClassifier()
gradientBoostedPredict = gradientBoosted.fit(x_train,y_train).predict(x_test)

model = sm.OLS(y_train,x_train).fit()
print(model.summary())





                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.748
Model:                            OLS   Adj. R-squared (uncentered):              0.735
Method:                 Least Squares   F-statistic:                              55.74
Date:                Sun, 29 May 2022   Prob (F-statistic):                    9.94e-22
Time:                        22:56:27   Log-Likelihood:                         -714.18
No. Observations:                  79   AIC:                                      1436.
Df Residuals:                      75   BIC:                                      1446.
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [18]:
meanSquared(y_test,gradientBoostedPredict)
meanAbsolute(y_test,gradientBoostedPredict)

Mean Squared Result:  564797.34881235
Mean Absolute Result:  399.00464999999997


In [19]:
# Decision tree based on working with binary trees, this is really good and easy to understand when we use regression algorithms
x_train,x_test,y_train,y_test = train_test_split(xValues,yValues,test_size = 0.2,random_state = 0)

decisionTree = DecisionTreeRegressor(random_state=0)
decisionTreePredict = decisionTree.fit(x_train,y_train).predict(x_test)

In [20]:
decisionTreeModel = sm.OLS(decisionTreePredict,x_test)
print(decisionTreeModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.865
Model:                            OLS   Adj. R-squared (uncentered):              0.820
Method:                 Least Squares   F-statistic:                              19.20
Date:                Sun, 29 May 2022   Prob (F-statistic):                    4.87e-06
Time:                        22:56:27   Log-Likelihood:                         -147.44
No. Observations:                  20   AIC:                                      304.9
Df Residuals:                      15   BIC:                                      309.9
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [21]:
XValues = xValues[:,[0,1,2,4,5]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

decisionTree = DecisionTreeRegressor(random_state=0)
decisionTreePredict = decisionTree.fit(x_train,y_train).predict(x_test)

decisionTreeModel = sm.OLS(decisionTreePredict,x_test)
print(decisionTreeModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.639
Model:                            OLS   Adj. R-squared (uncentered):              0.519
Method:                 Least Squares   F-statistic:                              5.309
Date:                Sun, 29 May 2022   Prob (F-statistic):                     0.00526
Time:                        22:56:27   Log-Likelihood:                         -156.64
No. Observations:                  20   AIC:                                      323.3
Df Residuals:                      15   BIC:                                      328.3
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [22]:
XValues = xValues[:,[0,2,4,5]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

decisionTree = DecisionTreeRegressor(random_state=0)
decisionTreePredict = decisionTree.fit(x_train,y_train).predict(x_test)

decisionTreeModel = sm.OLS(decisionTreePredict,x_test)
print(decisionTreeModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.505
Model:                            OLS   Adj. R-squared (uncentered):              0.382
Method:                 Least Squares   F-statistic:                              4.087
Date:                Sun, 29 May 2022   Prob (F-statistic):                      0.0181
Time:                        22:56:27   Log-Likelihood:                         -152.20
No. Observations:                  20   AIC:                                      312.4
Df Residuals:                      16   BIC:                                      316.4
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [23]:
XValues = xValues[:,[0,2,4]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

decisionTree = DecisionTreeRegressor(random_state=0)
decisionTreePredict = decisionTree.fit(x_train,y_train).predict(x_test)

decisionTreeModel = sm.OLS(decisionTreePredict,x_test)
print(decisionTreeModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.416
Model:                            OLS   Adj. R-squared (uncentered):              0.312
Method:                 Least Squares   F-statistic:                              4.029
Date:                Sun, 29 May 2022   Prob (F-statistic):                      0.0246
Time:                        22:56:27   Log-Likelihood:                         -152.13
No. Observations:                  20   AIC:                                      310.3
Df Residuals:                      17   BIC:                                      313.2
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [24]:
XValues = xValues[:,[0,4]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

decisionTree = DecisionTreeRegressor(random_state=0)
decisionTreePredict = decisionTree.fit(x_train,y_train).predict(x_test)

decisionTreeModel = sm.OLS(decisionTreePredict,x_test)
print(decisionTreeModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.638
Model:                            OLS   Adj. R-squared (uncentered):              0.597
Method:                 Least Squares   F-statistic:                              15.84
Date:                Sun, 29 May 2022   Prob (F-statistic):                    0.000107
Time:                        22:56:27   Log-Likelihood:                         -178.83
No. Observations:                  20   AIC:                                      361.7
Df Residuals:                      18   BIC:                                      363.6
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [25]:
# Here is only value for fitting decision tree algorithm and we have 0.76 model score about R-squared.
XValues = xValues[:,[0]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

decisionTree = DecisionTreeRegressor(random_state=0)
decisionTreePredict = decisionTree.fit(x_train,y_train).predict(x_test)

decisionTreeModel = sm.OLS(decisionTreePredict,x_test)
print(decisionTreeModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.764
Model:                            OLS   Adj. R-squared (uncentered):              0.752
Method:                 Least Squares   F-statistic:                              61.53
Date:                Sun, 29 May 2022   Prob (F-statistic):                    2.24e-07
Time:                        22:56:27   Log-Likelihood:                         -175.72
No. Observations:                  20   AIC:                                      353.4
Df Residuals:                      19   BIC:                                      354.4
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [26]:
meanSquared(y_test,decisionTreePredict)
meanAbsolute(y_test,decisionTreePredict)

Mean Squared Result:  10025627.023127582
Mean Absolute Result:  2033.4746125000001


In [27]:
# Before the last supervised learning algorithm is Random Forest algorithm, actually this algorithm basically using decision tree 
# structure and it is eager to use on tabular datasets.
x_train,x_test,y_train,y_test = train_test_split(xValues,yValues,test_size = 0.2,random_state = 0)

randomForest = RandomForestRegressor(n_estimators=20,random_state=0)

randomForestPredict = randomForest.fit(x_train,y_train).predict(x_test)

In [28]:
randomForestModel = sm.OLS(randomForestPredict,x_test)
print(randomForestModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.862
Model:                            OLS   Adj. R-squared (uncentered):              0.816
Method:                 Least Squares   F-statistic:                              18.71
Date:                Sun, 29 May 2022   Prob (F-statistic):                    5.73e-06
Time:                        22:56:27   Log-Likelihood:                         -148.39
No. Observations:                  20   AIC:                                      306.8
Df Residuals:                      15   BIC:                                      311.8
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [29]:
XValues = xValues[:,[0,1,2,3,4]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

randomForest = RandomForestRegressor(n_estimators=20,random_state=0)

randomForestPredict = randomForest.fit(x_train,y_train).predict(x_test)

randomForestModel = sm.OLS(randomForestPredict,x_test)
print(randomForestModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.862
Model:                            OLS   Adj. R-squared (uncentered):              0.827
Method:                 Least Squares   F-statistic:                              24.95
Date:                Sun, 29 May 2022   Prob (F-statistic):                    1.05e-06
Time:                        22:56:27   Log-Likelihood:                         -149.50
No. Observations:                  20   AIC:                                      307.0
Df Residuals:                      16   BIC:                                      311.0
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [30]:
XValues = xValues[:,[0,2,3,4]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

randomForest = RandomForestRegressor(n_estimators=20,random_state=0)

randomForestPredict = randomForest.fit(x_train,y_train).predict(x_test)

randomForestModel = sm.OLS(randomForestPredict,x_test)
print(randomForestModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.833
Model:                            OLS   Adj. R-squared (uncentered):              0.803
Method:                 Least Squares   F-statistic:                              28.17
Date:                Sun, 29 May 2022   Prob (F-statistic):                    8.02e-07
Time:                        22:56:28   Log-Likelihood:                         -149.98
No. Observations:                  20   AIC:                                      306.0
Df Residuals:                      17   BIC:                                      308.9
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [31]:
# In here we have kind a good model result which is 0.81 R-squared.
XValues = xValues[:,[0,2,4]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

randomForest = RandomForestRegressor(n_estimators=20,random_state=0)

randomForestPredict = randomForest.fit(x_train,y_train).predict(x_test)

randomForestModel = sm.OLS(randomForestPredict,x_test)
print(randomForestModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.811
Model:                            OLS   Adj. R-squared (uncentered):              0.777
Method:                 Least Squares   F-statistic:                              24.24
Date:                Sun, 29 May 2022   Prob (F-statistic):                    2.26e-06
Time:                        22:56:28   Log-Likelihood:                         -147.54
No. Observations:                  20   AIC:                                      301.1
Df Residuals:                      17   BIC:                                      304.1
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [32]:
meanSquared(y_test,randomForestPredict)
meanAbsolute(y_test,randomForestPredict)

Mean Squared Result:  459457.1445903957
Mean Absolute Result:  444.1186424999999


In [33]:
# This algorithm is the last machine learning algorithm in that project which is, K-NN algorithm stores all 
# the available data and classifies a new data point based on the similarity. This means when new data appears then it can
# be easily classified into a well suite category by using K- NN algorithm.
x_train,x_test,y_train,y_test = train_test_split(xValues,yValues,test_size = 0.2,random_state = 0)

kNeighbours = KNeighborsRegressor()

kNeighboursPredict = kNeighbours.fit(x_train,y_train).predict(x_test)


In [34]:
kNeighboursModel = sm.OLS(kNeighboursPredict,x_test)
print(kNeighboursModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.857
Model:                            OLS   Adj. R-squared (uncentered):              0.810
Method:                 Least Squares   F-statistic:                              18.02
Date:                Sun, 29 May 2022   Prob (F-statistic):                    7.25e-06
Time:                        22:56:28   Log-Likelihood:                         -156.51
No. Observations:                  20   AIC:                                      323.0
Df Residuals:                      15   BIC:                                      328.0
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [35]:
XValues = xValues[:,[0,1,2,3,4]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

kNeighbours = KNeighborsRegressor()

kNeighboursPredict = kNeighbours.fit(x_train,y_train).predict(x_test)

kNeighboursModel = sm.OLS(kNeighboursPredict,x_test)
print(kNeighboursModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.871
Model:                            OLS   Adj. R-squared (uncentered):              0.838
Method:                 Least Squares   F-statistic:                              26.92
Date:                Sun, 29 May 2022   Prob (F-statistic):                    6.25e-07
Time:                        22:56:28   Log-Likelihood:                         -155.61
No. Observations:                  20   AIC:                                      319.2
Df Residuals:                      16   BIC:                                      323.2
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [36]:
XValues = xValues[:,[0,1,2,4]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

kNeighbours = KNeighborsRegressor()

kNeighboursPredict = kNeighbours.fit(x_train,y_train).predict(x_test)

kNeighboursModel = sm.OLS(kNeighboursPredict,x_test)
print(kNeighboursModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.885
Model:                            OLS   Adj. R-squared (uncentered):              0.856
Method:                 Least Squares   F-statistic:                              30.65
Date:                Sun, 29 May 2022   Prob (F-statistic):                    2.55e-07
Time:                        22:56:28   Log-Likelihood:                         -151.29
No. Observations:                  20   AIC:                                      310.6
Df Residuals:                      16   BIC:                                      314.6
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [37]:
# In here we have 3 column variables and our model R-squared value is 0.89 so it is the best algorithm for this dataset.
XValues = xValues[:,[0,2,4]]

x_train,x_test,y_train,y_test = train_test_split(XValues,yValues,test_size = 0.2,random_state = 0)

kNeighbours = KNeighborsRegressor()

kNeighboursPredict = kNeighbours.fit(x_train,y_train).predict(x_test)

kNeighboursModel = sm.OLS(kNeighboursPredict,x_test)
print(kNeighboursModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.888
Model:                            OLS   Adj. R-squared (uncentered):              0.868
Method:                 Least Squares   F-statistic:                              44.81
Date:                Sun, 29 May 2022   Prob (F-statistic):                    2.75e-08
Time:                        22:56:28   Log-Likelihood:                         -146.82
No. Observations:                  20   AIC:                                      299.6
Df Residuals:                      17   BIC:                                      302.6
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [38]:
meanSquared(y_test,kNeighboursPredict)
meanAbsolute(y_test,kNeighboursPredict)

Mean Squared Result:  799968.398061452
Mean Absolute Result:  672.3439799999999


In [39]:
modelResult = dict({"linearRegression":linearRegressionModel.fit().rsquared,"gradientBoosted":gradientBoostedModel.fit().rsquared,
                  "decisionTree":decisionTreeModel.fit().rsquared,"randomForest":randomForestModel.fit().rsquared,"kNeighbours":kNeighboursModel.fit().rsquared})

In [40]:
def prediction(modelResult):
    modelResult1 = sorted(modelResult.values())
    sortedDict = {}

    for i in modelResult1:
        for k in modelResult.keys():
            if modelResult[k] == i:
                sortedDict[k] = modelResult[k]
                break

    print("Sorted Values \n",sortedDict)

    predictModel = max(sortedDict,key = sortedDict.get)

    if predictModel == 'linearRegression':
        last20Values = allXValues[100:,[0,1,2,4]]
        result = linearRegression.predict(last20Values)
        print("Linear Regression Predict \n",linearRegression.predict(last20Values))
    elif predictModel == 'gradientBoosted':
        last20Values = allXValues[100:,[1,2,3,4]]
        result = gradientBoosted.predict(last20Values)
        print("Gradient Boosted Predict \n",gradientBoosted.predict(last20Values))
    elif predictModel == 'decisionTree':
        last20Values = allXValues[100:,[0]]
        result = decisionTree.predict(last20Values)
        print("Decision Tree Predict \n",decisionTree.predict(last20Values))
    elif predictModel == 'randomForest':
        last20Values = allXValues[100:,[0,2,4]]
        result = randomForest.predict(last20Values)
        print("Random Forest Predict \n",randomForest.predict(last20Values))
    elif predictModel == 'kNeighbours':
        last20Values = allXValues[100:,[0,2,4]]
        result = kNeighbours.predict(last20Values)
        print("K-Neighbours Predict \n",kNeighbours.predict(last20Values))
    
    excelResult = pd.DataFrame(data=result)
    writer = pd.ExcelWriter("PredictResult.xlsx")
    excelResult.to_excel(writer)
    writer.save()

In [41]:
prediction(modelResult)

Sorted Values 
 {'gradientBoosted': 0.39680689826025606, 'decisionTree': 0.764069512347632, 'randomForest': 0.8105345006015943, 'linearRegression': 0.8394995265227613, 'kNeighbours': 0.8877448395092622}
K-Neighbours Predict 
 [3739.1486 1871.3952  226.8818 2337.689   252.6452  482.6996 1562.8828
 1936.4234 2722.7184  558.678  6550.9192 4797.6016 1224.936  4736.6352
 7363.4032 1352.9968  616.1352 1587.5856 2094.9708  -26.2732]
