In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import statsmodels.api as sm

In [2]:
def meanSquared(y_test, yPredict):
    meanSquared = mean_squared_error(y_test, yPredict)
    print('Mean Squared Result: ', meanSquared)


def meanAbsolute(y_test, yPredict):
    meanAbsolute = mean_absolute_error(y_test, yPredict)
    print('Mean Absolute Result: ', meanAbsolute)

In [3]:
projectDataset = pd.read_excel('ProjectDataset.xlsx')

In [4]:
projectDataset = projectDataset.drop(labels='SampleNo',axis = 1)

In [5]:
allXValues = projectDataset[["x1","x2","x3","x4","x5","x6"]].values
yValues = projectDataset["Y"].values
xValues = allXValues[:99,:]
yValues = yValues[:99]
last20Values = allXValues[100:,:]

In [6]:
x_train,x_test,y_train,y_test = train_test_split(xValues,yValues,test_size = 0.2,random_state = 0)

In [7]:
linearRegression = LinearRegression()
linearRegressionPredict = linearRegression.fit(x_train,y_train).predict(x_test)

meanSquared(y_test,linearRegressionPredict)
meanAbsolute(y_test,linearRegressionPredict)

Mean Squared Result:  3762761.2644798397
Mean Absolute Result:  1598.690644938068


In [8]:
linearRegressionModel = sm.OLS(linearRegressionPredict,x_test)
print(linearRegressionModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.907
Model:                            OLS   Adj. R-squared (uncentered):              0.875
Method:                 Least Squares   F-statistic:                              29.12
Date:                Wed, 23 Jun 2021   Prob (F-statistic):                    3.24e-07
Time:                        18:06:18   Log-Likelihood:                         -156.16
No. Observations:                  20   AIC:                                      322.3
Df Residuals:                      15   BIC:                                      327.3
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [9]:
gradientBoosted = XGBClassifier()
gradientBoostedPredict = gradientBoosted.fit(x_train,y_train).predict(x_test)

meanSquared(y_test,gradientBoostedPredict)
meanAbsolute(y_test,gradientBoostedPredict)





Mean Squared Result:  462689.43220064987
Mean Absolute Result:  314.32104999999996


In [10]:
gradientBoostedModel = sm.OLS(gradientBoostedPredict,x_test)
print(gradientBoostedModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.397
Model:                            OLS   Adj. R-squared (uncentered):              0.196
Method:                 Least Squares   F-statistic:                              1.974
Date:                Wed, 23 Jun 2021   Prob (F-statistic):                       0.141
Time:                        18:06:19   Log-Likelihood:                         -135.34
No. Observations:                  20   AIC:                                      280.7
Df Residuals:                      15   BIC:                                      285.7
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [11]:
decisionTree = DecisionTreeRegressor(random_state=0)
decisionTreePredict = decisionTree.fit(x_train,y_train).predict(x_test)

meanSquared(y_test,decisionTreePredict)
meanAbsolute(y_test,decisionTreePredict)

Mean Squared Result:  702268.992579
Mean Absolute Result:  520.7939


In [12]:
decisionTreeModel = sm.OLS(decisionTreePredict,x_test)
print(decisionTreeModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.865
Model:                            OLS   Adj. R-squared (uncentered):              0.820
Method:                 Least Squares   F-statistic:                              19.20
Date:                Wed, 23 Jun 2021   Prob (F-statistic):                    4.87e-06
Time:                        18:06:19   Log-Likelihood:                         -147.44
No. Observations:                  20   AIC:                                      304.9
Df Residuals:                      15   BIC:                                      309.9
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [13]:
randomForest = RandomForestRegressor(n_estimators=20,random_state=0)

randomForestPredict = randomForest.fit(x_train,y_train).predict(x_test)

meanSquared(y_test,randomForestPredict)
meanAbsolute(y_test,randomForestPredict)

Mean Squared Result:  747506.1538929332
Mean Absolute Result:  602.4220274999999


In [14]:
randomForestModel = sm.OLS(randomForestPredict,x_test)
print(randomForestModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.862
Model:                            OLS   Adj. R-squared (uncentered):              0.816
Method:                 Least Squares   F-statistic:                              18.71
Date:                Wed, 23 Jun 2021   Prob (F-statistic):                    5.73e-06
Time:                        18:06:19   Log-Likelihood:                         -148.39
No. Observations:                  20   AIC:                                      306.8
Df Residuals:                      15   BIC:                                      311.8
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [15]:
kNeighbours = KNeighborsRegressor()

kNeighboursPredict = kNeighbours.fit(x_train,y_train).predict(x_test)

meanSquared(y_test,kNeighboursPredict)
meanAbsolute(y_test,kNeighboursPredict)

Mean Squared Result:  1932350.24059084
Mean Absolute Result:  1153.9941000000001


In [16]:
kNeighboursModel = sm.OLS(kNeighboursPredict,x_test)
print(kNeighboursModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.857
Model:                            OLS   Adj. R-squared (uncentered):              0.810
Method:                 Least Squares   F-statistic:                              18.02
Date:                Wed, 23 Jun 2021   Prob (F-statistic):                    7.25e-06
Time:                        18:06:20   Log-Likelihood:                         -156.51
No. Observations:                  20   AIC:                                      323.0
Df Residuals:                      15   BIC:                                      328.0
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [17]:
modelResult = dict({"linearRegression":linearRegressionModel.fit().rsquared,"gradientBoosted":gradientBoostedModel.fit().rsquared,
                  "decisionTree":decisionTreeModel.fit().rsquared,"randomForest":randomForestModel.fit().rsquared,"kNeighbours":kNeighboursModel.fit().rsquared})

In [18]:
def prediction(modelResult):
    modelResult1 = sorted(modelResult.values())
    sortedDict = {}

    for i in modelResult1:
        for k in modelResult.keys():
            if modelResult[k] == i:
                sortedDict[k] = modelResult[k]
                break

    print("Sorted Values \n",sortedDict)

    predictModel = max(sortedDict,key = sortedDict.get)

    if predictModel == 'linearRegression':
        result = linearRegression.predict(last20Values)
        print("Linear Regression Predict \n",linearRegression.predict(last20Values))
    elif predictModel == 'gradientBoosted':
        result = gradientBoosted.predict(last20Values)
        print("Gradient Boosted Predict \n",gradientBoosted.predict(last20Values))
    elif predictModel == 'decisionTree':
        result = decisionTree.predict(last20Values)
        print("Decision Tree Predict \n",decisionTree.predict(last20Values))
    elif predictModel == 'randomForest':
        result = randomForest.predict(last20Values)
        print("Random Forest Predict \n",randomForest.predict(last20Values))
    elif predictModel == 'kNeighbours':
        result = kNeighbours.predict(last20Values)
        print("K-Neighbours Predict \n",kNeighbours.predict(last20Values))
    
    excelResult = pd.DataFrame(data=result)
    writer = pd.ExcelWriter("PredictResult.xlsx")
    excelResult.to_excel(writer)
    writer.save()