## Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import statsmodels.api as sm

## Definitions

In [2]:
def meanSquared(y_test, yPredict):
    meanSquared = mean_squared_error(y_test, yPredict)
    print('Mean Squared Result: ', meanSquared)


def meanAbsolute(y_test, yPredict):
    meanAbsolute = mean_absolute_error(y_test, yPredict)
    print('Mean Absolute Result: ', meanAbsolute)

## Project importing

In my dataset I have 120 rows and 8 columns, in the columns, I have 6 features(x1,x2,x3,x4,x5,x6) for training and testing and have 1 label which is Y and that one is for the result.
But the thing that I have only 100 Y result in my dataset as you see in the 5. cell, that is why I have to predict the last 20 Y values with the best machine learning algorithm.

In [3]:
projectDataset = pd.read_excel('ProjectDataset.xlsx')

In [4]:
projectDataset

Unnamed: 0,SampleNo,x1,x2,x3,x4,x5,x6,Y
0,1,19,30,9,24,5,2,1415.924
1,2,33,50,6,53,20,63,-40.440
2,3,22,49,0,16,-6,53,42.548
3,4,38,6,22,50,12,70,-23.640
4,5,11,40,28,-9,-20,98,1233.837
...,...,...,...,...,...,...,...,...
115,116,16,27,4,13,-3,59,
116,117,28,43,3,32,4,47,
117,118,24,32,7,17,-7,35,
118,119,14,7,15,0,-14,33,


In [5]:
projectDataset.tail(20)

Unnamed: 0,SampleNo,x1,x2,x3,x4,x5,x6,Y
100,101,31,27,16,13,-18,34,
101,102,39,8,1,23,-16,45,
102,103,3,10,31,14,11,62,
103,104,15,47,30,24,9,30,
104,105,0,9,21,5,5,71,
105,106,8,5,7,19,11,12,
106,107,28,32,24,43,15,92,
107,108,12,31,8,-7,-19,81,
108,109,20,27,28,31,11,71,
109,110,1,11,24,3,2,12,


## Preprocessing

In here I wanna drop SampleNo column because that column is storing only the index of the rows so it is not important for my goal.

In [6]:
projectDataset = projectDataset.drop(labels='SampleNo',axis = 1)

In [7]:
allXValues = projectDataset[["x1","x2","x3","x4","x5","x6"]].values
yValues = projectDataset["Y"].values
xValues = allXValues[:99,:]
yValues = yValues[:99]
last20Values = allXValues[100:,:]

## Train and test split

So I have declared my datas as xValues and yValues, now on I have to split the datas as x_train,x_test,y_train and y_test

In [8]:
x_train,x_test,y_train,y_test = train_test_split(xValues,yValues,test_size = 0.2,random_state = 0)

## First machine learning algorithm which is Linear Regression

In [9]:
linearRegression = LinearRegression()
linearRegressionPredict = linearRegression.fit(x_train,y_train).predict(x_test)

meanSquared(y_test,linearRegressionPredict)
meanAbsolute(y_test,linearRegressionPredict)

Mean Squared Result:  3762761.264479832
Mean Absolute Result:  1598.690644938067


## Linear Regression OLS(Ordinary Least Squares) result

In [10]:
linearRegressionModel = sm.OLS(linearRegressionPredict,x_test)
print(linearRegressionModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.907
Model:                            OLS   Adj. R-squared (uncentered):              0.875
Method:                 Least Squares   F-statistic:                              29.12
Date:                Mon, 28 Feb 2022   Prob (F-statistic):                    3.24e-07
Time:                        23:22:31   Log-Likelihood:                         -156.16
No. Observations:                  20   AIC:                                      322.3
Df Residuals:                      15   BIC:                                      327.3
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## Second machine learning algorithm

In [11]:
gradientBoosted = XGBClassifier()
gradientBoostedPredict = gradientBoosted.fit(x_train,y_train).predict(x_test)

meanSquared(y_test,gradientBoostedPredict)
meanAbsolute(y_test,gradientBoostedPredict)





Mean Squared Result:  462689.43220064987
Mean Absolute Result:  314.32104999999996


## Gradient Boosted OLS result

In [12]:
gradientBoostedModel = sm.OLS(gradientBoostedPredict,x_test)
print(gradientBoostedModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.397
Model:                            OLS   Adj. R-squared (uncentered):              0.196
Method:                 Least Squares   F-statistic:                              1.974
Date:                Mon, 28 Feb 2022   Prob (F-statistic):                       0.141
Time:                        23:22:32   Log-Likelihood:                         -135.34
No. Observations:                  20   AIC:                                      280.7
Df Residuals:                      15   BIC:                                      285.7
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## Third machine learning algorithm

In [13]:
decisionTree = DecisionTreeRegressor(random_state=0)
decisionTreePredict = decisionTree.fit(x_train,y_train).predict(x_test)

meanSquared(y_test,decisionTreePredict)
meanAbsolute(y_test,decisionTreePredict)

Mean Squared Result:  702268.992579
Mean Absolute Result:  520.7939


## Decision Tree OLS result

In [14]:
decisionTreeModel = sm.OLS(decisionTreePredict,x_test)
print(decisionTreeModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.865
Model:                            OLS   Adj. R-squared (uncentered):              0.820
Method:                 Least Squares   F-statistic:                              19.20
Date:                Mon, 28 Feb 2022   Prob (F-statistic):                    4.87e-06
Time:                        23:22:32   Log-Likelihood:                         -147.44
No. Observations:                  20   AIC:                                      304.9
Df Residuals:                      15   BIC:                                      309.9
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## Fourth machine learning algorithm

In [15]:
randomForest = RandomForestRegressor(n_estimators=20,random_state=0)

randomForestPredict = randomForest.fit(x_train,y_train).predict(x_test)

meanSquared(y_test,randomForestPredict)
meanAbsolute(y_test,randomForestPredict)

Mean Squared Result:  747506.1538929332
Mean Absolute Result:  602.4220274999999


## Random Forest OLS result

In [16]:
randomForestModel = sm.OLS(randomForestPredict,x_test)
print(randomForestModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.862
Model:                            OLS   Adj. R-squared (uncentered):              0.816
Method:                 Least Squares   F-statistic:                              18.71
Date:                Mon, 28 Feb 2022   Prob (F-statistic):                    5.73e-06
Time:                        23:22:32   Log-Likelihood:                         -148.39
No. Observations:                  20   AIC:                                      306.8
Df Residuals:                      15   BIC:                                      311.8
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## Fifth and the last machine learning algorithm

In [17]:
kNeighbors = KNeighborsRegressor()

kNeighborsPredict = kNeighbors.fit(x_train,y_train).predict(x_test)

meanSquared(y_test,kNeighborsPredict)
meanAbsolute(y_test,kNeighborsPredict)

Mean Squared Result:  1932350.24059084
Mean Absolute Result:  1153.9941000000001


## K-Nearest Neighbors OLS result

In [18]:
kNeighborsModel = sm.OLS(kNeighborsPredict,x_test)
print(kNeighborsModel.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.857
Model:                            OLS   Adj. R-squared (uncentered):              0.810
Method:                 Least Squares   F-statistic:                              18.02
Date:                Mon, 28 Feb 2022   Prob (F-statistic):                    7.25e-06
Time:                        23:22:32   Log-Likelihood:                         -156.51
No. Observations:                  20   AIC:                                      323.0
Df Residuals:                      15   BIC:                                      328.0
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## Dictionary name for the algorithms

In [19]:
modelResult = dict({"linearRegression":linearRegressionModel.fit().rsquared,"gradientBoosted":gradientBoostedModel.fit().rsquared,
                  "decisionTree":decisionTreeModel.fit().rsquared,"randomForest":randomForestModel.fit().rsquared,"kNeighbors":kNeighborsModel.fit().rsquared})

## Definition for Best prediction

In here I am using definition for reaching the highest rsquared values of the algorithms and then I am using the predict method for predict the last 20 values in my dataset and then save it to the excel.

In [20]:
def prediction(modelResult):
    modelResult1 = sorted(modelResult.values())
    sortedDict = {}

    for i in modelResult1:
        for k in modelResult.keys():
            if modelResult[k] == i:
                sortedDict[k] = modelResult[k]
                break

    print("Sorted Values \n",sortedDict)

    predictModel = max(sortedDict,key = sortedDict.get)

    if predictModel == 'linearRegression':
        result = linearRegression.predict(last20Values)
        print("Linear Regression Predict \n",linearRegression.predict(last20Values))
    elif predictModel == 'gradientBoosted':
        result = gradientBoosted.predict(last20Values)
        print("Gradient Boosted Predict \n",gradientBoosted.predict(last20Values))
    elif predictModel == 'decisionTree':
        result = decisionTree.predict(last20Values)
        print("Decision Tree Predict \n",decisionTree.predict(last20Values))
    elif predictModel == 'randomForest':
        result = randomForest.predict(last20Values)
        print("Random Forest Predict \n",randomForest.predict(last20Values))
    elif predictModel == 'kNeighbors':
        result = kNeighbors.predict(last20Values)
        print("K-Neighbors Predict \n",kNeighbors.predict(last20Values))
    
    excelResult = pd.DataFrame(data=result)
    writer = pd.ExcelWriter("PredictResult.xlsx")
    excelResult.to_excel(writer)
    writer.save()