   <h1 align="center"> <i>Restaurant Revenue Prediction</i> </h1>

## Import required Libraries

In [1]:
import numpy as np
import pandas as pd
import pydot
import math
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pandas.api.types import CategoricalDtype

## Import train and test data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,07/17/1999,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,02/14/2008,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,03/09/2013,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,02/02/2012,Tokat,Other,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,05/09/2009,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715.0


In [3]:
test.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P28,P29,P30,P31,P32,P33,P34,P35,P36,P37
0,0,01/22/2011,Niğde,Other,FC,1,4.0,4.0,4.0,1,...,2.0,3.0,0,0,0,0,0,0,0,0
1,1,03/18/2011,Konya,Other,IL,3,4.0,4.0,4.0,2,...,1.0,3.0,0,0,0,0,0,0,0,0
2,2,10/30/2013,Ankara,Big Cities,FC,3,4.0,4.0,4.0,2,...,2.0,3.0,0,0,0,0,0,0,0,0
3,3,05/06/2013,Kocaeli,Other,IL,2,4.0,4.0,4.0,2,...,2.0,3.0,0,4,0,0,0,0,0,0
4,4,07/31/2013,Afyonkarahisar,Other,FC,2,4.0,4.0,4.0,1,...,5.0,3.0,0,0,0,0,0,0,0,0


## Check dimension and info of data sets

In [4]:
print(train.shape)
print(test.shape)

(137, 43)
(100000, 42)


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 43 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Id          137 non-null    int64  
 1   Open Date   137 non-null    object 
 2   City        137 non-null    object 
 3   City Group  137 non-null    object 
 4   Type        137 non-null    object 
 5   P1          137 non-null    int64  
 6   P2          137 non-null    float64
 7   P3          137 non-null    float64
 8   P4          137 non-null    float64
 9   P5          137 non-null    int64  
 10  P6          137 non-null    int64  
 11  P7          137 non-null    int64  
 12  P8          137 non-null    int64  
 13  P9          137 non-null    int64  
 14  P10         137 non-null    int64  
 15  P11         137 non-null    int64  
 16  P12         137 non-null    int64  
 17  P13         137 non-null    float64
 18  P14         137 non-null    int64  
 19  P15         137 non-null    i

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 42 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Id          100000 non-null  int64  
 1   Open Date   100000 non-null  object 
 2   City        100000 non-null  object 
 3   City Group  100000 non-null  object 
 4   Type        100000 non-null  object 
 5   P1          100000 non-null  int64  
 6   P2          100000 non-null  float64
 7   P3          100000 non-null  float64
 8   P4          100000 non-null  float64
 9   P5          100000 non-null  int64  
 10  P6          100000 non-null  int64  
 11  P7          100000 non-null  int64  
 12  P8          100000 non-null  int64  
 13  P9          100000 non-null  int64  
 14  P10         100000 non-null  int64  
 15  P11         100000 non-null  int64  
 16  P12         100000 non-null  int64  
 17  P13         100000 non-null  float64
 18  P14         100000 non-null  int64  
 19  P15

There are no null values in the datasets and there are three categorical variables

## Data preprocessing

#### Encoding categorical variables

City variable Encoding

In [7]:
train_city = train["City"].unique()
test_city = test["City"].unique()
all_city = set(train_city).union(test_city)
print(all_city)

{'Aksaray', 'Elazığ', 'Isparta', 'Zonguldak', 'Gaziantep', 'Mersin', 'Malatya', 'Balıkesir', 'Osmaniye', 'Bilecik', 'Sakarya', 'Kırıkkale', 'Erzincan', 'Bursa', 'Manisa', 'Tokat', 'Tanımsız', 'Batman', 'Ankara', 'Samsun', 'Tekirdağ', 'Ordu', 'Erzurum', 'Çanakkale', 'Trabzon', 'Antalya', 'Düzce', 'Çorum', 'Rize', 'Eskişehir', 'Kastamonu', 'Kayseri', 'Karabük', 'Sivas', 'Şanlıurfa', 'Mardin', 'İzmir', 'Kars', 'Kırşehir', 'Aydın', 'Bolu', 'Çankırı', 'Kocaeli', 'Kırklareli', 'Hatay', 'Muğla', 'Diyarbakır', 'Uşak', 'İstanbul', 'Niğde', 'Nevşehir', 'Adana', 'Kütahya', 'Konya', 'Denizli', 'Yalova', 'Artvin', 'Edirne', 'Kahramanmaraş', 'Giresun', 'Siirt', 'Amasya', 'Afyonkarahisar'}


In [8]:
city_dummies=pd.get_dummies(train['City'].astype(CategoricalDtype(all_city)))
train = pd.concat([train,city_dummies],axis=1)

city_dummies=pd.get_dummies(test['City'].astype(CategoricalDtype(all_city)))
test = pd.concat([test,city_dummies],axis=1)

City Group variable Encoding

In [9]:
city_group_dummies=pd.get_dummies(train['City Group'])
train = pd.concat([train,city_group_dummies],axis=1)

city_group_dummies=pd.get_dummies(test['City Group'])
test = pd.concat([test,city_group_dummies],axis=1)

Type variable Encoding

In [10]:
type_dummies=pd.get_dummies(train['Type'].astype(CategoricalDtype(["FC","IL","DT","MB"])))
train = pd.concat([train,type_dummies],axis=1)

type_dummies=pd.get_dummies(test['Type'])
test = pd.concat([test,type_dummies],axis=1)

#### Coverting date to year

In [11]:
train['year'] = pd.DatetimeIndex(train['Open Date']).year
test['year'] = pd.DatetimeIndex(test['Open Date']).year

#### Droping unnecessary variables

In [12]:
train.drop(['Id','Open Date','City','City Group','Type'],axis=1,inplace=True)
test.drop(['Id','Open Date','City','City Group','Type'],axis=1,inplace=True)

#### Check dimension of train and test

In [13]:
print(train.shape)
print(test.shape)

(137, 108)
(100000, 107)


## Feature Selection

In [14]:
X = train.drop("revenue",axis=1)
y = train["revenue"]

## Spliting Train into train and test

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state = 1)

## Regression models

### Simple linear regression model

In [16]:
simple_reg = LinearRegression().fit(X_train.iloc[:,0:1], y_train)

In [17]:
sy_predicted = simple_reg.predict(X_test.iloc[:,0:1])


### Multiple linear regression model

In [18]:
multi_reg = LinearRegression().fit(X_train, y_train)

In [19]:
my_predicted = multi_reg.predict(X_test)

### Polynomial Regression Model

In [20]:
poly = PolynomialFeatures()
X_poly = poly.fit_transform(X_train)
poly_reg = LinearRegression().fit(X_poly, y_train)

In [21]:
py_predicted = poly_reg.predict(poly.fit_transform(X_test))

### DecisionTree Regressor

In [22]:
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)

DecisionTreeRegressor()

In [23]:
dy_predicted = dt_reg.predict(X_test)

### Random Forest

In [24]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train)

RandomForestRegressor()

In [25]:
ry_predicted = rf_reg.predict(X_test)

## RMSE of Five models

In [26]:
rmse = pd.DataFrame({'RMSE':[math.sqrt(mean_squared_error(sy_predicted,y_test)),
                             math.sqrt(mean_squared_error(my_predicted,y_test)),
                             math.sqrt(mean_squared_error(py_predicted,y_test)),
                             math.sqrt(mean_squared_error(dy_predicted,y_test)),
                             math.sqrt(mean_squared_error(ry_predicted,y_test))]},                                      
                     index=["Simple Linear regression","Multiple Linear regression",
                            "Polynomial regression","Decision Tree",
                            "Random forest"])
rmse.head()

Unnamed: 0,RMSE
Simple Linear regression,3448971.0
Multiple Linear regression,3871366.0
Polynomial regression,7407362.0
Decision Tree,4420511.0
Random forest,3692638.0


## Specific to Random Forest

### Saving Tree Picture

In [27]:
tree = rf_reg.estimators_[5]
export_graphviz(tree, out_file = 'tree.dot', feature_names = X_train.columns, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('tree.dot')

In [28]:
graph.write_png('tree.png')

### Best estimators

In [29]:
importances = list(rf_reg.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X_train.columns, importances)]

feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: P29                  Importance: 0.27
Variable: year                 Importance: 0.07
Variable: P17                  Importance: 0.06
Variable: İzmir                Importance: 0.05
Variable: P2                   Importance: 0.04
Variable: P20                  Importance: 0.04
Variable: P23                  Importance: 0.04
Variable: P28                  Importance: 0.04
Variable: İstanbul             Importance: 0.04
Variable: P1                   Importance: 0.03
Variable: P22                  Importance: 0.03
Variable: P3                   Importance: 0.02
Variable: P6                   Importance: 0.02
Variable: P11                  Importance: 0.02
Variable: P19                  Importance: 0.02
Variable: P4                   Importance: 0.01
Variable: P5                   Importance: 0.01
Variable: P8                   Importance: 0.01
Variable: P9                   Importance: 0.01
Variable: P10                  Importance: 0.01
Variable: P12                  Importanc

### Random Forest with best estimators

In [30]:
rf_most_important = RandomForestRegressor()

train_important = X_train[["P29"]]
test_important = X_test[["P29"]]

rf_most_important.fit(train_important, y_train)

RandomForestRegressor()

In [31]:
rf_predicted = rf_most_important.predict(test_important)

### Comparing Random forest and Random forest with best estimators

In [32]:
rmse = pd.DataFrame({'RMSE':[math.sqrt(mean_squared_error(ry_predicted,y_test)),
                             math.sqrt(mean_squared_error(rf_predicted,y_test))]},                                      
                     index=["Random forest","Random forest with best estimators"])
rmse.head()

Unnamed: 0,RMSE
Random forest,3692638.0
Random forest with best estimators,3892087.0
