## ***Import Libraries***

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## ***Load The Data***

In [4]:
df = pd.read_csv('sample_data/Carseats.csv')

## ***Explore The Data***

In [5]:
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [6]:
df.shape

(400, 11)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


# **Encoder les variables qualitatives**

In [8]:
df['Urban'].unique()

array(['Yes', 'No'], dtype=object)

In [9]:
df['US'].unique()

array(['Yes', 'No'], dtype=object)

In [10]:
map = {'Yes':'1','No':'0'}
df['Urban']=df['Urban'].map(map)

In [11]:
df.tail()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
395,12.57,138,108,17,203,128,Good,33,14,1,Yes
396,6.14,139,23,3,37,120,Medium,55,11,0,Yes
397,7.41,162,26,12,368,159,Medium,40,18,1,Yes
398,5.94,100,79,7,284,95,Bad,50,12,1,Yes
399,9.71,134,37,0,27,120,Good,49,16,1,Yes


In [12]:
map = {'Yes':'1','No':'0'}
df['US']=df['US'].map(map)

In [13]:
df.tail()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
395,12.57,138,108,17,203,128,Good,33,14,1,1
396,6.14,139,23,3,37,120,Medium,55,11,0,1
397,7.41,162,26,12,368,159,Medium,40,18,1,1
398,5.94,100,79,7,284,95,Bad,50,12,1,1
399,9.71,134,37,0,27,120,Good,49,16,1,1


In [14]:
df['ShelveLoc'].unique()

array(['Bad', 'Good', 'Medium'], dtype=object)

In [15]:
map = {'Bad':'1','Medium':'2','Good':'3'}
df['ShelveLoc'] = df['ShelveLoc'].map(map)

In [16]:
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,1,42,17,1,1
1,11.22,111,48,16,260,83,3,65,10,1,1
2,10.06,113,35,10,269,80,2,59,12,1,1
3,7.4,117,100,4,466,97,2,55,14,1,1
4,4.15,141,64,3,340,128,1,38,13,1,0


## ***Building The Random Forest Model***

In [17]:
X = df.drop('Sales',axis=1)
y = df['Sales']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
Tree_model = DecisionTreeRegressor()
param_grid = {'max_depth': [ i for i in range (1, 16)]}

In [21]:
grid_search = GridSearchCV(Tree_model, param_grid, cv=5)
grid_search.fit(X_train,y_train)

In [None]:
print(grid_search.best_params_)

{'max_depth': 3}


In [22]:
model = grid_search.best_estimator_

In [24]:
rfreg = RandomForestRegressor(n_estimators=15)
param_grid = {
    'n_estimators': [50, 1100, 50],
    'max_depth': [ i for i in range (2, 11)],
}
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [25]:
grid_cv = GridSearchCV(rfreg, param_grid, cv=kf)
grid_cv.fit(X_train, y_train)

In [26]:
print(grid_cv.best_estimator_)

RandomForestRegressor(max_depth=10, n_estimators=1100)


In [27]:
model = grid_cv.best_estimator_

In [30]:
predictions = model.predict(X_test)
mse = mean_squared_error(predictions, y_test)
print(f"Root Mean Square Error {np.sqrt(mse): .2f}")

Root Mean Square Error  1.58


In [31]:
df.Sales.mean()

7.496325000000001

## ***Comparing Random Forest Model with The Linear Regression Model***

In [32]:
from sklearn.linear_model import LinearRegression

In [33]:
l = LinearRegression()
l.fit(X_train,y_train)

In [34]:
y_pred = l.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
print(f"Root Mean Square Error {np.sqrt(mse): .2f}")

Root Mean Square Error  1.11
