In [50]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import time

from random_forest_regressor import RandomForestRegressor
from sklearn import ensemble
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import root_mean_squared_error

For this dataset, we have 398 rows and 9 columns, all numeric except car name and horsepower. <br>
We drop Car name since it is a column that can work like an "ID" and we don't believe it is important for the model. <br>
For horsepower, we convert the column to float and turn the missing-value marker '?' into NaN; the missing values are then imputed with SimpleImputer, using the median. <br>
Since we will use this data for a RandomForest Model, scaling the data is not a necessary step.


In [3]:
# Column labels for the header-less data file, in file order.
columns = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin', 'car_name',
]

# The file is whitespace-delimited and the car names are wrapped in double quotes.
data = pd.read_csv('auto-mpg.data', names=columns, header=None, sep='\\s+', quotechar='"')

# car_name behaves like a row identifier, so it is left out of the modelling data.
data = data.drop(columns=["car_name"])

# horsepower is read as object although it should be float.
data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model_year        int64
origin            int64
dtype: object

In [4]:
# Values that cannot be parsed as numbers (the '?' placeholders) become NaN
# because of errors='coerce'.
horsepower_numeric = pd.to_numeric(data['horsepower'], errors='coerce')
data['horsepower'] = horsepower_numeric

# Display every row that has at least one missing value.
rows_with_missing = data.isnull().any(axis=1)
data[rows_with_missing]


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
32,25.0,4,98.0,,2046.0,19.0,71,1
126,21.0,6,200.0,,2875.0,17.0,74,1
330,40.9,4,85.0,,1835.0,17.3,80,2
336,23.6,4,140.0,,2905.0,14.3,80,1
354,34.5,4,100.0,,2320.0,15.8,81,2
374,23.0,4,151.0,,3035.0,20.5,82,1


In [5]:
# Replace each NaN with the median of its column, then wrap the result back
# into a DataFrame carrying the original column labels.
# NOTE(review): the imputer is fit on every row and every column (mpg included)
# before the train/test split performed in a later cell - confirm this is acceptable.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
imputed_values = imp_median.fit_transform(data)
data_clean = pd.DataFrame(imputed_values, columns=data.columns)
data_clean

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0
...,...,...,...,...,...,...,...,...
393,27.0,4.0,140.0,86.0,2790.0,15.6,82.0,1.0
394,44.0,4.0,97.0,52.0,2130.0,24.6,82.0,2.0
395,32.0,4.0,135.0,84.0,2295.0,11.6,82.0,1.0
396,28.0,4.0,120.0,79.0,2625.0,18.6,82.0,1.0


In [6]:
# mpg is the regression target; every other column is a feature.
y = data_clean['mpg']
X = data_clean.drop(columns=['mpg'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Experiments on Hyperparameters

#### Base line 

In [8]:
# Baseline configuration for the custom forest (imported from random_forest_regressor).
# Training plus prediction is timed with perf_counter(), a monotonic,
# high-resolution clock meant for measuring elapsed time; time.time() can jump
# when the system clock is adjusted.
# NOTE(review): no seed is passed here, and repeated runs of this exact
# configuration later in the notebook report different RMSE values - confirm
# whether the custom implementation can be seeded.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=2,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [9]:
# scikit-learn reference forest for the baseline configuration.
# The class is reached through the imported `ensemble` module: the notebook only
# does `from sklearn import ensemble`, so the bare name `sklearn` is not bound
# and `sklearn.ensemble...` raises NameError on a fresh kernel.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=2,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [10]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.399330685137706
RMSE of sklearn:  4.368953470323409
Runtime of implementation 1.2422239780426025
Runtime of sklearn 0.019565820693969727


#### Number of Trees 5

In [12]:
# Experiment: n_trees=5, remaining hyperparameters as in the baseline run.
# Fit + predict of the custom forest is timed with perf_counter(), the
# monotonic high-resolution clock meant for measuring elapsed time.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=5,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [13]:
# scikit-learn reference forest with n_estimators=5.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=5,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [14]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.136436427438854
RMSE of sklearn:  3.835457941916181
Runtime of implementation 2.989116907119751
Runtime of sklearn 0.011967897415161133


#### Number of Trees 100

In [15]:
# Experiment: n_trees=100, remaining hyperparameters as in the baseline run.
# Fit + predict of the custom forest is timed with perf_counter(), the
# monotonic high-resolution clock meant for measuring elapsed time.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=100,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [16]:
# scikit-learn reference forest with n_estimators=100.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=100,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [17]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.591181508916089
RMSE of sklearn:  3.310487377319611
Runtime of implementation 43.19919228553772
Runtime of sklearn 0.10717153549194336


#### Number of Trees 30

In [18]:
# Experiment: n_trees=30, remaining hyperparameters as in the baseline run.
# Fit + predict of the custom forest is timed with perf_counter(), the
# monotonic high-resolution clock meant for measuring elapsed time.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=30,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [19]:
# scikit-learn reference forest with n_estimators=30.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=30,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [20]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.553988101779971
RMSE of sklearn:  3.3609497750291752
Runtime of implementation 14.145478248596191
Runtime of sklearn 0.03712296485900879


#### Max depth 2

In [21]:
# Experiment: max_depth=2 (the same configuration as the baseline run).
# Fit + predict of the custom forest is timed with perf_counter(), the
# monotonic high-resolution clock meant for measuring elapsed time.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=2,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [22]:
# scikit-learn reference forest with max_depth=2.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=2,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [23]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.584886830302128
RMSE of sklearn:  4.368953470323409
Runtime of implementation 1.528993844985962
Runtime of sklearn 0.012218713760375977


#### Max depth 5

In [24]:
# Experiment: max_depth=5, remaining hyperparameters as in the baseline run.
# Fit + predict of the custom forest is timed with perf_counter(), the
# monotonic high-resolution clock meant for measuring elapsed time.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=2,
    max_features=1,
    min_samples_split=2,
    max_depth=5,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [25]:
# scikit-learn reference forest with max_depth=5.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=2,
    max_features=1,
    min_samples_split=2,
    max_depth=5,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [26]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.596566248433618
RMSE of sklearn:  3.438397803218629
Runtime of implementation 3.244258403778076
Runtime of sklearn 0.008433103561401367


#### Max depth None

In [27]:
# Experiment: max_depth=None, remaining hyperparameters as in the baseline run.
# Fit + predict of the custom forest is timed with perf_counter(), the
# monotonic high-resolution clock meant for measuring elapsed time.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=2,
    max_features=1,
    min_samples_split=2,
    max_depth=None,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [28]:
# scikit-learn reference forest with max_depth=None.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=2,
    max_features=1,
    min_samples_split=2,
    max_depth=None,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [29]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.633423959981265
RMSE of sklearn:  3.4988748191382895
Runtime of implementation 8.12107515335083
Runtime of sklearn 0.011278629302978516


#### Max Features 1

In [30]:
# Experiment: max_features=1 (the same configuration as the baseline run).
# Fit + predict of the custom forest is timed with perf_counter(), the
# monotonic high-resolution clock meant for measuring elapsed time.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=2,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [31]:
# scikit-learn reference forest with max_features=1.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=2,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [32]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.591475407102223
RMSE of sklearn:  4.368953470323409
Runtime of implementation 1.5681066513061523
Runtime of sklearn 0.009543657302856445


#### Max Features 2

In [33]:
# Experiment: max_features=2, remaining hyperparameters as in the baseline run.
# Fit + predict of the custom forest is timed with perf_counter(), the
# monotonic high-resolution clock meant for measuring elapsed time.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=2,
    max_features=2,
    min_samples_split=2,
    max_depth=2,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [34]:
# scikit-learn reference forest with max_features=2.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=2,
    max_features=2,
    min_samples_split=2,
    max_depth=2,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [35]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.62490027547944
RMSE of sklearn:  3.5719375119435752
Runtime of implementation 2.759108066558838
Runtime of sklearn 0.010189056396484375


#### Max Features 5

In [36]:
# Experiment: max_features=5, remaining hyperparameters as in the baseline run.
# Fit + predict of the custom forest is timed with perf_counter(), the
# monotonic high-resolution clock meant for measuring elapsed time.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=2,
    max_features=5,
    min_samples_split=2,
    max_depth=2,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [37]:
# scikit-learn reference forest with max_features=5.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=2,
    max_features=5,
    min_samples_split=2,
    max_depth=2,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [38]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.625539102743836
RMSE of sklearn:  3.3750707273442777
Runtime of implementation 5.981765031814575
Runtime of sklearn 0.00845193862915039


#### Min Samples Split 2

In [39]:
# Experiment: min_samples_split=2 (the same configuration as the baseline run).
# Fit + predict of the custom forest is timed with perf_counter(), the
# monotonic high-resolution clock meant for measuring elapsed time.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=2,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [40]:
# scikit-learn reference forest with min_samples_split=2.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=2,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [41]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.622637218361325
RMSE of sklearn:  4.368953470323409
Runtime of implementation 1.5630230903625488
Runtime of sklearn 0.011353731155395508


#### Min Samples Split 5

In [42]:
# Experiment: min_samples_split=5, remaining hyperparameters as in the baseline run.
# Fit + predict of the custom forest is timed with perf_counter(), the
# monotonic high-resolution clock meant for measuring elapsed time.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=2,
    max_features=1,
    min_samples_split=5,
    max_depth=2,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [43]:
# scikit-learn reference forest with min_samples_split=5.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=2,
    max_features=1,
    min_samples_split=5,
    max_depth=2,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [44]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.646528904173673
RMSE of sklearn:  4.368953470323409
Runtime of implementation 1.5384032726287842
Runtime of sklearn 0.008997440338134766


#### Min Samples Split 10

In [45]:
# Experiment: min_samples_split=10, remaining hyperparameters as in the baseline run.
# Fit + predict of the custom forest is timed with perf_counter(), the
# monotonic high-resolution clock meant for measuring elapsed time.
start_time_implementation = time.perf_counter()
rf = RandomForestRegressor(
    n_trees=2,
    max_features=1,
    min_samples_split=10,
    max_depth=2,
)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.perf_counter()

In [46]:
# scikit-learn reference forest with min_samples_split=10.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(
    n_estimators=2,
    max_features=1,
    min_samples_split=10,
    max_depth=2,
    random_state=42,
)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [47]:
# Accuracy: RMSE of each model's predictions on the held-out test split.
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
for label, score in (
    ("RMSE of implementation: ", implementation_rmse),
    ("RMSE of sklearn: ", sklearn_rmse),
):
    print(label, score)

# Cost: elapsed time between the start/end stamps taken around fit + predict.
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
for label, seconds in (
    ("Runtime of implementation", implementation_runtime),
    ("Runtime of sklearn", sklearn_runtime),
):
    print(label, seconds)

RMSE of implementation:  9.632730957854674
RMSE of sklearn:  4.368953470323409
Runtime of implementation 1.557582139968872
Runtime of sklearn 0.012628793716430664


In [51]:
# scikit-learn forest with its default hyperparameters; only the seed is fixed.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
start_time_sk = time.perf_counter()
sklearn_rf = ensemble.RandomForestRegressor(random_state=42)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.perf_counter()

In [52]:
# Held-out RMSE of the sklearn forest fitted in the preceding cell.
sklearn_rmse = root_mean_squared_error(y_true=y_test, y_pred=sk_y_pred)
print("RMSE of sklearn: ", sklearn_rmse)

# Elapsed time between the start/end stamps taken around fit + predict.
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of sklearn", sklearn_runtime)

RMSE of sklearn:  2.1429114161579332
Runtime of sklearn 0.13109517097473145


In [53]:
# k-nearest-neighbours regressor with default settings as a non-forest comparison.
# Timed with perf_counter(), a monotonic high-resolution clock, so that short
# runs are measured reliably.
# NOTE(review): KNN is distance-based and the features are used unscaled here
# (e.g. weight is in the thousands while cylinders is single-digit), so
# large-magnitude columns likely dominate the distance - consider scaling.
start_time_knn = time.perf_counter()
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
knn_y_pred = knn.predict(X_test)
end_time_knn = time.perf_counter()

In [54]:
# Held-out RMSE of the KNN predictions.
knn_rmse = root_mean_squared_error(y_true=y_test, y_pred=knn_y_pred)
print("RMSE of knn: ", knn_rmse)

# Elapsed time between the start/end stamps taken around fit + predict.
knn_runtime = end_time_knn - start_time_knn
print("Runtime of knn", knn_runtime)

RMSE of knn:  3.547021990346268
Runtime of knn 0.02063751220703125
