In [37]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import time

from random_forest_regressor import RandomForestRegressor
from sklearn import ensemble
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error

RANDOM_SEED=42

For this dataset, we have 398 rows and 9 columns, all numeric except car name and horsepower. <br>
We drop Car name since it is a column that can work like an "ID" and we don't believe it is important for the model. <br>
For horsepower we transform the column into a float and turn the missing values '?' into nan, which are then imputed with SimpleImputer, using then median. <br>
Since we will use this data for a RandomForest Model, scaling the data is not a necessary step.


In [2]:
columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']
# read data 
data = pd.read_csv('auto-mpg.data', sep='\\s+', header=None, names=columns, quotechar='"')

# drop car_name - Like id
data.drop(["car_name"], axis=1, inplace=True)

data.dtypes #horsepower is object but should be float

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model_year        int64
origin            int64
dtype: object

In [3]:
# Replace '?' with NaN
data['horsepower']=pd.to_numeric(data['horsepower'], errors='coerce')
data[data.isnull().any(axis=1)]


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
32,25.0,4,98.0,,2046.0,19.0,71,1
126,21.0,6,200.0,,2875.0,17.0,74,1
330,40.9,4,85.0,,1835.0,17.3,80,2
336,23.6,4,140.0,,2905.0,14.3,80,1
354,34.5,4,100.0,,2320.0,15.8,81,2
374,23.0,4,151.0,,3035.0,20.5,82,1


In [4]:
imp_median =SimpleImputer(strategy='median', missing_values=np.nan)
data_clean=pd.DataFrame(imp_median.fit_transform(data), columns=data.columns)
data_clean

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0
...,...,...,...,...,...,...,...,...
393,27.0,4.0,140.0,86.0,2790.0,15.6,82.0,1.0
394,44.0,4.0,97.0,52.0,2130.0,24.6,82.0,2.0
395,32.0,4.0,135.0,84.0,2295.0,11.6,82.0,1.0
396,28.0,4.0,120.0,79.0,2625.0,18.6,82.0,1.0


In [40]:
X = data_clean.drop(['mpg'], axis=1)
y = data_clean['mpg']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Experiments on Hyperparameters

#### Base line 

In [6]:
#default value for parameters
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [7]:
start_time_sk = time.time()
sklearn_rf = ensemble.RandomForestRegressor(n_estimators=5, max_features=1, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [8]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)

print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)

implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  11.38586166939917
MAE of implementation:  9.81204342251951
RMSE of sklearn:  2.9682587098347897
MAE of sklearn:  2.215011232989232
Runtime of implementation 3.082282781600952
Runtime of sklearn 0.007988691329956055


#### Number of Trees 30

In [9]:
# n trees 30
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=30, max_features=1, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [10]:
start_time_sk = time.time()
sklearn_rf = ensemble.RandomForestRegressor(n_estimators=30, max_features=1, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [11]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  11.014588639614686
MAE of implementation:  9.468167452563751
RMSE of sklearn:  2.522076400401327
MAE of sklearn:  1.9325292649077306
Runtime of implementation 18.36820101737976
Runtime of sklearn 0.024845361709594727


#### Number of Trees 100

In [12]:
# n trees 100
start_time_implementation = time.time() 
rf = RandomForestRegressor(n_trees=100, max_features=1, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [13]:
start_time_sk = time.time()
sklearn_rf = ensemble.RandomForestRegressor(n_estimators=100, max_features=1, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [14]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)

print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  11.462246015384688
MAE of implementation:  9.88177145155415
RMSE of sklearn:  2.4839273545421685
MAE of sklearn:  1.8819321369386526
Runtime of implementation 61.3521032333374
Runtime of sklearn 0.06237459182739258


#### Max depth 2

In [15]:
# max depth  2
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=2, max_depth=2, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [16]:
start_time_sk = time.time()
sklearn_rf = ensemble.RandomForestRegressor(n_estimators=5, max_features=1, min_samples_split=2, max_depth=2, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [17]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  11.387936017672676
MAE of implementation:  9.81394134528318
RMSE of sklearn:  3.835457941916181
MAE of sklearn:  3.112220184509068
Runtime of implementation 1.406388759613037
Runtime of sklearn 0.007781505584716797


#### Max depth None

In [18]:
# max depth  none
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=2, max_features=1, min_samples_split=2, max_depth=None, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'

In [19]:
start_time_sk = time.time()
sklearn_rf = ensemble.RandomForestRegressor(n_estimators=2, max_features=1, min_samples_split=2, max_depth=None, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [20]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  11.387936017672676
MAE of implementation:  9.81394134528318
RMSE of sklearn:  3.4988748191382895
MAE of sklearn:  2.6125
Runtime of implementation -0.029045820236206055
Runtime of sklearn 0.004117727279663086


#### Max Features 2

In [21]:
# max features 2
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=2, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [22]:
start_time_sk = time.time()
sklearn_rf = ensemble.RandomForestRegressor(n_estimators=5, max_features=2, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [23]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  11.368850396237862
MAE of implementation:  9.796469710344972
RMSE of sklearn:  2.5232343448242727
MAE of sklearn:  1.8676934683370494
Runtime of implementation 6.646385908126831
Runtime of sklearn 0.011179447174072266


#### Max Features 5

In [24]:
# max features 5
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=5, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [25]:
start_time_sk = time.time()
sklearn_rf = ensemble.RandomForestRegressor(n_estimators=5, max_features=5, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [26]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  11.326123172928744
MAE of implementation:  9.757279682624091
RMSE of sklearn:  2.867168598249842
MAE of sklearn:  1.9788830660061265
Runtime of implementation 16.153379201889038
Runtime of sklearn 0.006373405456542969


#### Min Samples Split 5

In [27]:
# min_samples_split 5
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=5, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [28]:
start_time_sk = time.time()
sklearn_rf = ensemble.RandomForestRegressor(n_estimators=5, max_features=1, min_samples_split=5, max_depth=5, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [29]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  11.328046705037622
MAE of implementation:  9.759046254878786
RMSE of sklearn:  2.892538025647866
MAE of sklearn:  2.279381039355979
Runtime of implementation 3.423983097076416
Runtime of sklearn 0.007489442825317383


#### Min Samples Split 10

In [30]:
#min_samples_split 10
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=10, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [31]:
start_time_sk = time.time()
sklearn_rf = ensemble.RandomForestRegressor(n_estimators=5, max_features=1, min_samples_split=10, max_depth=5, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [32]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  11.329850230333912
MAE of implementation:  9.760702416367561
RMSE of sklearn:  2.599358192716391
MAE of sklearn:  2.0463838934383127
Runtime of implementation 3.5017473697662354
Runtime of sklearn 0.007673740386962891


Default hyperparameter values

In [33]:
start_time_sk = time.time()
sklearn_rf = ensemble.RandomForestRegressor(random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [34]:
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
print("RMSE of sklearn: ", sklearn_rmse)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("MAE of sklearn: ", sklearn_mae)
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of sklearn", sklearn_runtime)

RMSE of sklearn:  2.1429114161579332
MAE of sklearn:  1.5780125
Runtime of sklearn 0.1288132667541504


#### Experiment LLM Random Forest

In [None]:
from llm_implementation import RandomForestRegressor as LLM_RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Train Random Forest
rf = LLM_RandomForestRegressor(n_estimators=10, max_depth=5)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Evaluate
print("MSE:", mean_squared_error(y_test, y_pred))

#### KNN

In [38]:
scaler_ss = StandardScaler()
X_train = scaler_ss.fit_transform(X_train)
X_test=scaler_ss.transform(X_test)

start_time_knn = time.time()
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
knn_y_pred = knn.predict(X_test)
end_time_knn = time.time()

In [39]:
knn_rmse = root_mean_squared_error(y_test, knn_y_pred)
print("RMSE of knn: ", knn_rmse)
knn_mae = mean_absolute_error(y_test, knn_y_pred)
print("MAE of knn: ", knn_mae)
knn_runtime = end_time_knn - start_time_knn
print("Runtime of knn", knn_runtime)

RMSE of knn:  2.3239094216427625
MAE of knn:  1.88675
Runtime of knn 0.011890649795532227
