# puma32H dataset
[Source](https://www.openml.org/search?type=data&sort=qualities.NumberOfFeatures&status=active&qualities.NumberOfClasses=lte_1&qualities.NumberOfFeatures=between_10_100&format=ARFF&qualities.NumberOfInstances=between_1000_10000&id=308)

Goal: predict the angular accelaration of one of the robot arm's links

Perfect challenge for a random forest tree regressor.

8192 instances

33 features

This dataset includes a realistic simulation of the dynamics of a Puma 560 robot arm.

### Import libraries

In [38]:
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff 
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
import sklearn
from random_forest_regressor import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import sklearn.ensemble
import time

RANDOM_SEED = 42


### Import data

In [2]:
raw_data = loadarff('phpJEvZWG.arff')
df = pd.DataFrame(raw_data[0])

In [3]:
df.head()

Unnamed: 0,theta1,theta2,theta3,theta4,theta5,theta6,thetad1,thetad2,thetad3,thetad4,...,da2,da3,da4,da5,db1,db2,db3,db4,db5,thetadd6
0,0.73646,-1.761829,1.590594,-0.268853,0.572145,-1.941886,0.727704,1.869884,0.224501,-1.442215,...,2.220872,2.440442,0.510303,1.157391,0.265448,1.141465,0.356314,0.568853,1.954376,0.052627
1,-0.389711,0.342256,-1.522463,0.237098,-1.771509,-0.885488,-0.679111,-0.410219,-0.331288,2.092878,...,1.580327,0.809171,2.235141,1.517466,1.795334,0.929355,1.663727,0.754457,0.650492,0.001308
2,-0.269351,1.622452,-2.047811,1.720603,-1.749964,-1.618348,0.327188,-0.317671,0.046938,1.911881,...,1.723625,1.718983,2.438604,1.113059,2.18027,1.794781,0.627965,0.961728,1.258398,0.003834
3,0.25684,0.16504,-1.776401,1.723357,2.117348,-1.692605,1.077334,1.761624,-0.333281,1.57386,...,1.369922,1.528851,1.901706,2.222391,0.968513,0.546513,2.23634,1.412382,1.898374,-0.00201
4,0.96827,1.834561,0.299747,0.308144,0.064617,2.174855,-1.213028,-1.563548,-2.27727,-0.059009,...,1.171433,2.273871,2.199479,0.6464,0.302676,1.323917,2.324899,0.33422,0.491228,0.015778


In [4]:
df.shape # (39644	61)

(8192, 33)

In [5]:
df.isna().any().sum()

np.int64(0)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8192 entries, 0 to 8191
Data columns (total 33 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   theta1    8192 non-null   float64
 1   theta2    8192 non-null   float64
 2   theta3    8192 non-null   float64
 3   theta4    8192 non-null   float64
 4   theta5    8192 non-null   float64
 5   theta6    8192 non-null   float64
 6   thetad1   8192 non-null   float64
 7   thetad2   8192 non-null   float64
 8   thetad3   8192 non-null   float64
 9   thetad4   8192 non-null   float64
 10  thetad5   8192 non-null   float64
 11  thetad6   8192 non-null   float64
 12  tau1      8192 non-null   float64
 13  tau2      8192 non-null   float64
 14  tau3      8192 non-null   float64
 15  tau4      8192 non-null   float64
 16  tau5      8192 non-null   float64
 17  dm1       8192 non-null   float64
 18  dm2       8192 non-null   float64
 19  dm3       8192 non-null   float64
 20  dm4       8192 non-null   floa

In [7]:
df.empty

False

Based on the description of the variables, we will drop "url", since it works as an ID, and also drop "timedelta", since it represents days between the article publication and the dataset acquisition (non-predictive)

### Prepare the data

In [8]:
X_data = df.drop("thetadd6", axis=1)
y_data = df["thetadd6"]


X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=RANDOM_SEED)

## Run the experiments

### Baseline experiment

In [10]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [11]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=5, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [12]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.02942777195908711
MAE of implementation:  0.022627228393050633
RMSE of sklearn:  0.0111023799842939
MAE of sklearn:  0.00873443982270454
Runtime of implementation 122.92449593544006
Runtime of sklearn 0.37678050994873047


### Experiments on trees depth
**1. Decrease tree depth to 2 instead of 5** 

In [13]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=2, max_depth=2, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [14]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=2, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [15]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)

print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.0294180123909433
MAE of implementation:  0.022618471211399725
RMSE of sklearn:  0.023396654004291365
MAE of sklearn:  0.01804306602457138
Runtime of implementation 48.63184690475464
Runtime of sklearn 0.16441941261291504


**2. Increase tree depth to None instead of 5**

In [24]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=2, max_depth=None, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

KeyboardInterrupt: 

In [None]:
import sys
print(sys.getrecursionlimit())

3000


By printing the recursion limit, we can see that our implementation exceeded 3000 recursions

In [12]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=None, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [13]:
#implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
#print("RMSE of implementation: ", implementation_rmse)
print("RMSE of sklearn: ", sklearn_rmse)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("MAE of sklearn: ", sklearn_mae)
#implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
#print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)


# implementation_rmse = root_mean_squared_error(y_test, y_pred)
# implementation_mae = mean_absolute_error(y_test, y_pred)
# sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
# sklearn_mae = mean_absolute_error(y_test, sk_y_pred)

# print("RMSE of implementation: ", implementation_rmse)
# print("MAE of implementation: ", implementation_mae)
# print("RMSE of sklearn: ", sklearn_rmse)
# print("MAE of sklearn: ", sklearn_mae)

RMSE of sklearn:  12582.32145962745
Runtime of sklearn 9.121507167816162


## Experiments on max features
**1. Increase max features to 2**

In [16]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=2, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [17]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=5, max_features=2, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [18]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.029426646373290823
MAE of implementation:  0.022626216659288847
RMSE of sklearn:  0.02928887207891027
MAE of sklearn:  0.022543752280621825
Runtime of implementation 236.5267469882965
Runtime of sklearn 0.04462265968322754


**2. Increase max features to 5**

In [19]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=5, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [20]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=5, max_features=5,random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [21]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.029409324634598104
MAE of implementation:  0.022610411183165556
RMSE of sklearn:  0.02746698187678999
MAE of sklearn:  0.021319128450680482
Runtime of implementation 572.8327031135559
Runtime of sklearn 0.0940251350402832


## Experiments on min samples splits
**1. Increase min_sample_split to 5**

In [22]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=5, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [23]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=5, max_features=1, min_samples_split=5, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [24]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.029412504761813382
MAE of implementation:  0.02261342845185598
RMSE of sklearn:  0.028728216878557974
MAE of sklearn:  0.022135444564221635
Runtime of implementation 119.62125492095947
Runtime of sklearn 0.025078296661376953


**2. Increase min_sample_split to 10**

In [25]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=10, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [26]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=5, max_features=1, min_samples_split=10, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [27]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.029414766386084784
MAE of implementation:  0.022615558046876955
RMSE of sklearn:  0.02840280752170343
MAE of sklearn:  0.021892236155497055
Runtime of implementation 118.49422836303711
Runtime of sklearn 0.025309324264526367


## Experiments on number of trees
**1. Increasing number of trees to 30**

In [28]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=30, max_features=1, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [29]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=30, max_depth=5, max_features=1, min_samples_split=2, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [30]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.02939355692004114
MAE of implementation:  0.022595809911376965
RMSE of sklearn:  0.028842184038434398
MAE of sklearn:  0.02221679346541146
Runtime of implementation 695.7822754383087
Runtime of sklearn 0.12179279327392578


**2. Increasing number of trees to 100**

In [32]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=100, max_features=1, min_samples_split=2, max_depth=5, random_state=RANDOM_SEED)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [33]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=100, max_depth=5, max_features=1, min_samples_split=2, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [34]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
implementation_mae = mean_absolute_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("MAE of implementation: ", implementation_mae)
print("RMSE of sklearn: ", sklearn_rmse)
print("MAE of sklearn: ", sklearn_mae)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.029391143141854415
MAE of implementation:  0.022595595762205472
RMSE of sklearn:  0.028362171580740274
MAE of sklearn:  0.021881752971350185
Runtime of implementation 3417.44739985466
Runtime of sklearn 0.7584710121154785


**Experiment on default values with sklearn RandomForestRegressor for comparison**

In [35]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [36]:
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
print("RMSE of sklearn: ", sklearn_rmse)
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of sklearn", sklearn_runtime)
sklearn_mae = mean_absolute_error(y_test, sk_y_pred)
print("MAE of sklearn: ", sklearn_mae)

RMSE of sklearn:  0.007927639549178677
Runtime of sklearn 12.581874370574951
MAE of sklearn:  0.006345761073825503


In [39]:
scaler_ss = StandardScaler()
X_train = scaler_ss.fit_transform(X_train)
X_test=scaler_ss.transform(X_test)

start_time_knn = time.time()
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
knn_y_pred = knn.predict(X_test)
end_time_knn = time.time()

In [40]:
knn_rmse = root_mean_squared_error(y_test, knn_y_pred)
print("RMSE of knn: ", knn_rmse)
knn_mae = mean_absolute_error(y_test, knn_y_pred)
print("MAE of knn: ", knn_mae)
knn_runtime = end_time_knn - start_time_knn
print("Runtime of knn", knn_runtime)

RMSE of knn:  0.02763746692014702
MAE of knn:  0.021907218913971935
Runtime of knn 0.1719827651977539
