# OnlineNewsPopularity dataset
[Source](https://www.openml.org/search?type=data&sort=qualities.NumberOfFeatures&status=active&qualities.NumberOfClasses=lte_1&qualities.NumberOfFeatures=between_10_100&format=ARFF&qualities.NumberOfInstances=between_1000_10000&id=4545)

Goal: predict the number of shares in social networks (popularity).

Perfect challenge for a random forest tree regressor.

39644 instances

61 features

Each instance is an article publication and all features are different characteristics of the articles.

### Import libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff 
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import sklearn
from random_forest_regressor import RandomForestRegressor
import sklearn.ensemble
import time

RANDOM_SEED = 42


### Import data

In [2]:
raw_data = loadarff('phpgBMvy4.arff')
df = pd.DataFrame(raw_data[0])

In [3]:
df.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,b'http://mashable.com/2013/01/07/amazon-instan...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593.0
1,b'http://mashable.com/2013/01/07/ap-samsung-sp...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711.0
2,b'http://mashable.com/2013/01/07/apple-40-bill...,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,...,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500.0
3,b'http://mashable.com/2013/01/07/astronaut-not...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200.0
4,b'http://mashable.com/2013/01/07/att-u-verse-a...,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,...,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505.0


In [None]:
df.shape # (39644	61)

In [12]:
df.isna().any().sum()

np.int64(0)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 61 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   url                            39644 non-null  object 
 1   timedelta                      39644 non-null  float64
 2   n_tokens_title                 39644 non-null  float64
 3   n_tokens_content               39644 non-null  float64
 4   n_unique_tokens                39644 non-null  float64
 5   n_non_stop_words               39644 non-null  float64
 6   n_non_stop_unique_tokens       39644 non-null  float64
 7   num_hrefs                      39644 non-null  float64
 8   num_self_hrefs                 39644 non-null  float64
 9   num_imgs                       39644 non-null  float64
 10  num_videos                     39644 non-null  float64
 11  average_token_length           39644 non-null  float64
 12  num_keywords                   39644 non-null 

In [14]:
df.empty

False

Based on the description of the variables, we will drop "url", since it works as an ID, and also drop "timedelta", since it represents days between the article publication and the dataset acquisition (non-predictive)

In [15]:
df.drop(columns=["url", "timedelta"], axis=1, inplace=True)

In [16]:
df.head()

Unnamed: 0,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,0.0,4.680365,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593.0
1,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,0.0,4.913725,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711.0
2,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,0.0,4.393365,...,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500.0
3,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,0.0,4.404896,...,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200.0
4,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,0.0,4.682836,...,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505.0


### Prepare the data

In [17]:
X_data = df.drop("shares", axis=1)
y_data = df["shares"]

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=RANDOM_SEED)

## Run the experiments

### Baseline experiment

In [18]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=2, max_depth=5)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [19]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=5, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [20]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("RMSE of sklearn: ", sklearn_rmse)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  10985.340310784997
RMSE of sklearn:  12049.228010015688
Runtime of implementation 617.0055634975433
Runtime of sklearn 1.4171433448791504


### Experiments on trees depth
**1. Decrease tree depth to 2 instead of 5** 

In [21]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=2, max_depth=2)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [22]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=2, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [23]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("RMSE of sklearn: ", sklearn_rmse)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  10987.116007862305
RMSE of sklearn:  10821.73138056426
Runtime of implementation 292.96818494796753
Runtime of sklearn 0.5936460494995117


**2. Increase tree depth to None instead of 5**

In [24]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=2, max_depth=None)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

KeyboardInterrupt: 

In [None]:
import sys
print(sys.getrecursionlimit())

3000


By printing the recursion limit, we can see that our implementation exceeded 3000 recursions

In [None]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=None, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [None]:
#implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
#print("RMSE of implementation: ", implementation_rmse)
print("RMSE of sklearn: ", sklearn_rmse)
#implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
#print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of sklearn:  0.2814079075177691
Runtime of sklearn 0.3791069984436035


**3. Increase tree depth to 20 instead of 5**

In [None]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=2, max_depth=20)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [None]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=20, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [None]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("RMSE of sklearn: ", sklearn_rmse)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.23957197065958202
RMSE of sklearn:  0.27971263323518153
Runtime of implementation 118.94622135162354
Runtime of sklearn 0.39827799797058105


## Experiments on max features
**1. Increase max features to 2**

In [None]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=2, min_samples_split=2, max_depth=5)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [None]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=5, max_features=2, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [None]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("RMSE of sklearn: ", sklearn_rmse)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.23957197065958202
RMSE of sklearn:  0.23570879464546549
Runtime of implementation 118.94622135162354
Runtime of sklearn 0.03200674057006836


**2. Increase max features to 5**

In [None]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=5, min_samples_split=2, max_depth=5)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [None]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=5, max_features=5,random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [None]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("RMSE of sklearn: ", sklearn_rmse)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.23957197065958202
RMSE of sklearn:  0.2355863726400265
Runtime of implementation 118.94622135162354
Runtime of sklearn 0.04115128517150879


## Experiments on min samples splits
**1. Increase min_sample_split to 5**

In [None]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=5, max_depth=5)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [None]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=5, max_features=1, min_samples_split=5, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [None]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("RMSE of sklearn: ", sklearn_rmse)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.23957197065958202
RMSE of sklearn:  0.23670866714860314
Runtime of implementation 118.94622135162354
Runtime of sklearn 0.011986255645751953


**2. Increase min_sample_split to 10**

In [None]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=5, max_features=1, min_samples_split=10, max_depth=5)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [None]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=5, max_depth=5, max_features=1, min_samples_split=10, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [None]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("RMSE of sklearn: ", sklearn_rmse)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.23971056926151618
RMSE of sklearn:  0.2366636121308085
Runtime of implementation 142.0025818347931
Runtime of sklearn 0.009978532791137695


## Experiments on number of trees
**1. Increasing number of trees to 30**

In [None]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=30, max_features=1, min_samples_split=2, max_depth=5)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [None]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=30, max_depth=5, max_features=1, min_samples_split=2, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [None]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("RMSE of sklearn: ", sklearn_rmse)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.23897789683634196
RMSE of sklearn:  0.2364491907196206
Runtime of implementation 436.18719482421875
Runtime of sklearn 0.033196210861206055


**2. Increasing number of trees to 100**

In [None]:
start_time_implementation = time.time()
rf = RandomForestRegressor(n_trees=100, max_features=1, min_samples_split=2, max_depth=5)
rf.train(X_train, y_train)
y_pred = rf.predict(X_test)
end_time_implementation = time.time()

In [None]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=100, max_depth=5, max_features=1, min_samples_split=2, random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [None]:
implementation_rmse = root_mean_squared_error(y_test, y_pred)
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
print("RMSE of implementation: ", implementation_rmse)
print("RMSE of sklearn: ", sklearn_rmse)
implementation_runtime = end_time_implementation - start_time_implementation
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of implementation", implementation_runtime)
print("Runtime of sklearn", sklearn_runtime)

RMSE of implementation:  0.2388503304319085
RMSE of sklearn:  0.23623599489908745
Runtime of implementation 1459.9423739910126
Runtime of sklearn 0.09917593002319336


**Experiment on default values with sklearn RandomForestRegressor for comparison**

In [None]:
start_time_sk = time.time()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(random_state=RANDOM_SEED)
sklearn_rf.fit(X_train, y_train)
sk_y_pred = sklearn_rf.predict(X_test)
end_time_sk = time.time()

In [None]:
sklearn_rmse = root_mean_squared_error(y_test, sk_y_pred)
print("RMSE of sklearn: ", sklearn_rmse)
sklearn_runtime = end_time_sk - start_time_sk
print("Runtime of sklearn", sklearn_runtime)

RMSE of sklearn:  0.26191354302456943
Runtime of sklearn 4.385132074356079
