In [None]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

fb_df = pd.read_csv('dataset_Facebook.csv', sep=';')#, nrows=200)

# fill NaN
fb_df['like'].fillna(0,inplace=True)
fb_df['share'].fillna(0,inplace=True)
fb_df['Paid'].fillna(0,inplace=True)
fb_df.drop(['Type'], inplace=True, axis=1)


# get column names
column_names = fb_df.columns

# get data/target names
f_n = column_names[:14]
t_n = column_names[14:]

X = fb_df[f_n]
y = fb_df[t_n]

# convert to numpy
X = X.to_numpy()
y = y.to_numpy()

scaler = MinMaxScaler(feature_range=(1,10))
y = scaler.fit_transform(y)

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=42)


max_ds = [2, 5, 10]
min_s_ls = [5, 10]
m_fs = ['sqrt', 'log2', 0.75, None]
ests = [100, 500, 1000]
total = (len(max_ds)*len(min_s_ls)*len(m_fs)*len(ests))
counter = 1
min_mae = 10**10 # just a ref, should be > than len(feature_names)
for max_d in max_ds:
  for min_s_l in min_s_ls:
    for m_f in m_fs:
      for est in ests:
        RFmodel = RandomForestRegressor(n_estimators=est, max_depth=max_d, min_samples_leaf=min_s_l, max_features=m_f, n_jobs=-1, random_state=42)
        RFmodel.fit(X_train, y_train)
        pred = RFmodel.predict(X_test)
        mae = mean_absolute_error(pred, y_test)
        if mae < min_mae:
          min_mae = mae
          best_par = {"test:":counter, "max_d=":max_d, "min_s_l=":min_s_l, "m_f=": m_f, "est=": est, "mae=": min_mae}
        print(counter, '/', total, "| max_depth=",max_d," | ", "min_sample_leaf=",min_s_l," | ", "max_f=", m_f," | ", "est=", est, "mae=", mae)
        #print(" ", pred[1])
        #print(" ", y_test[1])
        counter += 1

1 / 72 | max_depth= 2  |  min_sample_leaf= 5  |  max_f= sqrt  |  est= 100 mae= 0.13905740290611496
2 / 72 | max_depth= 2  |  min_sample_leaf= 5  |  max_f= sqrt  |  est= 500 mae= 0.13810806245505342
3 / 72 | max_depth= 2  |  min_sample_leaf= 5  |  max_f= sqrt  |  est= 1000 mae= 0.13869566626796206
4 / 72 | max_depth= 2  |  min_sample_leaf= 5  |  max_f= log2  |  est= 100 mae= 0.13905740290611496
5 / 72 | max_depth= 2  |  min_sample_leaf= 5  |  max_f= log2  |  est= 500 mae= 0.13810806245505342
6 / 72 | max_depth= 2  |  min_sample_leaf= 5  |  max_f= log2  |  est= 1000 mae= 0.13869566626796206
7 / 72 | max_depth= 2  |  min_sample_leaf= 5  |  max_f= 0.75  |  est= 100 mae= 0.14359264838544764
8 / 72 | max_depth= 2  |  min_sample_leaf= 5  |  max_f= 0.75  |  est= 500 mae= 0.13814665256727993
9 / 72 | max_depth= 2  |  min_sample_leaf= 5  |  max_f= 0.75  |  est= 1000 mae= 0.1383630585091286
10 / 72 | max_depth= 2  |  min_sample_leaf= 5  |  max_f= None  |  est= 100 mae= 0.14182974706703794
11 / 72

In [None]:
best_par

{'test:': 56,
 'max_d=': 10,
 'min_s_l=': 5,
 'm_f=': 0.75,
 'est=': 500,
 'mae=': 0.1008790132759901}