In [1]:
import sys
import sqlite3
import pandas as pd
import numpy as np

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
import joblib

In [29]:
db = "../../links.db"
output_model = 'model.pkl'

In [7]:
sys.path.append('..')

In [8]:
conn = sqlite3.connect(db)

In [9]:
df = pd.read_sql("SELECT * FROM FEATURES WHERE APPEARENCES is not null", conn)

In [10]:
df.head()

Unnamed: 0,LINK,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,APPEARENCES
0,https://www.google.com.br,25.0,4.0,21.0,5.25,17.0,0.68,97.823529,0.0,0.0,0.0,0
1,https://www.google.com.br/webhp,31.0,5.0,26.0,5.2,17.0,0.548387,97.823529,1.0,0.051112,0.0,3
2,https://maps.google.com.br/maps,31.0,6.0,25.0,4.166667,18.0,0.580645,96.611111,1.0,0.041403,0.0,5
3,https://play.google.com/,24.0,5.0,19.0,3.8,15.0,0.625,99.066667,1.0,0.0,0.0,5
4,https://www.youtube.com/,24.0,5.0,19.0,3.8,15.0,0.625,103.266667,1.0,0.0,0.0,6


In [11]:
df.dtypes

LINK            object
F1             float64
F2             float64
F3             float64
F4             float64
F5             float64
F6             float64
F7             float64
F8             float64
F9             float64
F10            float64
APPEARENCES      int64
dtype: object

In [12]:
df.count()

LINK           8325
F1             8325
F2             8325
F3             8325
F4             8325
F5             8325
F6             8325
F7             8325
F8             8325
F9             8325
F10            8325
APPEARENCES    8325
dtype: int64

In [13]:
df_info = df.drop(df.columns[[0]], axis=1)

In [14]:
df_info.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,APPEARENCES
0,25.0,4.0,21.0,5.25,17.0,0.68,97.823529,0.0,0.0,0.0,0
1,31.0,5.0,26.0,5.2,17.0,0.548387,97.823529,1.0,0.051112,0.0,3
2,31.0,6.0,25.0,4.166667,18.0,0.580645,96.611111,1.0,0.041403,0.0,5
3,24.0,5.0,19.0,3.8,15.0,0.625,99.066667,1.0,0.0,0.0,5
4,24.0,5.0,19.0,3.8,15.0,0.625,103.266667,1.0,0.0,0.0,6


In [15]:
df_target = df_info['APPEARENCES']

In [16]:
df_features = df_info.drop(df.columns[[-1]], axis=1)

In [17]:
df_features.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10
0,25.0,4.0,21.0,5.25,17.0,0.68,97.823529,0.0,0.0,0.0
1,31.0,5.0,26.0,5.2,17.0,0.548387,97.823529,1.0,0.051112,0.0
2,31.0,6.0,25.0,4.166667,18.0,0.580645,96.611111,1.0,0.041403,0.0
3,24.0,5.0,19.0,3.8,15.0,0.625,99.066667,1.0,0.0,0.0
4,24.0,5.0,19.0,3.8,15.0,0.625,103.266667,1.0,0.0,0.0


In [18]:
X_train, X_test, y_train, y_test = train_test_split(df_features, df_target, test_size=0.3, random_state=42, shuffle=True)

In [19]:
rfr = RandomForestRegressor(n_estimators = 100, max_depth=10)  

In [20]:
rfr.fit(X_train, y_train)

RandomForestRegressor(max_depth=10)

In [21]:
y_pred = rfr.predict(X_test)

In [22]:
meanSquaredError = metrics.mean_squared_error(y_test, y_pred)

In [23]:
meanSquaredError

52.53873163403363

In [24]:
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
#sn.heatmap(confusion_matrix, annot=True)

In [25]:
#plt.show()

In [26]:
featureImportances = pd.Series(rfr.feature_importances_).sort_values(ascending=False)
print(featureImportances)

7    0.356151
6    0.268568
3    0.087266
4    0.084585
2    0.068858
5    0.041655
8    0.027713
1    0.026666
0    0.025919
9    0.012617
dtype: float64


In [27]:
joblib.dump(rfr, output_model)

['model.pkl']