In [None]:
import os
from subprocess import call
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error,
                             mean_squared_log_error, median_absolute_error)
from sklearn.preprocessing import StandardScaler
from dataset_reader import get_dataset

%load_ext autoreload
%autoreload 2

In [None]:
# Load dataset
# Fetch the frame from the project reader and discard its 'id' column in one step.
df = get_dataset().drop(columns='id')
row_count = df.shape[0]
column_names = list(df.columns)
print(f"Dataset size: {row_count} rows")
print(f"Columns: {column_names}")

In [None]:
# Split dataset for test and train sets
# Rows containing any missing value are removed before the split.
dataset_df = df.dropna()

# A fixed random_state makes the 80/20 split identical on every
# "Restart Kernel -> Run All", so results are comparable between runs.
train_df, test_df = train_test_split(dataset_df, train_size=0.8, random_state=42)
y_variable = "credibility_score"
train_x = train_df.drop(columns=y_variable)
test_x = test_df.drop(columns=y_variable)
# NOTE(review): astype(int) truncates any fractional part of the target —
# confirm that credibility_score is integer-valued.
train_y = train_df[y_variable].astype(int)
test_y = test_df[y_variable].astype(int)

print(f"Training data rows: {train_df.shape[0]}")
print(f"Test data rows: {test_df.shape[0]}")

In [None]:
# Train model
# Seeded so the fitted forest (and everything derived from it) is reproducible.
model = RandomForestRegressor(n_estimators=10, random_state=42)

# The scaler is fitted on the training features only; the test features are
# transformed with the statistics learned from the training set.
scaler = StandardScaler()
train_x_scaled = pd.DataFrame(
    scaler.fit_transform(train_x), columns=train_x.columns, index=train_x.index
)
# Wrap the test matrix in a DataFrame as well: the model is fitted with column
# names, and passing a bare ndarray at predict time makes scikit-learn warn that
# X has no valid feature names and skips its column-consistency check.
test_x_scaled = pd.DataFrame(
    scaler.transform(test_x), columns=test_x.columns, index=test_x.index
)

model.fit(train_x_scaled, train_y)
prediction = model.predict(test_x_scaled)

In [None]:
# Calculate errors
# Every metric compares the held-out targets with the model's test-set predictions.

test_y_np = test_y.to_numpy()
test_mse = mean_squared_error(test_y_np, prediction)
# NOTE: mean_squared_log_error raises ValueError if targets or predictions
# contain negative values.
test_msle = mean_squared_log_error(test_y_np, prediction)
test_mdae = median_absolute_error(test_y_np, prediction)
test_mae = mean_absolute_error(test_y_np, prediction)
test_mape = mean_absolute_percentage_error(test_y_np, prediction)
print(f"Mean squared error: {test_mse}")
print(f"Mean squared log error: {test_msle}")
print(f"Median absolute error: {test_mdae}")
print(f"Mean absolute error: {test_mae}")
# Colon added so this line matches the format of the other metric lines.
print(f"Mean absolute percentage error: {test_mape}")

In [None]:
# Feature importances
# Permutation importance on the test set: how much the model's score drops
# when the values of a single feature column are shuffled.

from sklearn.inspection import permutation_importance

feature_names = list(dataset_df.columns)
feature_names.remove(y_variable)

# random_state fixes the shuffles so the reported importances are reproducible.
perm_imp = permutation_importance(
    model, test_x_scaled, test_y, n_repeats=10, n_jobs=2, random_state=42
)

# Keep mean and std side by side so that sorting reorders both together;
# sorting only the means would leave the error bars in the unsorted feature
# order and attach them to the wrong bars.
importance_df = pd.DataFrame(
    {"mean": perm_imp.importances_mean, "std": perm_imp.importances_std},
    index=feature_names,
).sort_values("mean")
forest_importances = importance_df["mean"]
importances_std = importance_df["std"]

print("IMP:" + str(forest_importances))
print("STD: " + str(importances_std))

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=importances_std.to_numpy(), ax=ax)
ax.set_title("Feature importances")
# No `scoring` is passed, so the estimator's default scorer is used; for a
# regressor that is R^2, not accuracy.
ax.set_ylabel("Mean decrease in $R^2$")
fig.tight_layout()
plt.show()

In [None]:
# Save model
import pickle

# Set to True to overwrite the saved model and scaler on disk.
# Left False by default so a plain "Run All" never touches the saved artifacts.
SAVE_MODEL = False

if SAVE_MODEL:
    os.makedirs('model', exist_ok=True)
    # Context managers close the files even if pickling raises.
    with open('model/model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
    with open('model/scaler.pkl', 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)

In [None]:
# Visualize tree
# Exports one tree of the fitted forest to a .dot file, renders it to PNG with
# the Graphviz `dot` executable, and displays the PNG as the cell output.

estimator = model.estimators_[5]

tmp_dir = os.path.join(os.getcwd(), 'tmp')
# exist_ok=True replaces the separate isdir() check followed by mkdir().
os.makedirs(tmp_dir, exist_ok=True)
graph_file = os.path.join(tmp_dir, 'tree.dot')
png_file = os.path.join(tmp_dir, 'tree.png')

# Column names come straight from the training features, so this cell does not
# depend on the feature-importance cell having been run first.
export_graphviz(estimator, out_file=graph_file, feature_names=list(train_x.columns),
                class_names=None, rounded=True, proportion=False, precision=2, filled=True)

# call() returns dot's exit status. Fail here with a clear message instead of
# letting Image() load a missing or stale PNG from an earlier run.
exit_code = call(['dot', '-Tpng', graph_file, '-o', png_file, '-Gdpi=600'])
if exit_code != 0:
    raise RuntimeError(
        f"Graphviz 'dot' exited with status {exit_code}; {png_file} was not rendered"
    )

Image(filename=png_file)