In [None]:
import os
from subprocess import call
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error,
                             mean_squared_log_error, median_absolute_error)
from dataset_reader import get_dataset

%load_ext autoreload
%autoreload 2

In [None]:
# Load the raw dataset via the project reader and report its size.
df = get_dataset()
print(f"Dataset size: {df.shape[0]} rows")

# The 'id' column is dropped before the remaining columns are listed.
df = df.drop(columns=['id'])
print(f"Columns: {list(df.columns)}")

In [None]:
# Split dataset for test and train sets.
# Rows containing any missing value are dropped first; `df` itself is not modified.
dataset_df = df.dropna()
y_variable = "credibility_score"  # target column
bin_count = 4                     # number of equal-width target bins used to stratify; <= 1 disables stratification
random_state = 2                  # shared seed for the split and the model
all_features = True               # False restricts the inputs to the hand-picked `features` list below

if bin_count and bin_count > 1:
    # Stratify on a binned copy of the continuous target so train and test
    # cover the target range in similar proportions.
    y_values = dataset_df[y_variable]
    min_y = np.amin(y_values)
    max_y = np.amax(y_values)
    # `bin_count` equal-width bins need `bin_count + 1` edges.
    bins_y = np.linspace(start=min_y, stop=max_y, num=bin_count + 1)
    print(f"Bins: {bins_y}")
    # Digitize against the interior edges only: labels are then 0 .. bin_count - 1
    # and the minimum value falls into the first bin together with its
    # neighbours instead of forming a class of its own (which `stratify`
    # rejects when that class has a single member).
    binned_y = np.digitize(y_values, bins_y[1:-1], right=True)
    train_df, test_df = train_test_split(dataset_df, train_size=0.75, stratify=binned_y, random_state=random_state)
else:
    train_df, test_df = train_test_split(dataset_df, train_size=0.75, random_state=random_state)

if all_features:
    # Every column except the target is used as a model input.
    train_x = train_df.drop(y_variable, axis=1)
    test_x = test_df.drop(y_variable, axis=1)
else:
    features = ['name_len', 'tweet_count', 'created_at', 'desc_len', 'desc_words',
                'tweet.length', 'tweet.words', 'tweet.likes', 'tweet.mentions',
                'tweet.sentiment.neg']
    train_x = train_df[features]
    test_x = test_df[features]

train_y = train_df[y_variable].astype(float)
test_y = test_df[y_variable].astype(float)

print(f"Training data rows: {train_df.shape[0]}")
print(f"Test data rows: {test_df.shape[0]}")

In [None]:
# Train model
model = RandomForestRegressor(n_estimators=10, min_samples_leaf=3, random_state=random_state)
#print(model.get_params())

validate = True             # run leave-one-out and 5-fold validation below
randomize_loo_pred = False  # True replaces the model prediction with a random baseline draw

if validate:
    from sklearn.model_selection import cross_val_score, LeaveOneOut

    # Target statistics over the full (NaN-free) dataset; they parameterize
    # the random baseline and give context for the error magnitudes.
    y_np = dataset_df[y_variable].astype(float).to_numpy()
    y_std = np.std(y_np)
    y_mean = np.mean(y_np)
    print("Score STD:", y_std)
    print("Score mean:", y_mean)

    # Leave-one-out: fit on all rows but one and score the single held-out row.
    loo_df = dataset_df
    # Use the same input columns as the train/test split so that the
    # `all_features` choice also applies to this validation.
    loo_x = loo_df[list(train_x.columns)]
    loo_y = loo_df[y_variable].astype(float)
    # Seeded generator so the random baseline is reproducible between runs.
    loo_rng = np.random.default_rng(random_state)
    errors = []
    for train_indices, test_index in LeaveOneOut().split(loo_df):
        if randomize_loo_pred:
            # Baseline: draw from a normal fitted to the target; no model fit needed.
            prediction = loo_rng.normal(y_mean, y_std)
        else:
            model.fit(loo_x.iloc[train_indices], loo_y.iloc[train_indices])
            # Predict the held-out row (not a training row), otherwise the
            # error would compare an unrelated prediction with the target.
            prediction = model.predict(loo_x.iloc[test_index])[0]
        expected = loo_y.iloc[test_index[0]]
        errors.append(abs(prediction - expected))

    mean_error = np.mean(errors)
    print(f"LOO Mean error: {mean_error:.4f}")
    print(f"LOO Max error: {np.amax(errors):.4f}")

    # 5-fold CV on the training split. The scorer returns negated MAE, so take
    # the absolute value to report plain per-fold errors.
    scores = np.abs(cross_val_score(model, train_x, train_y, scoring="neg_mean_absolute_error", cv=5))
    avg_error = np.mean(scores)
    print(f"Kfold mean error: {avg_error:.4f}")
    print(f"Kfold Max error: {np.amax(scores):.4f}")

In [None]:
# from sklearn.feature_selection import SelectFromModel
#sel = SelectFromModel(model)
#model = sel.fit(train_x, train_y)
#selected_feat = train_x.columns[(model.get_support())]
#print(selected_feat)
#print(train_x.columns)

In [None]:
# Fit on the training split, predict the test split, and report error metrics.
model.fit(train_x, train_y)
prediction = model.predict(test_x)
test_y_np = test_y.to_numpy()

# All metrics are computed before anything is printed.
test_mae = mean_absolute_error(test_y_np, prediction)
test_mdae = median_absolute_error(test_y_np, prediction)
test_mse = mean_squared_error(test_y_np, prediction)
test_msle = mean_squared_log_error(test_y_np, prediction)
test_mape = mean_absolute_percentage_error(test_y_np, prediction)

# One line per metric, emitted in a single print call.
print("\n".join([
    f"Mean absolute error: {test_mae:.4f}",
    f"Median absolute error: {test_mdae:.4f}",
    f"Mean squared error: {test_mse:.4f}",
    f"Mean squared log error: {test_msle:.4f}",
    f"Mean absolute percentage error {test_mape:.4f}",
]))

In [None]:
# Feature importances (permutation importance measured on the test split)

# Take the names from the frame that is actually passed to
# permutation_importance, so they stay correct when only a subset of the
# columns is used as model input.
feature_names = list(test_x.columns)

# Alternative kept for reference: impurity-based importances of a single tree.
# estimator = model.estimators_[5]
# def get_feature_importances(estimator):
#     importances = []
#     for i, feature_name in enumerate(feature_names):
#         importances.append(
#             {"importance": estimator.feature_importances_[i], "feature_name": feature_name})
#     importances = sorted(importances, key=lambda x: x["importance"], reverse=True)
#     return importances

# importances = get_feature_importances(estimator)
# importance_msg = "\n".join(f"{i['feature_name']}: {i['importance']:.3f}" for i in importances)
# print(importance_msg)

from sklearn.inspection import permutation_importance

# Seeded so the shuffles, and therefore the reported importances, are reproducible.
perm_imp = permutation_importance(
    model, test_x, test_y, n_repeats=10, n_jobs=2, random_state=random_state
)
forest_importances = pd.Series(perm_imp.importances_mean, index=feature_names).sort_values()
# Label the standard deviations and put them in the same (sorted) order as the
# means; otherwise each error bar would be attached to the wrong feature.
importances_std = pd.Series(perm_imp.importances_std, index=feature_names).reindex(forest_importances.index)
print("IMP:" + str(forest_importances))
print("STD: " + str(importances_std))
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=importances_std, ax=ax)
ax.set_title("Feature importances")
# No `scoring` is passed, so the drop is measured with the estimator's default
# score, which is R² for a regressor (not accuracy).
ax.set_ylabel("Mean decrease in R² score")
fig.tight_layout()
plt.show()

In [None]:
# Save model
import pickle

# Running this cell overwrites any previously saved model/model.pkl with the
# currently fitted `model`; skip the cell to keep the existing file.
os.makedirs('model', exist_ok=True)  # make sure the target directory exists
# The context manager closes (and flushes) the file even if pickling fails.
with open('model/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Visualize one tree of the fitted forest by rendering it with Graphviz.

# Tree 5 is shown by default; clamp the index so a smaller forest still works.
tree_index = min(5, len(model.estimators_) - 1)
estimator = model.estimators_[tree_index]

tmp_dir = os.path.join(os.getcwd(), 'tmp')
os.makedirs(tmp_dir, exist_ok=True)
graph_file = os.path.join(tmp_dir, 'tree.dot')
png_file = os.path.join(tmp_dir, 'tree.png')

# Name the nodes after the columns the model was fitted on, so the labels
# cannot drift from the training inputs.
export_graphviz(estimator, out_file=graph_file, feature_names=list(train_x.columns),
                class_names=None, rounded=True, proportion=False, precision=2, filled=True)

# Requires the Graphviz `dot` executable on PATH.
dot_status = call(['dot', '-Tpng', graph_file, '-o', png_file, '-Gdpi=600'])
if dot_status != 0:
    # Fail loudly instead of displaying a missing or stale image from an earlier run.
    raise RuntimeError(f"Graphviz 'dot' exited with status {dot_status}; {png_file} was not rendered")

Image(filename=png_file)