In [1]:
import pandas as pd
import os
from subprocess import call
from IPython.display import Image
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error,
                             mean_squared_log_error, median_absolute_error)
from sklearn.preprocessing import StandardScaler
from dataset_reader import get_validated_dataset

In [2]:
# Load dataset
df = get_validated_dataset()
print(f"Dataset size: {df.shape[0]} rows")
print(f"Columns: {list(df.columns)}")

Filtered out 268 profiles
Dataset size: 5130 rows
Columns: ['verified', 'name_len', 'followers_count', 'following_count', 'tweet_count', 'listed_count', 'has_url', 'created_at', 'desc_len', 'desc_words', 'tweet.length_mean', 'tweet.words_mean', 'tweet.retweets_mean', 'tweet.replies_mean', 'tweet.likes_mean', 'tweet.quotes_mean', 'tweet.mentions_mean', 'tweet.retweets_percentage']


In [3]:
# Split dataset for test and train sets
dataset_df = df.dropna()

train_df, test_df = train_test_split(dataset_df, train_size=0.75)
y_variable = "followers_count"
train_x = train_df.drop(y_variable, axis=1)
test_x = test_df.drop(y_variable, axis=1)
train_y = train_df[y_variable].astype(int)
test_y = test_df[y_variable].astype(int)

print(f"Training data rows: {train_df.shape[0]}")
print(f"Test data rows: {test_df.shape[0]}")

Training data rows: 3847
Test data rows: 1283


In [4]:
# Train model
model = RandomForestClassifier(n_estimators=10)

scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)
test_x_scaled = scaler.transform(test_x)

model.fit(train_x_scaled, train_y)
prediction = model.predict(test_x_scaled)
test_y_np = test_y.to_numpy()
test_mse = mean_squared_error(test_y_np, prediction)
test_msle = mean_squared_log_error(test_y_np, prediction)
test_mdae = median_absolute_error(test_y_np, prediction)
test_mae = mean_absolute_error(test_y_np, prediction)
test_mape = mean_absolute_percentage_error(test_y_np, prediction)

print(f"Mean squared error: {test_mse}")
print(f"Mean squared log error: {test_msle}")
print(f"Median absolute error: {test_mdae}")
print(f"Mean absolute error: {test_mae}")
print(f"Mean absolute percentage error {test_mape}")

Mean squared error: 6909703276667.244
Mean squared log error: 6.2909361638811205
Median absolute error: 8738.0
Mean absolute error: 468052.2416212003
Mean absolute percentage error 1.7157499472266353
