# Import required libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Import data

In [2]:
# Load the pre-embedded dataset; the first CSV column is used as the row index.
# NOTE(review): the recorded df.head() output still lists an 'Unnamed: 0' column.
# If that is a leftover row id it is being fed to the models as a feature --
# confirm against the CSV and drop it if so.
df = pd.read_csv("final_embedded.csv", index_col=0)

# One-hot encode the subreddit label held in 'target'. The prefix gives every
# dummy column a string name ('subreddit_<label>'); joining a bare
# pd.get_dummies(df['target']) would add integer column labels next to the
# string ones, and recent scikit-learn versions refuse DataFrames whose feature
# names have mixed types. The encoded columns are appended after the remaining
# columns and 'target' itself is removed, as before.
df = pd.get_dummies(df, columns=['target'], prefix='subreddit', dtype=int)
df.head()

Unnamed: 0,score,num_comments,sin_hour,cos_hour,cos_month,sin_month,cos_weekday,sin_weekday,emb_text_0,emb_text_1,...,0,1,2,3,4,5,6,7,8,9
0,1,12,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,0.010188,0.002035,...,0,0,0,0,0,0,0,0,1,0
1,2,5,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,-0.006287,-0.003861,...,0,0,0,0,0,0,0,0,1,0
2,1,1,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,-0.008691,-0.00115,...,0,0,0,0,0,0,0,0,1,0
3,4,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,0.007517,-0.002655,...,0,0,0,0,0,0,0,0,1,0
4,0,9,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,0.003354,-0.014315,...,0,0,0,0,0,0,0,0,1,0


# Split into training and test sets

In [3]:
# 'score' is the regression target; every other column is used as a feature.
y = df['score']
X = df.drop(columns=['score'])

# Hold out a test set with the default split proportions; the fixed seed keeps
# the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Regression

### LinearRegression

In [4]:
%%time
pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('LinearRegression', LinearRegression())
    ]
)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("MSE", mean_squared_error(y_pred, y_test))
print("R^2", r2_score(y_pred, y_test))

MSE 55748.138301878964
R^2 0.5284166972350762
Wall time: 418 ms


### RandomForestRegressor

In [5]:
%%time
pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('RandomForestRegressor', RandomForestRegressor())
    ]
)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("MSE", mean_squared_error(y_pred, y_test))
print("R^2", r2_score(y_pred, y_test))

MSE 68523.05217399725
R^2 0.5833655542170963
Wall time: 10min 21s


### GradientBoostingRegressor

In [6]:
%%time
pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('GradientBoostingRegressor', GradientBoostingRegressor())
    ]
)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("MSE", mean_squared_error(y_pred, y_test))
print("R^2", r2_score(y_pred, y_test))

MSE 72793.40390771808
R^2 0.5140536772712546
Wall time: 2min 1s
