In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn_pandas.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_absolute_error

In [2]:
df = pd.read_json('prepped_data.json').sort_index()

In [3]:
df.head()

Unnamed: 0,age,num_comments,score,text,timestamp,weekday_posted,hour_posted,log_score,log_comments
0,300,1588,21720,China Killing Prisoners To Harvest Organs For ...,2019-06-19 11:49:08,2,11,9.985989,7.370237
1,240,402,2661,Muslim family dragged out of Belgian embassy i...,2019-06-19 12:05:31,2,12,7.886457,5.996477
2,660,3320,46977,Women outperform men after Japan medical schoo...,2019-06-19 05:51:44,2,5,10.757413,8.107723
3,360,202,1474,MH17 crash: Investigators 'to charge four with...,2019-06-19 10:50:51,2,10,7.295735,5.308317
4,660,1336,2665,Iranian official calls on world to unite again...,2019-06-19 05:09:15,2,5,7.887959,7.197443


In [4]:
X = df[['text','age','weekday_posted','hour_posted']]
y = df['log_score']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2019)


In [6]:
X_train.head()

Unnamed: 0,text,age,weekday_posted,hour_posted
4867,Donald Trump Jr. on Sunday claimed CNN is cove...,1440,0,4
1395,"Oxfam warns of the ""worst cholera outbreak in ...",1440,5,11
2819,Exposure to weed killing products increases ri...,1440,3,11
567,Hong Kong protesters demand China be held to a...,660,0,0
1444,Austrian Government Seeks to Eliminate Interne...,180,4,10


In [7]:
type(X_train.loc[4867,'text'])

str

In [8]:
tfidf = TfidfVectorizer()
ohe = OneHotEncoder(drop='first',categories='auto')
ss = StandardScaler()


### 'text' must not be in a list, tfidf is picky

In [9]:
mapper = DataFrameMapper([
    ('text', tfidf),
    (['age'], ss),
    (['weekday_posted', 'hour_posted'], ohe),
])

In [10]:
lr = LinearRegression()
rfr = RandomForestRegressor()
gbr = GradientBoostingRegressor()
knnr = KNeighborsRegressor()
regressors = [lr,rfr,gbr,knnr]

In [12]:
for regressor in regressors:
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {str(regressor)}'
        f'is:{mean_absolute_error(np.exp(y_test),preds)}')

The MAE of the LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)is:25093.8098509658




The MAE of the RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)is:6905.724171103162
The MAE of the GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='a

NOW RUN ON NUM COMMENTS

In [16]:
X = df[['text','age','weekday_posted','hour_posted']]
y2 = df['log_comments']

In [17]:
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, random_state=2019)

In [14]:
mapper = DataFrameMapper([
    ('text', tfidf),
    (['age'], ss),
    (['weekday_posted', 'hour_posted'], ohe),
])

In [None]:
for regressor in regressors:
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y2_train)
    preds2 = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {str(regressor)}'
        f'is:{mean_absolute_error(np.exp(y2_test),preds2)}')

The MAE of the LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)is:1461.8648358197213
The MAE of the RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)is:550.1898667897389
