In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn_pandas.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_absolute_error

In [2]:
df = pd.read_json('prepped_data.json').sort_index()

In [3]:
df.head()

Unnamed: 0,age,num_comments,score,text,timestamp,weekday_posted,hour_posted,log_score,log_comments
0,300,1588,21720,China Killing Prisoners To Harvest Organs For ...,2019-06-19 11:49:08,2,11,9.985989,7.370237
1,240,402,2661,Muslim family dragged out of Belgian embassy i...,2019-06-19 12:05:31,2,12,7.886457,5.996477
2,660,3320,46977,Women outperform men after Japan medical schoo...,2019-06-19 05:51:44,2,5,10.757413,8.107723
3,360,202,1474,MH17 crash: Investigators 'to charge four with...,2019-06-19 10:50:51,2,10,7.295735,5.308317
4,660,1336,2665,Iranian official calls on world to unite again...,2019-06-19 05:09:15,2,5,7.887959,7.197443


In [4]:
X = df[['text','age','weekday_posted','hour_posted']]
y = df['log_score']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2019)


In [6]:
X_train.head()

Unnamed: 0,text,age,weekday_posted,hour_posted
4867,Donald Trump Jr. on Sunday claimed CNN is cove...,1440,0,4
1395,"Oxfam warns of the ""worst cholera outbreak in ...",1440,5,11
2819,Exposure to weed killing products increases ri...,1440,3,11
567,Hong Kong protesters demand China be held to a...,660,0,0
1444,Austrian Government Seeks to Eliminate Interne...,180,4,10


In [7]:
type(X_train.loc[4867,'text'])

str

In [8]:
tfidf = TfidfVectorizer()
ohe = OneHotEncoder(drop='first',categories='auto')
ss = StandardScaler()


### 'text' must not be in a list, tfidf is picky

### expectes a 1d array not a 2d array


In [9]:
mapper = DataFrameMapper([
    ('text', tfidf),
    (['age'], ss),
    (['weekday_posted', 'hour_posted'], ohe),
])

In [11]:
lr = LinearRegression()
rfr = RandomForestRegressor(n_estimators=100)
gbr = GradientBoostingRegressor(n_estimators=100)
knnr = KNeighborsRegressor()
regressors = [lr, rfr, gbr, knnr]

In [12]:
reg_names = ['LinearRegression',
             'RandomForestRegressor',
             'GradientBoostingRegressor',
             'KNeighborsRegressor'
            ]

In [13]:
LinearRegression()

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
for regressor,name in zip(regressors,reg_names): #zip returns a tuple for each pair it is given
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {name}'
          f' is: {mean_absolute_error(np.exp(y_test),preds)}\n'
          '==================================================================\n'
          '========================\n'
         )

The MAE of the LinearRegression is: 25093.8098509658

The MAE of the RandomForestRegressor is: 6787.4820627081235

The MAE of the GradientBoostingRegressor is: 6675.815934060484

The MAE of the KNeighborsRegressor is: 7045.638051747501



## NOW RUN ON NUM COMMENTS

In [15]:
X = df[['text','age','weekday_posted','hour_posted']]
y2 = df['log_comments']

In [16]:
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, random_state=2019)

In [17]:
mapper = DataFrameMapper([
    ('text', tfidf),
    (['age'], ss),
    (['weekday_posted', 'hour_posted'], ohe),
])

In [18]:
for regressor,name in zip(regressors,reg_names): #zip returns a tuple for each pair it is given
    pipe = Pipeline(steps=[
        ('transform',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y2_train)
    preds = np.exp(pipe.predict(X_test))
    print(f'The MAE of the {name}'
          f' is: {mean_absolute_error(np.exp(y_test),preds)}\n'
        '==================================================================\n'
        '========================\n'
         )

The MAE of the LinearRegression is: 7819.489747955672

The MAE of the RandomForestRegressor is: 7595.623108085903

The MAE of the GradientBoostingRegressor is: 7613.9012763634255

The MAE of the KNeighborsRegressor is: 7592.415851240796

