In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [3]:
articles = pd.read_csv('../data/processed/articles_final_features.csv')

In [26]:
articles.head()

Unnamed: 0,HEADLINE,HEADLINE_LEN,PUBLISH_DATE,DAY_OF_WEEK,PUBLISH_MONTH,PUBLISHTIMEINSECONDS,WEEKDAY,HOLIDAY,BUREAU,ARTICLE_TYPE,PROMO,CUM_SESSIONS_SOFAR,HEADLINE_GROUP,PROMO_GROUP,SP_INDEX,gtrends_bureau_index,baseline,baseline_diff
0,2 under-the-radar tech stocks to buy in 2022,45,2022-04-12,1,4,22740,Weekday,No,technology-and-telecom,article,These two companies an enjoying explosive top-...,207,0,3,4397.45,18.0,7086.222304,-6879.222304
1,does it matter that gamestop's split will be a...,63,2022-04-12,1,4,32400,Weekday,No,consumer-goods,article,Just how different is it from a cash dividend?,5811,2,1,4397.45,63.0,7086.222304,-1275.222304
2,is amazon stock a buy this month?,34,2022-04-12,1,4,38220,Weekday,No,consumer-goods,article,There are strong reasons to invest in Amazon r...,1637,5,2,4397.45,63.0,7086.222304,-5449.222304
3,2 stocks that cut you a check each month,41,2022-04-12,1,4,33420,Weekday,No,industrials,article,"For some investors, dividend income that flows...",2037,0,6,4397.45,0.0,7086.222304,-5049.222304
4,is amazon or alphabet the better stock split i...,57,2022-04-12,1,4,62220,Weekday,No,technology-and-telecom,article,Both companies dominate much of our digital li...,7682,2,1,4397.45,18.0,7086.222304,595.777696


In [27]:
articles.drop(columns='Unnamed: 0', inplace=True)

KeyError: "['Unnamed: 0'] not found in axis"

In [28]:
articles.head(5)

Unnamed: 0,HEADLINE,HEADLINE_LEN,PUBLISH_DATE,DAY_OF_WEEK,PUBLISH_MONTH,PUBLISHTIMEINSECONDS,WEEKDAY,HOLIDAY,BUREAU,ARTICLE_TYPE,PROMO,CUM_SESSIONS_SOFAR,HEADLINE_GROUP,PROMO_GROUP,SP_INDEX,gtrends_bureau_index,baseline,baseline_diff
0,2 under-the-radar tech stocks to buy in 2022,45,2022-04-12,1,4,22740,Weekday,No,technology-and-telecom,article,These two companies an enjoying explosive top-...,207,0,3,4397.45,18.0,7086.222304,-6879.222304
1,does it matter that gamestop's split will be a...,63,2022-04-12,1,4,32400,Weekday,No,consumer-goods,article,Just how different is it from a cash dividend?,5811,2,1,4397.45,63.0,7086.222304,-1275.222304
2,is amazon stock a buy this month?,34,2022-04-12,1,4,38220,Weekday,No,consumer-goods,article,There are strong reasons to invest in Amazon r...,1637,5,2,4397.45,63.0,7086.222304,-5449.222304
3,2 stocks that cut you a check each month,41,2022-04-12,1,4,33420,Weekday,No,industrials,article,"For some investors, dividend income that flows...",2037,0,6,4397.45,0.0,7086.222304,-5049.222304
4,is amazon or alphabet the better stock split i...,57,2022-04-12,1,4,62220,Weekday,No,technology-and-telecom,article,Both companies dominate much of our digital li...,7682,2,1,4397.45,18.0,7086.222304,595.777696


### What is the Baseline?

In [29]:
articles['CUM_SESSIONS_SOFAR'].mean()

7086.222303995844

In [30]:
articles['baseline'] = articles['CUM_SESSIONS_SOFAR'].mean()

In [31]:
articles['baseline_diff'] = articles['CUM_SESSIONS_SOFAR'] - articles['baseline'] 

In [32]:
np.mean(abs(articles['baseline_diff']))

7617.685621018552

### Set X & y

In [33]:
X = articles[[           
'HEADLINE_LEN',          
'DAY_OF_WEEK',         
'PUBLISH_MONTH',       
'PUBLISHTIMEINSECONDS',
'WEEKDAY',             
'HOLIDAY',             
'BUREAU',              
'ARTICLE_TYPE',                       
'HEADLINE_GROUP',      
'PROMO_GROUP',                       
'SP_INDEX',                       
'gtrends_bureau_index'                                             
                                   ]]

In [34]:
y = articles['CUM_SESSIONS_SOFAR']

### Train, Test, Split

In [52]:
from sklearn.model_selection import train_test_split

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=X['BUREAU'])

### One Hot Encode Categorical Variables:
- WEEKDAY
- HOLIDAY
- BUREAU
- ARTICLE TYPE

In [37]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [38]:
# create encoder with a column transformer
smart_encoder = make_column_transformer(
    (OneHotEncoder(drop = 'if_binary'), ['WEEKDAY', 'HOLIDAY', 'BUREAU', 'ARTICLE_TYPE']), remainder='passthrough'
                       )                     

In [39]:
# fit and transform on X train
X_train_ohe = smart_encoder.fit_transform(X_train)

In [40]:
X_train_ohe

array([[0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 6.00000e+00,
        4.41013e+03, 1.90000e+01],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        3.36047e+03, 1.80000e+01],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 5.00000e+00,
        3.53422e+03, 3.10000e+01],
       ...,
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        3.37285e+03, 4.20000e+01],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.44344e+03, 2.10000e+01],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        4.39794e+03, 4.20000e+01]])

In [41]:
# save as df
pd.DataFrame(X_train_ohe, columns=smart_encoder.get_feature_names())

Unnamed: 0,onehotencoder__x0_Weekend,onehotencoder__x1_Yes,onehotencoder__x2_cannabis,onehotencoder__x2_consumer-goods,onehotencoder__x2_cryptocurrency,onehotencoder__x2_energy-materials-and-utilities,onehotencoder__x2_financials,onehotencoder__x2_health-care,onehotencoder__x2_industrials,onehotencoder__x2_investment-planning,...,onehotencoder__x3_transcript,onehotencoder__x3_video,HEADLINE_LEN,DAY_OF_WEEK,PUBLISH_MONTH,PUBLISHTIMEINSECONDS,HEADLINE_GROUP,PROMO_GROUP,SP_INDEX,gtrends_bureau_index
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,73.0,0.0,1.0,33300.0,0.0,6.0,4410.13,19.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,45.0,0.0,8.0,54780.0,2.0,1.0,3360.47,18.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,76.0,0.0,10.0,65640.0,3.0,5.0,3534.22,31.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,58.0,2.0,2.0,41340.0,3.0,3.0,4225.50,20.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,33.0,5.0,1.0,32100.0,3.0,1.0,4662.85,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,62.0,4.0,10.0,25200.0,3.0,1.0,3477.13,28.0
17320,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,53.0,3.0,2.0,5400.0,9.0,4.0,4504.08,44.0
17321,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,40.0,5.0,8.0,32400.0,0.0,1.0,3372.85,42.0
17322,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,60.0,2.0,11.0,48600.0,1.0,0.0,3443.44,21.0


In [42]:
# only transform X Test
X_test_ohe = smart_encoder.transform(X_test)

In [43]:
articles.head()

Unnamed: 0,HEADLINE,HEADLINE_LEN,PUBLISH_DATE,DAY_OF_WEEK,PUBLISH_MONTH,PUBLISHTIMEINSECONDS,WEEKDAY,HOLIDAY,BUREAU,ARTICLE_TYPE,PROMO,CUM_SESSIONS_SOFAR,HEADLINE_GROUP,PROMO_GROUP,SP_INDEX,gtrends_bureau_index,baseline,baseline_diff
0,2 under-the-radar tech stocks to buy in 2022,45,2022-04-12,1,4,22740,Weekday,No,technology-and-telecom,article,These two companies an enjoying explosive top-...,207,0,3,4397.45,18.0,7086.222304,-6879.222304
1,does it matter that gamestop's split will be a...,63,2022-04-12,1,4,32400,Weekday,No,consumer-goods,article,Just how different is it from a cash dividend?,5811,2,1,4397.45,63.0,7086.222304,-1275.222304
2,is amazon stock a buy this month?,34,2022-04-12,1,4,38220,Weekday,No,consumer-goods,article,There are strong reasons to invest in Amazon r...,1637,5,2,4397.45,63.0,7086.222304,-5449.222304
3,2 stocks that cut you a check each month,41,2022-04-12,1,4,33420,Weekday,No,industrials,article,"For some investors, dividend income that flows...",2037,0,6,4397.45,0.0,7086.222304,-5049.222304
4,is amazon or alphabet the better stock split i...,57,2022-04-12,1,4,62220,Weekday,No,technology-and-telecom,article,Both companies dominate much of our digital li...,7682,2,1,4397.45,18.0,7086.222304,595.777696


### Linear Regression

In [44]:
from sklearn.linear_model import LinearRegression

In [45]:
# instantiate
lr = LinearRegression()

In [46]:
# fit 
lr.fit(X_train_ohe, y_train)

LinearRegression()

### Feature Importance

In [47]:
lr.coef_  ### how to map these to the features themselves??

array([ 3.77167211e+03,  5.70546974e+03,  7.17299350e+02, -2.42567933e+03,
        3.67577552e+03, -8.43647130e+02, -9.31581043e+02,  1.50440825e+03,
        1.02297951e+03,  2.72421081e+03, -3.24124259e+03, -2.59339023e+03,
        3.90866885e+02,  5.16512480e+01,  3.19624270e+03, -2.76431052e+03,
        8.65234810e+03, -6.01450169e+03, -3.12142983e+03,  4.48869285e+01,
       -1.87599094e+02, -2.80846570e+02, -1.98126136e-02, -5.71298445e+01,
        2.18185919e+01, -5.35613953e+00,  4.66947673e+00])

In [50]:

importance_df = pd.DataFrame({'importance':lr.coef_,
                              'features':smart_encoder.get_feature_names()})
#importance_df.sort_values(by = dtree.feature_importances_, ascending = False)
importance_df

Unnamed: 0,importance,features
0,3771.672107,onehotencoder__x0_Weekend
1,5705.46974,onehotencoder__x1_Yes
2,717.29935,onehotencoder__x2_cannabis
3,-2425.679328,onehotencoder__x2_consumer-goods
4,3675.775516,onehotencoder__x2_cryptocurrency
5,-843.64713,onehotencoder__x2_energy-materials-and-utilities
6,-931.581043,onehotencoder__x2_financials
7,1504.408254,onehotencoder__x2_health-care
8,1022.97951,onehotencoder__x2_industrials
9,2724.21081,onehotencoder__x2_investment-planning


In [55]:
lr.intercept_

27186.303331491297

### Scoring

In [58]:
from sklearn.model_selection import cross_val_score

In [71]:
cross_val_score(lr, X_train_ohe, y_train, scoring='neg_mean_squared_error')
# cv controls the amount of folds
# this is returning the MSE- ignore the negative. 

array([-1.70256327e+08, -1.73747147e+08, -1.90860082e+08, -1.76389425e+08,
       -1.62910615e+08])

In [69]:
np.mean(cross_val_score(lr, X_train_ohe, y_train, scoring='neg_mean_squared_error'))

-174832719.05332547

In [70]:
np.sqrt(abs(np.mean(cross_val_score(lr, X_train_ohe, y_train, scoring='neg_mean_squared_error'))))

13222.43241817955

In [None]:
# yikes, off by 13k sessions on average! way worse than the baseline of 7k. 

In [57]:
preds = lr.predict(X_test_ohe)

In [58]:
preds

array([ 5551.03819275,  2725.80954924,  4773.08244803, ...,
       11831.0633931 ,  2012.9118786 , 12073.70851562])

In [None]:
# feataure permutation importance- shuffles features and helps figure out which features matter 
# and which don't
# sklearn has a feature importance library
# partial dependence can also help you evaluate your features

In [60]:
X_test['actual_sessions'] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['actual_sessions'] = y_test


In [62]:
X_test['predicted_Sessions'] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['predicted_Sessions'] = preds


In [63]:
X_test

Unnamed: 0,HEADLINE_LEN,DAY_OF_WEEK,PUBLISH_MONTH,PUBLISHTIMEINSECONDS,WEEKDAY,HOLIDAY,BUREAU,ARTICLE_TYPE,HEADLINE_GROUP,PROMO_GROUP,SP_INDEX,gtrends_bureau_index,actual_sessions,predicted_Sessions
8439,53,0,9,61140,Weekday,No,technology-and-telecom,news brief,3,5,3351.60,24.0,794,5551.038193
6777,58,3,8,68400,Weekday,No,health-care,transcript,7,0,3373.43,17.0,586,2725.809549
11191,47,4,9,59640,Weekday,No,cannabis,news brief,3,1,3340.97,6.0,5221,4773.082448
3157,51,2,11,21060,Weekday,No,health-care,article,0,1,3443.44,32.0,28030,12022.120272
6971,82,6,8,29220,Weekend,No,health-care,article,3,6,3508.01,21.0,9305,16656.252824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12266,49,1,10,60060,Weekday,No,industrials,10% promise series,2,1,3360.95,49.0,4702,8409.029323
6770,75,0,8,82800,Weekday,No,health-care,transcript,7,0,3360.47,17.0,109,3835.798547
14234,44,1,1,21600,Weekday,No,cryptocurrency,article,3,1,4356.45,38.0,3877,11831.063393
10708,39,2,9,36060,Weekday,No,consumer-goods,news brief,3,1,3398.96,51.0,597,2012.911879


### Try again, with fewer features

In [53]:
X2 = articles[[           
'HEADLINE_LEN',          
'DAY_OF_WEEK',         
'PUBLISH_MONTH',       
'PUBLISHTIMEINSECONDS',
'WEEKDAY',             
'HOLIDAY',             
'BUREAU',              
'ARTICLE_TYPE',                       
'HEADLINE_GROUP',      
'PROMO_GROUP'                                                                                      
                                   ]]
y = articles['CUM_SESSIONS_SOFAR']

In [54]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=42, stratify=X['BUREAU'])

In [55]:
X_train_ohe2 = smart_encoder.fit_transform(X_train2)
X_test_ohe2 = smart_encoder.transform(X_test)

In [56]:
# instantiate
lr2 = LinearRegression()

In [57]:
lr2.fit(X_train_ohe2, y_train2)

LinearRegression()

In [59]:
cross_val_score(lr2, X_train_ohe2, y_train2, scoring='neg_mean_squared_error')

array([-1.70256327e+08, -1.73747147e+08, -1.90860082e+08, -1.76389425e+08,
       -1.62910615e+08])

In [60]:
np.sqrt(abs(np.mean(cross_val_score(lr2, X_train_ohe2, y_train2, scoring='neg_mean_squared_error'))))

13222.43241817955

- boo, still off by 13k, which is worse than baseline!
- almost the same score as with stock market index and gtrends data

### Try to build a pipeline

In [62]:
from sklearn.pipeline import Pipeline

In [66]:
lr_pipe= Pipeline([('smartencoder', smart_encoder), #first, create poly features
                    ('model', lr)]) #build model on poly features

# this now acts like other estimators

In [73]:
np.mean(np.sqrt(abs((cross_val_score(lr_pipe, X_train2, y_train2, scoring='neg_mean_squared_error')))))

13217.91463648642

### Try KNN Regressor

In [87]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error 


In [77]:
knnr = KNeighborsRegressor()

In [79]:
sscaler = StandardScaler

In [81]:
sscaler = StandardScaler()
X_train_scaled = sscaler.fit_transform(X_train_ohe)
X_test_scaled =sscaler.transform(X_test_ohe)

In [82]:
knnr.fit(X_train_scaled, y_train)

KNeighborsRegressor()

In [84]:
preds = knnr.predict(X_test_scaled) #make prediction on test set

In [89]:
error = np.sqrt(mean_squared_error(y_test,preds))

In [90]:
error 

13047.353250921458

- still off by 13k sessions.
- I'm starting to think we might need better data for this model, or maybe to look at it as a time series and predict sessions by day_X

### Next Steps
- I think this might work better as a time series, but I would completely need to restructure my data set to be one record per article and day, instead of just one record per article. 
- We are also missing chunks of data in the data set, due to a data engineering error a few weeks ago, will re run this when the data have been backfilled
- I'm thinkg I should only include articles that have 'baked' 30 days or something- that might be affecting the results since some of these articles didn't have full time to gain sessions. 