## Import Libraries

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Summary of Notes/ To Dos:
- fix dates
- remove outliers, based on sessions
- fill nulls for pitch? or just drop?
- add headline length column
- add some date columns
- drop lower count bureaus:
    - the-business
    - research
    - the ascent  
- drop lower count article types
- don't use author, don't use collection
- publish date and time seem to influence sessions
- filter to just article type = article and drop this as a feature? or include other article types?
- what to do with tags, where there are very few samples of a tag? weighting?
- handling headline and promo text - should I encode those first and then use them as features? or cluster them? or use bag of words? something to discuss with Jacob and Ben. 
- don't include tickers for now- too much scope for this project, but would be interesting in future iteration

# Load in Data Set

In [4]:
# Load the raw articles export. `parse_dates=True` only attempts to parse the
# index (and no index column is set here), so name the date column explicitly
# to have it parsed at load time.
articles = pd.read_csv('../data/raw/articles.csv', parse_dates=['PUBLISH_DATE'])

### Inspect Data Set

In [5]:
# Work on an independent (deep) copy so the raw frame loaded above stays untouched
articles_df = articles.copy(deep=True)

### Dates

In [6]:
# Convert PUBLISH_DATE to pandas datetimes so the `.dt` accessor can be used below
parsed_publish_dates = pd.to_datetime(articles_df['PUBLISH_DATE'])
articles_df['PUBLISH_DATE'] = parsed_publish_dates

In [7]:
# Numeric day of the week derived from the publish date (Monday=0 ... Sunday=6)
articles_df['DAY_OF_WEEK'] = articles_df['PUBLISH_DATE'].dt.dayofweek

In [8]:
# Calendar month (1-12) of the publish date
publish_dates = articles_df['PUBLISH_DATE']
articles_df['PUBLISH_MONTH'] = publish_dates.dt.month

### Remove outliers

In [9]:
# Outlier thresholds: mean +/- 4 standard deviations of cumulative sessions
sessions_mean = articles_df['CUM_SESSIONS_SOFAR'].mean()
sessions_std = articles_df['CUM_SESSIONS_SOFAR'].std()
upper = sessions_mean + 4 * sessions_std
lower = sessions_mean - 4 * sessions_std

In [10]:
# Keep only rows whose session count lies strictly between the two bounds
sessions = articles_df['CUM_SESSIONS_SOFAR']
within_bounds = sessions.lt(upper) & sessions.gt(lower)
articles_df = articles_df[within_bounds]

### Bureau

In [None]:
# Bureaus to drop: ['the ascent', 'the-business', 'research', 'unknown bureau', 'other']

In [11]:
# Remove every row that belongs to one of the excluded bureaus
bureaus_to_drop = ['the ascent', 'the-business', 'research', 'unknown bureau', 'other']
in_dropped_bureau = articles_df['BUREAU'].isin(bureaus_to_drop)
articles_df = articles_df[~in_dropped_bureau]


In [14]:
# Relabel the 'marijuana' bureau as 'cannabis'; every other value is kept as is.
# `articles_df` is the result of boolean filtering at this point, so assigning a
# column in place triggers pandas' SettingWithCopyWarning. `assign` returns a
# new frame with the rewritten column and avoids that.
articles_df = articles_df.assign(
    BUREAU=np.where(articles_df['BUREAU'] == 'marijuana', 'cannabis', articles_df['BUREAU'])
)

In [25]:
# Sanity check: number of articles left in each bureau after the clean-up
articles_df.BUREAU.value_counts()

technology-and-telecom            5366
consumer-goods                    4480
health-care                       2977
industrials                       2359
investment-planning               2040
financials                        1978
energy-materials-and-utilities    1513
cryptocurrency                     754
markets                            581
real estate                        534
cannabis                           517
Name: BUREAU, dtype: int64

### Article Type

In [16]:
# Number of articles per article type
articles_df.ARTICLE_TYPE.value_counts()

article                 12590
10% promise series       4257
transcript               3311
news brief               1442
video                    1242
roundtable                257
news                        1
unknown article type        1
Name: ARTICLE_TYPE, dtype: int64

In [18]:
# Drop article types - ['unknown article type' , 'news']

In [19]:
# Remove the article types that occur only once in the counts above
article_types_to_drop = ['unknown article type' , 'news']
is_dropped_type = articles_df['ARTICLE_TYPE'].isin(article_types_to_drop)
articles_df = articles_df[~is_dropped_type]


### Add Headline Length

In [20]:
# Character count of each headline.
# `.str.len()` is vectorised and yields NaN for a missing headline, where
# `apply(len)` would raise a TypeError. `assign` returns a new frame, so no
# SettingWithCopyWarning is raised on the previously filtered `articles_df`.
# NOTE(review): the lengths shown in the preview further down look one greater
# than the visible characters, so headlines may carry stray whitespace --
# confirm whether they should be stripped before measuring.
articles_df = articles_df.assign(HEADLINE_LEN=articles_df['HEADLINE'].str.len())

### Check for Nulls again

In [21]:
# Missing-value count per column
articles_df.isna().sum()


HEADLINE                0
PUBLISH_DATE            0
PUBLISHTIMEINSECONDS    0
WEEKDAY                 0
HOLIDAY                 0
BUREAU                  0
AUTHOR                  0
ARTICLE_TYPE            0
COLLECTION              0
TICKERS                 0
PROMO                   0
PITCH                   4
DUO                     0
CHARTICLE               0
NON_PAID_VIDEO          0
CUM_SESSIONS_SOFAR      0
SES                     0
SE_VALUE                0
COST                    0
TOTAL_VALUE             0
MARGIN                  0
PROFITABLE              0
DAY_OF_WEEK             0
PUBLISH_MONTH           0
HEADLINE_LEN            0
dtype: int64

### Choose final columns and export to processed folders

In [22]:
# Columns carried into the processed data set, in output order
final_columns = [
    'HEADLINE',
    'HEADLINE_LEN',
    'PUBLISH_DATE',
    'DAY_OF_WEEK',
    'PUBLISH_MONTH',
    'PUBLISHTIMEINSECONDS',
    'WEEKDAY',
    'HOLIDAY',
    'BUREAU',
    'ARTICLE_TYPE',
    'PROMO',
    'CUM_SESSIONS_SOFAR',
]
final = articles_df.loc[:, final_columns]
                               

In [23]:
# Preview the selected columns (pandas truncates the middle rows in the rendered output)
final

Unnamed: 0,HEADLINE,HEADLINE_LEN,PUBLISH_DATE,DAY_OF_WEEK,PUBLISH_MONTH,PUBLISHTIMEINSECONDS,WEEKDAY,HOLIDAY,BUREAU,ARTICLE_TYPE,PROMO,CUM_SESSIONS_SOFAR
0,2 under-the-radar tech stocks to buy in 2022,45,2022-04-12,1,4,22740,Weekday,No,technology-and-telecom,article,These two companies an enjoying explosive top-...,207
1,does it matter that gamestop's split will be a...,63,2022-04-12,1,4,32400,Weekday,No,consumer-goods,article,Just how different is it from a cash dividend?,5811
2,is amazon stock a buy this month?,34,2022-04-12,1,4,38220,Weekday,No,consumer-goods,article,There are strong reasons to invest in Amazon r...,1637
3,2 stocks that cut you a check each month,41,2022-04-12,1,4,33420,Weekday,No,industrials,article,"For some investors, dividend income that flows...",2037
4,is amazon or alphabet the better stock split i...,57,2022-04-12,1,4,62220,Weekday,No,technology-and-telecom,article,Both companies dominate much of our digital li...,7682
...,...,...,...,...,...,...,...,...,...,...,...,...
24940,why affimed stock is heating up today,38,2022-04-11,0,4,44520,Weekday,No,health-care,10% promise series,The biotech's experimental blood cancer treatm...,318
24941,3 beaten-down growth stocks -- can they recover?,49,2022-04-11,0,4,23940,Weekday,No,technology-and-telecom,article,These former stock market darlings can be purc...,4138
24942,why shares of dingdong are volatile today,42,2022-04-11,0,4,47880,Weekday,No,consumer-goods,10% promise series,Lockdowns in China might create a beneficial e...,96
24943,this cybersecurity stock could just be getting...,55,2022-04-11,0,4,33480,Weekday,No,technology-and-telecom,article,CrowdStrike's stock has cratered during the te...,942


In [26]:
# Write the munged data set to the processed-data folder.
# NOTE(review): the DataFrame index is written as an unnamed first column;
# confirm whether downstream readers rely on it before switching to index=False.
processed_csv_path = '../data/processed/articles_munged.csv'
final.to_csv(processed_csv_path)