In [1]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Bidirectional, LSTM, GlobalAveragePooling1D
import pickle

## Data wrangling

In [2]:
# Load the raw headlines; header=0 takes the column names from the first line
# of the file.
df = pd.read_csv('data/raw_news.csv', header=0)
# Preview the first rows.
df.head()

Unnamed: 0,type,headline
0,,stocks that hit 52-week highs on friday
1,,stocks that hit 52-week highs on wednesday
2,,71 biggest movers from friday
3,,46 stocks moving in friday's mid-day session
4,,b of a securities maintains neutral on agilent...


In [3]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(1048575, 2)

In [4]:
# Column dtypes and non-null counts; 'type' has no values yet at this point
# (it is filled in by the keyword labelling further down).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   type      0 non-null        float64
 1   headline  1048575 non-null  object 
dtypes: float64(1), object(1)
memory usage: 16.0+ MB


In [5]:
# Lower-case every headline: the keyword patterns used for labelling are all
# lower-case and str.contains is case-sensitive by default.
df['headline'] = df['headline'].str.lower()
df.head(10)

Unnamed: 0,type,headline
0,,stocks that hit 52-week highs on friday
1,,stocks that hit 52-week highs on wednesday
2,,71 biggest movers from friday
3,,46 stocks moving in friday's mid-day session
4,,b of a securities maintains neutral on agilent...
5,,"cfra maintains hold on agilent technologies, l..."
6,,"ubs maintains neutral on agilent technologies,..."
7,,agilent technologies shares are trading higher...
8,,wells fargo maintains overweight on agilent te...
9,,10 biggest price target changes for friday


In [6]:
# Keyword rules that assign a coarse news category to each headline.
# Each entry is (label, regex) and is matched against the lower-cased headline.
# Order matters: rules run top to bottom and a later match overwrites an
# earlier one, so 'neutral' (last) takes precedence over every other label.
# Headlines that match no rule keep a missing 'type'.
#
# NOTE(review): the spaces around '|' are part of each alternative (e.g.
# 'eps ' requires a trailing space, ' beats' a leading one), so a keyword at
# the very start/end of a headline or next to punctuation may not match.
# Confirm whether word boundaries (\b) were intended before changing the
# patterns -- doing so will change the label counts.
LABEL_RULES = [
    ('analyst_action',
     'upgrades | downgrades | upgraded | downgraded | price target | pt | coverage'),
    ('earnings',
     'eps | earnings | beats'),
    ('company_guidance',
     'guidance estimates | sales growth | guidance | guides | revenue'),
    ('corporate_action',
     'civil action | repurchase | buyback | dividend | partnership | agreement | contract | settlement | fda | conference | launches | unveils | reports | announces | corporate'),
    ('options',
     'option | options | covered call | put/call | derivatives'),
    ('merger_acquisition',
     'renewal | purchase | merger | acquire'),
    ('neutral',
     'session | week | not seeing any news | watch | moving | movers | volatility | shares | why | takeaways | talks | says | eye | several | how | shows | outlook | global | update | results | 52-week'),
]

for label, pattern in LABEL_RULES:
    # na=False treats a missing headline as "no match"; without it the mask
    # would contain NaN, which .loc refuses to index with.
    matches = df['headline'].str.contains(pattern, na=False)
    df.loc[matches, 'type'] = label

In [7]:
# Rows per assigned label; headlines that matched no rule are still NaN and
# are left out of this tally (value_counts drops NaN by default).
df['type'].value_counts()

neutral               246174
corporate_action      150656
analyst_action        123136
earnings               48158
options                17875
merger_acquisition     16471
company_guidance       14758
Name: type, dtype: int64

In [8]:
# Distinct values in 'type', including NaN for headlines that matched no rule.
df['type'].unique()

array(['neutral', 'analyst_action', 'earnings', nan, 'corporate_action',
       'merger_acquisition', 'company_guidance', 'options'], dtype=object)

In [9]:
# Persist the labelled headlines to disk; the row index is not written out.
df.to_csv('data/news.csv', index=False)

In [10]:
# Reload the labelled data from the saved CSV into a separate frame, so the
# following steps work on df_news_type and leave `df` untouched.
df_news_type = pd.read_csv('data/news.csv')
df_news_type.head()

Unnamed: 0,type,headline
0,neutral,stocks that hit 52-week highs on friday
1,neutral,stocks that hit 52-week highs on wednesday
2,neutral,71 biggest movers from friday
3,neutral,46 stocks moving in friday's mid-day session
4,analyst_action,b of a securities maintains neutral on agilent...


In [11]:
# Check the reloaded frame: dtypes and how many rows actually carry a label.
df_news_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   type      617228 non-null   object
 1   headline  1048575 non-null  object
dtypes: object(2)
memory usage: 16.0+ MB


In [12]:
# Dimensions of the reloaded frame as (rows, columns).
df_news_type.shape

(1048575, 2)

In [13]:
# Missing values per column; a null 'type' means the headline received no label.
df_news_type.isnull().sum()

type        431347
headline         0
dtype: int64

In [14]:
# Discard headlines without a label. This mutates df_news_type in place and
# does not reset the index, so the surviving rows keep their original row
# numbers.
df_news_type.dropna(subset=['type'], inplace=True)

In [15]:
# Confirm that only labelled rows remain (no nulls left in 'type').
df_news_type.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 617228 entries, 0 to 1048574
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   type      617228 non-null  object
 1   headline  617228 non-null  object
dtypes: object(2)
memory usage: 14.1+ MB


In [16]:
# Build the label -> integer code lookup. Codes are handed out in the order
# in which each label first appears in the 'type' column.
mapper = {label: code for code, label in enumerate(df_news_type['type'].unique())}

print(mapper)

{'neutral': 0, 'analyst_action': 1, 'earnings': 2, 'corporate_action': 3, 'merger_acquisition': 4, 'company_guidance': 5, 'options': 6}


In [17]:
# Inverse of `mapper` (integer code -> label name), kept for later use.
sentiment_dict = {code: label for label, code in mapper.items()}

print(sentiment_dict)

{0: 'neutral', 1: 'analyst_action', 2: 'earnings', 3: 'corporate_action', 4: 'merger_acquisition', 5: 'company_guidance', 6: 'options'}


In [18]:
# Replace the string labels in 'type' with their integer codes from `mapper`.
# The encoded column is built and checked *before* it is written back: any
# value without an entry in `mapper` would otherwise silently become NaN.
# In particular, re-running this cell after it has already succeeded (the
# column then holds ints while `mapper` is still keyed by the string labels)
# now raises instead of wiping out every label.
encoded_type = df_news_type['type'].map(mapper)
if encoded_type.isna().any():
    raise ValueError(
        "'type' contains values that are missing from mapper; "
        "re-create df_news_type before encoding again"
    )
df_news_type['type'] = encoded_type
df_news_type

Unnamed: 0,type,headline
0,0,stocks that hit 52-week highs on friday
1,0,stocks that hit 52-week highs on wednesday
2,0,71 biggest movers from friday
3,0,46 stocks moving in friday's mid-day session
4,1,b of a securities maintains neutral on agilent...
...,...,...
1048568,3,morgan stanley downgrades qlogic corporation t...
1048571,3,"update: qlogic announces restructuring plan, t..."
1048572,3,"qlogic announces restructuring plan, to cut jo..."
1048573,3,qlogic corporation reports q4 eps of $0.17 vs ...


In [19]:
# Class balance: number of rows per encoded label and its share of all rows
# in percent, listed from the most to the least frequent label.
type_counts = df_news_type['type'].value_counts()
type_share = type_counts / len(df_news_type['type']) * 100
count = pd.concat([type_counts, type_share], axis=1)
count.columns = ['count', '%']
count.sort_values(by='%', ascending=False)

Unnamed: 0,count,%
0,246174,39.883803
3,150656,24.408484
1,123136,19.94984
2,48158,7.802303
6,17875,2.896012
4,16471,2.668544
5,14758,2.391013
