In [1]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Bidirectional, LSTM, GlobalAveragePooling1D
import pickle

In [10]:
# Load the labelled news headlines and preview the first five rows.
df_news_type = pd.read_csv(filepath_or_buffer='data/news.csv')
df_news_type.head(n=5)

Unnamed: 0,type,headline
0,neutral,stocks that hit 52-week highs on friday
1,neutral,stocks that hit 52-week highs on wednesday
2,neutral,71 biggest movers from friday
3,neutral,46 stocks moving in friday's mid-day session
4,analyst_action,b of a securities maintains neutral on agilent...


In [14]:
# Drop rows that have no news-type label. Reassigning (instead of inplace=True)
# keeps the step an explicit transformation that reads the same on every re-run.
df_news_type = df_news_type.dropna(subset=['type'])

In [15]:
# Summarise the frame (row count, non-null counts, dtypes) after dropping unlabelled rows.
df_news_type.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 617228 entries, 0 to 1048574
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   type      617228 non-null  object
 1   headline  617228 non-null  object
dtypes: object(2)
memory usage: 14.1+ MB


In [16]:
# Give every distinct news type an integer id, numbered in order of first appearance.
mapper = {label: idx for idx, label in enumerate(df_news_type['type'].unique())}

print(mapper)

{'neutral': 0, 'analyst_action': 1, 'earnings': 2, 'corporate_action': 3, 'merger_acquisition': 4, 'company_guidance': 5, 'options': 6}


In [17]:
# Inverse of `mapper` (integer id -> news type name), used later to label predictions.
sentiment_dict = {idx: label for label, idx in mapper.items()}

print(sentiment_dict)

{0: 'neutral', 1: 'analyst_action', 2: 'earnings', 3: 'corporate_action', 4: 'merger_acquisition', 5: 'company_guidance', 6: 'options'}


In [18]:
# Replace each news type name with its integer id from `mapper`.
# The guard keeps this cell idempotent: running the mapping a second time on the
# already-encoded column would look up integers in `mapper` and turn every label into NaN.
if not pd.api.types.is_integer_dtype(df_news_type['type']):
    df_news_type['type'] = df_news_type['type'].map(mapper)
df_news_type

Unnamed: 0,type,headline
0,0,stocks that hit 52-week highs on friday
1,0,stocks that hit 52-week highs on wednesday
2,0,71 biggest movers from friday
3,0,46 stocks moving in friday's mid-day session
4,1,b of a securities maintains neutral on agilent...
...,...,...
1048568,3,morgan stanley downgrades qlogic corporation t...
1048571,3,"update: qlogic announces restructuring plan, t..."
1048572,3,"qlogic announces restructuring plan, to cut jo..."
1048573,3,qlogic corporation reports q4 eps of $0.17 vs ...


In [19]:
# Class balance: absolute count and percentage share of each news type, largest first.
type_counts = df_news_type['type'].value_counts()
type_share = type_counts / len(df_news_type['type']) * 100
count = pd.concat([type_counts, type_share], axis=1)
count.columns = ['count', '%']
count.sort_values(by='%', ascending=False)

Unnamed: 0,count,%
0,246174,39.883803
3,150656,24.408484
1,123136,19.94984
2,48158,7.802303
6,17875,2.896012
4,16471,2.668544
5,14758,2.391013


## Data preprocessing

In [20]:
# train test split
# A fixed seed makes the split (and everything trained on it) reproducible.
# Stratifying on the label keeps each news type, including the rare ones,
# in the same proportion in both the training and the test set.
train, test = train_test_split(
    df_news_type,
    test_size=0.2,
    random_state=42,
    stratify=df_news_type['type'],
)

In [21]:
# Features are the raw headline strings; labels are one-hot encoded class ids.
# num_classes is pinned to the label mapping so the train and test label matrices
# always have the same width, even if one split happens to lack the highest class id
# (to_categorical otherwise infers the width from the largest id it sees).
num_classes = len(mapper)

X_train = np.array(train['headline'].tolist())
y_train = keras.utils.to_categorical(train['type'].astype('int64'), num_classes=num_classes)

X_test = np.array(test['headline'].tolist())
y_test = keras.utils.to_categorical(test['type'].astype('int64'), num_classes=num_classes)

In [22]:
# Preprocessing configuration shared by the tokenizer, the padding step and the model.
oov_token = '<OOV>'      # token substituted for out-of-vocabulary words
vocab_size = 1000        # tokenizer num_words; also the Embedding layer's input dimension
embedding_dim = 16       # size of each learned word vector
max_length = 142         # every sequence is padded or truncated to this many tokens
padding_type = 'post'    # pad at the end of the sequence
trunc_type = 'post'      # truncate at the end of the sequence

# Build the tokenizer and fit its vocabulary on the training headlines only.
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

In [23]:
# Turn the training headlines into integer sequences, then pad/truncate to a fixed length.
X_tr_seq = tokenizer.texts_to_sequences(X_train)
X_tr_pad = pad_sequences(
    X_tr_seq,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [24]:
# Tokenize and pad the test headlines with the tokenizer fitted on the training set,
# using the same length, padding and truncation settings as the training sequences.
X_te_seq = tokenizer.texts_to_sequences(X_test)
X_te_pad = pad_sequences(X_te_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [25]:
# Persist the fitted tokenizer so the prediction section can reload the same vocabulary.
with open('models/tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

## Modeling

In [26]:
# Build the classifier: word embeddings -> average over the sequence -> dense -> softmax.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(GlobalAveragePooling1D())
model.add(Dense(32, activation='relu'))
# One output unit per news type. The size comes from the label mapping rather than a
# hard-coded 7, so the network stays in sync with the labels if the categories change.
model.add(Dense(len(mapper), activation='softmax'))

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 142, 16)           16000     
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                544       
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 231       
Total params: 16,775
Trainable params: 16,775
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Train the network, evaluating on the held-out split after every epoch.
num_epochs = 6
history = model.fit(
    x=X_tr_pad,
    y=y_train,
    epochs=num_epochs,
    validation_data=(X_te_pad, y_test),
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [28]:
# Sanity-check the trained model on a handful of hand-written headlines.
# Each entry is a plain string. (Previously a trailing comma turned every phrase into
# a 1-tuple, which only worked by accident and printed the tuple repr.)
phrase_list = [
    'Pfizer receives FDA approval for its COVID-19 vaccination',
    'Tesla expects to deliver 350,000 cars in the next quarter, estimates revenue increased by 5%',
    'Intel plans to acquire Disney to expand its consumer sector',
    'Apple announces a share repurchase program for up to $25 million',
    'Microsoft ex-CEO Bill Gates says he loves China',
    'Elon Musk will go to the moon with his dogecoin',
    'Morgan Stanley analyst upgrades Tesla to Buy, raises price target to $4,000',
    'AMD Q4-2025 Adj. EPS $1.89 beats $0.85 estimate, sales $3.37B beat $1.33B estimate',
    'Notable put options activity in Gamestop',
]

# Tokenize and pad all samples in one batch. Dedicated variable names are used so the
# test-set arrays X_te_seq / X_te_pad (the validation data for model.fit) are not
# overwritten, which would break a re-run of the training cell.
sample_seq = tokenizer.texts_to_sequences(phrase_list)
sample_pad = pad_sequences(sample_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# A single predict call for the whole batch; argmax over the last axis is the class id.
sample_classes = np.argmax(model.predict(sample_pad), axis=-1)

for phrase, class_id in zip(phrase_list, sample_classes):
    print(f'{phrase}: {sentiment_dict[int(class_id)]}')

('Pfizer receives FDA approval for its COVID-19 vaccination',): corporate_action
('Tesla expects to deliver 350,000 cars in the next quarter, estimates revenue increased by 5%',): company_guidance
('Intel plans to acquire Disney to expand its consumer sector',): merger_acquisition
('Apple announces a share repurchase program for up to $25 million',): corporate_action
('Microsoft ex-CEO Bill Gates says he loves China',): neutral
('Elon Musk will go to the moon with his dogecoin',): neutral
('Morgan Stanley analyst upgrades Tesla to Buy, raises price target to $4,000',): analyst_action
('AMD Q4-2025 Adj. EPS $1.89 beats $0.85 estimate, sales $3.37B beat $1.33B estimate',): earnings
('Notable put options activity in Gamestop',): options


In [29]:
# Write the trained model to disk in HDF5 format for the prediction section.
model.save(filepath='models/title_model.h5')

## Predicting

In [30]:
# Retrieve the latest headlines for one ticker by scraping its finviz quote page.
base_url = 'https://finviz.com/quote.ashx?t='
ticker = 'TSLA'
url = base_url + ticker

# A browser-like user-agent header is sent with the request.
req = Request(url=url, headers={'user-agent': 'Mozilla/5.0'})

# The context manager closes the HTTP connection once the page has been parsed, and the
# timeout stops the cell from hanging forever on a stalled connection.
# Naming the parser explicitly avoids bs4's "no parser specified" warning and makes
# the result independent of which optional parsers happen to be installed.
with urlopen(req, timeout=30) as response:
    html = bs(response, 'html.parser')

news_table = html.find(id='news-table')
if news_table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError(f'No news table found for ticker {ticker!r} at {url}')

# find_all is the current spelling of the legacy findAll alias.
news_row = news_table.find_all('tr')

In [31]:
# Empty frame that will hold one row per scraped headline: its timestamp text and title.
df_ticker_news = pd.DataFrame(columns=['time', 'title'])

In [32]:
# Collect every headline row first and build the frame once.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic.
# Rows without a link or a cell are skipped instead of raising AttributeError on None.
news_records = [
    {'time': row.td.text, 'title': row.a.text}
    for row in news_row
    if row.a is not None and row.td is not None
]
df_ticker_news = pd.DataFrame(news_records, columns=['time', 'title'])

In [33]:
# Display the scraped headlines (pandas truncates the middle rows in the rendered output).
df_ticker_news

Unnamed: 0,time,title
0,Jul-13-21 06:31PM,Dow Jones Futures Await Fed Chief Powell As In...
1,06:00PM,Tesla Stock Is Falling. Goldman Sachs Is Raisi...
2,03:16PM,Forget Tesla fundamentals and watch how the te...
3,03:00PM,Why Tesla Stock Just Gave Back Half of Yesterd...
4,02:00PM,Is Li Auto Stock A Buy Now? Shares Jump 45% In...
...,...,...
95,11:29AM,Why Tesla Stock Stumbled on Thursday
96,10:20AM,XPeng President on Tesla: Chinese players are ...
97,10:10AM,Teslas June China Delivery Numbers Were Good. ...
98,09:29AM,"Teslas June China Delivery Numbers Were Good, ..."


In [34]:
# Restore the fitted tokenizer saved during training.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open('models/tokenizer.pickle', mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Restore the trained classifier from its HDF5 file.
model = tf.keras.models.load_model('models/title_model.h5')

In [35]:
# Preprocessing settings for prediction; these repeat the values used when the
# tokenizer and model were trained and must stay identical to them.
max_length = 142         # sequence length the model was built with
padding_type = 'post'
trunc_type = 'post'
vocab_size = 1000        # not referenced by preprocess_text below
oov_token = '<OOV>'      # not referenced by preprocess_text below

def preprocess_text(text):
    """Convert headlines into padded integer sequences ready for the model.

    Uses the module-level ``tokenizer``, ``max_length``, ``padding_type`` and
    ``trunc_type``.

    Parameters
    ----------
    text : str or iterable of str
        A single headline, or a collection of headlines (list, pandas Series, ...).
        A bare string is wrapped in a list first; passed through unchanged it would
        be iterated character by character and produce one sequence per character.

    Returns
    -------
    numpy.ndarray
        Integer array with one row per headline and ``max_length`` columns.
    """
    if isinstance(text, str):
        text = [text]
    sequences = tokenizer.texts_to_sequences(text)
    padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    return padded

In [36]:
# Tokenize and pad the scraped titles, then get the model's class probabilities
# (one row per headline, one column per news type).
padded = preprocess_text(df_ticker_news['title'])
prediction = model.predict(padded)
# Preview the probability rows for the first five headlines.
prediction[:5]

array([[2.4794487e-02, 7.2108355e-07, 9.7438395e-01, 2.8425106e-04,
        4.8808669e-04, 2.6510847e-05, 2.2098440e-05],
       [1.4058930e-04, 6.8562365e-08, 9.9981636e-01, 8.2254683e-06,
        3.2329775e-05, 1.6135360e-06, 7.5198153e-07],
       [9.9997032e-01, 7.7505398e-08, 3.3336248e-06, 2.5884779e-05,
        3.8086569e-07, 2.9809735e-11, 1.5526874e-09],
       [9.9877077e-01, 3.9581527e-04, 4.6124629e-04, 2.2211674e-04,
        1.4129023e-04, 2.1510405e-07, 8.4938501e-06],
       [9.9991000e-01, 1.4279389e-06, 8.4997072e-07, 8.7313558e-05,
        5.1179370e-07, 2.6994038e-10, 9.8725428e-09]], dtype=float32)

In [37]:
# Store the most probable class id for each headline (see sentiment_dict for the names).
df_ticker_news['sent'] = prediction.argmax(axis=-1)
df_ticker_news

Unnamed: 0,time,title,sent
0,Jul-13-21 06:31PM,Dow Jones Futures Await Fed Chief Powell As In...,2
1,06:00PM,Tesla Stock Is Falling. Goldman Sachs Is Raisi...,2
2,03:16PM,Forget Tesla fundamentals and watch how the te...,0
3,03:00PM,Why Tesla Stock Just Gave Back Half of Yesterd...,0
4,02:00PM,Is Li Auto Stock A Buy Now? Shares Jump 45% In...,0
...,...,...,...
95,11:29AM,Why Tesla Stock Stumbled on Thursday,0
96,10:20AM,XPeng President on Tesla: Chinese players are ...,2
97,10:10AM,Teslas June China Delivery Numbers Were Good. ...,0
98,09:29AM,"Teslas June China Delivery Numbers Were Good, ...",0


In [38]:
# Number of scraped headlines assigned to each predicted class id.
df_ticker_news['sent'].value_counts()

0    50
2    36
4     6
3     5
5     2
6     1
Name: sent, dtype: int64

## Conclusion

After training the model for 6 epochs, we see that 5-6 epochs is optimal for model building, as the gap between training and validation accuracy narrows when the epoch count reaches 4 and 5. Further training will over-fit the model. The model reaches just shy of 99% accuracy, which is very good considering that the original dataset was not excessively adjusted. Additionally, it correctly classifies the 9 sample headlines entered.

In the end, it classifies 50 of the 100 most recent news headlines for Tesla as 'neutral', which seems skewed. Logically speaking, however, this is normal, as most news items are not directly about corporate or analyst actions. Hence, most news does not move the stock price.

### Further work

We can make the dataset more balanced by adding more headlines to the categories other than 'neutral' (which has the key 0 in sentiment_dict).

We can also try other neural network models and add more layers to optimize the model.