In [28]:
import pandas as pd
import numpy as np

In [29]:
# Load the scraped articles; index_col=0 turns the first CSV column into the
# row index (the preview below suggests it holds publication timestamps).
df = pd.read_csv('nasdaq_articles.csv', index_col=0)

# Preview the first five rows.
df.head()

Unnamed: 0,title,description,return,sentiment
3/20/2020 0:26,Apple Is Back In The Buying Zone,Apple currently trading slightly below fair va...,-3.79,1.0
3/20/2020 2:51,Stock futures jump as tumultuous week comes to...,Central banks have taken measures to buffer th...,-3.79,1.0
3/20/2020 3:13,Oncopeptides Provides Update Regarding COVID-1...,Oncopeptides AB (Nasdaq Stockholm: ONCO) annou...,-3.79,1.0
3/20/2020 3:24,"US stock futures gain, dollar snaps eight-day ...",Asian stocks and US futures gained and the dol...,-3.79,1.0
3/20/2020 3:49,Immunovia Publishes the Annual Report for the ...,Immunovia has today published the annual repor...,-3.79,0.0


In [30]:
# Positive market signal (1): the return is at least 1 (the threshold is
# inclusive; returns appear to be expressed in percent). Every other row
# starts out neutral (0).
df['market_signal'] = np.where(
    df['return'].ge(1),
    1,
    0,
)

In [31]:
# Negative market signal (-1): the return is at most -1 (the threshold is
# inclusive; returns appear to be expressed in percent). Rows that do not
# match keep whatever value market_signal already holds.
df['market_signal'] = np.where(
    df['return'].le(-1),
    -1,
    df['market_signal'],
)

In [33]:
# Display the frame to inspect the newly added market_signal column.
df

Unnamed: 0,title,description,return,sentiment,market_signal
3/20/2020 0:26,Apple Is Back In The Buying Zone,Apple currently trading slightly below fair va...,-3.79,1.0,-1
3/20/2020 2:51,Stock futures jump as tumultuous week comes to...,Central banks have taken measures to buffer th...,-3.79,1.0,-1
3/20/2020 3:13,Oncopeptides Provides Update Regarding COVID-1...,Oncopeptides AB (Nasdaq Stockholm: ONCO) annou...,-3.79,1.0,-1
3/20/2020 3:24,"US stock futures gain, dollar snaps eight-day ...",Asian stocks and US futures gained and the dol...,-3.79,1.0,-1
3/20/2020 3:49,Immunovia Publishes the Annual Report for the ...,Immunovia has today published the annual repor...,-3.79,0.0,-1
...,...,...,...,...,...
3/9/2020 21:01,Dow drops 7.8% as virus fears slam markets,Dow drops 7.8% as virus fears slam marketsThe ...,-7.29,,-1
3/9/2020 21:12,Stock Markets Climb After Trump Proposes Tax C...,Stock Markets Climb After Trump Proposes Tax C...,-7.29,,-1
3/9/2020 21:38,Why These Apparel Retailers Plunged on Monday,Investors should take today's sell-off with a ...,-7.29,,-1
3/9/2020 21:45,Quenching The Thirst For Earnings With PepsiCo,PepsiCo continues to show its market power and...,-7.29,,-1


In [7]:
from bert_embedding import BertEmbedding

# Build the embedding model used to vectorise article titles.
# NOTE(review): 'bert_24_1024_16' is presumably the large BERT variant; the
# vectors printed later in this notebook are 1024-dimensional -- confirm
# against the bert_embedding package docs.
# The instance deliberately keeps the name `bert_embedding` because later
# cells call `bert_embedding.embedding(...)`.
bert_embedding = BertEmbedding(
    dataset_name='book_corpus_wiki_en_cased',
    model='bert_24_1024_16',
)

In [36]:
from sklearn.model_selection import train_test_split

# Shuffle the rows and hold out 20% of them as a test set. The features are
# every column except the label; the label is market_signal. The fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['market_signal']),
    df['market_signal'],
    test_size=0.2,
    shuffle=True,
    random_state=777,
)

In [39]:
# Use news article titles to perform sentiment analysis: reduce the feature
# frames to their 'title' column.
#
# The isinstance guards make this cell safe to re-run. After the first run
# X_train / X_test are already Series, and indexing a Series with 'title'
# would look up an index label and raise KeyError.
if isinstance(X_train, pd.DataFrame):
    X_train = X_train['title']
if isinstance(X_test, pd.DataFrame):
    X_test = X_test['title']

In [40]:
# Embed the first five article titles as a quick smoke test of the model.
# oov_way='sum' is passed through to the library unchanged.
# NOTE(review): presumably it controls how out-of-vocabulary word pieces are
# combined -- confirm against the bert_embedding docs.
sample = bert_embedding.embedding(
    df['title'].head(5),
    oov_way='sum',
)

In [43]:
# Inspect the raw result: a list with one (tokens, per-token vectors) tuple
# per embedded title.
sample

[(['Apple', 'Is', 'Back', 'In', 'The', 'Buying', 'Zone'],
  [array([-0.26620585,  0.09960835, -0.06498528, ..., -0.00821856,
           0.09705348,  0.13480124], dtype=float32),
   array([-0.26517224,  0.20200601, -0.06120298, ...,  1.2984374 ,
           0.10152386, -0.0021737 ], dtype=float32),
   array([-0.6680085 ,  1.1247057 ,  0.05568182, ...,  0.19554105,
           0.35412467, -0.7391632 ], dtype=float32),
   array([-0.25469947,  0.45662323,  0.20557338, ...,  0.7105194 ,
           0.7904498 , -0.31020766], dtype=float32),
   array([-0.2515772 ,  0.2837445 ,  0.9116112 , ...,  1.1254003 ,
           0.24275365, -0.2720462 ], dtype=float32),
   array([-0.05769417,  0.8088425 ,  0.1724812 , ...,  1.2114066 ,
          -0.13831693, -0.12363467], dtype=float32),
   array([-0.27680978,  0.55262566, -0.76793575, ...,  0.8650224 ,
          -0.00880259, -0.94105786], dtype=float32)]),
 (['Stock',
   'futures',
   'jump',
   'as',
   'tumultuous',
   'week',
   'comes',
   'to',
   'a

In [44]:
# Mean-pool the per-token vectors of each title into a single sentence
# vector and print its shape as a sanity check.
for _tokens, token_vectors in sample:
    sentence_vector = np.array(token_vectors).mean(axis=0)
    print(sentence_vector.shape)

(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
