In [0]:
# spaCy NLP library
import spacy
from spacy import displacy
nlp = spacy.load('en')

# download tree bank
import nltk
from nltk.corpus import treebank
nltk.download('treebank')

#display tree bank
!pip install svgling
import svgling

# for Sentiment Analysis
from textblob import TextBlob

from sklearn.preprocessing import MinMaxScaler
import pandas as pd
pd.options.mode.chained_assignment = None

%load_ext google.colab.data_table 

In [0]:
# Sample text fed to spaCy here and reused by the TextBlob cells
sentence = (
    'Apple is doing good in China at 2020 and it is the second best in the history '
    'as the profit is over 10 billions. However, Tim Cook is not innovative'
)
doc = nlp(sentence)

In [0]:
# POS Tagging: pair every token's text with its part-of-speech label
# https://spacy.io/api/annotation#pos-tagging
tagged_tokens = [(tok.text, tok.pos_) for tok in doc]
print(tagged_tokens)

In [0]:
# NER (Named Entity Recognition): render the entities inline in the notebook
displacy.render(doc, jupyter=True, style='ent')

In [0]:
# Dependency Parsing: render the parse inline with a custom 'distance' option
dep_options = {'distance': 90}
displacy.render(doc, style='dep', jupyter=True, options=dep_options)

In [0]:
# Tree Bank corpus from NLTK: flat word list plus the first parsed sentence
words = treebank.words()
parsed = treebank.parsed_sents()[0]

# Corpus size and a peek at the first 17 words
print("Word Count", len(words))
print(words[:17])

# Draw the parse tree of the first sentence as the cell output
svgling.draw_tree(parsed)

In [0]:
# Sentiment analysis by TextBlob
# https://planspace.org/20150607-textblob_sentiment/
# https://github.com/sloria/TextBlob/blob/eb08c120d364e908646731d60b4e4c6c1712ff63/textblob/en/en-sentiment.xml
# Rule and pattern based. Handles "negation" and "modifier" words (such as "very")
# polarity is a float within the range [-1.0, 1.0] where -1.0 is very negative, 0.0 is neutral and 1.0 is very positive

# Echo the sample text, build the TextBlob (reused by the next cell) and show its polarity
print(sentence)
tb = TextBlob(sentence)
tb.polarity

In [0]:
# subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective
# Subjective sentences generally refer to personal opinion, emotion or judgment, whereas objective ones refer to factual information

# Show TextBlob's sentiment assessments for the sample sentence
tb.sentiment_assessments

In [0]:
# Use the Apple Tweets to predict the Apple stock price daily change
# Read the Excel workbook at `url` into a DataFrame
url = 'https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/appleTweets.xlsx'
appleTweets = pd.read_excel(url)

# Display (rows, columns) of the loaded data
appleTweets.shape

In [0]:
# Show partial results: preview is capped at SHOW_NUMS rows (reused by later cells)
SHOW_NUMS = 5000
appleTweets.head(SHOW_NUMS)

In [0]:
# List the column labels available in the tweets data
appleTweets.columns

In [0]:
# Keep only the columns that are useful for the sentiment trading
COLUMNS = ['Date', 'Tweet content', 'Following', 'Hashtags']
appleTweetsSubset = appleTweets.loc[:, COLUMNS]

# Preview the first SHOW_NUMS rows of the subset
appleTweetsSubset.head(SHOW_NUMS)

In [0]:
# Filter the contents
MIN_FOLLOWING = 1
HASH_TAGS = '#Apple'

# Keep rows whose 'Following' value is at least MIN_FOLLOWING, drop the
# 'Hashtags' column and renumber the rows from 0.
# Written as one method chain so a new frame is produced, instead of mutating
# a boolean-mask slice of appleTweetsSubset in place (chained assignment).
appleTweetsFiltered = (
    appleTweetsSubset
    .loc[appleTweetsSubset['Following'] >= MIN_FOLLOWING]
    .drop(columns=['Hashtags'])
    .reset_index(drop=True)
)

# Alternative: filter by hashtags instead (note: use contains for hashtags)
#appleTweetsFiltered = appleTweetsSubset[(appleTweetsSubset['Hashtags'].str.contains(HASH_TAGS))]

print("Original row# :", appleTweetsSubset.shape[0])
print("Filtered row# :", appleTweetsFiltered.shape[0])


In [0]:
# Preview the first SHOW_NUMS filtered tweets
appleTweetsFiltered.head(SHOW_NUMS)

In [0]:
# Use TextBlob to run the tweets sentiment polarity, then weight each tweet's
# sentiment importance by its 'Following' count.
#
# The weight is read from appleTweetsFiltered itself. Its index was reset after
# filtering, so multiplying by appleTweetsSubset['Following'] would align on
# row labels that no longer match and pair tweets with the wrong counts.
appleTweetsFiltered = appleTweetsFiltered.assign(
    sentiment=lambda df: df['Tweet content'].apply(lambda x: TextBlob(x).polarity),
    sentiment_weighted=lambda df: df['sentiment'] * df['Following'],
)

appleTweetsFiltered.head(SHOW_NUMS)

In [0]:
# Check Sentiment assessment for one tweet, selected by row position
INDEX = 0

text = appleTweetsFiltered['Tweet content'].iloc[INDEX]
print(text)

# Show TextBlob's sentiment assessments for that tweet
TextBlob(text).sentiment_assessments

In [0]:
# Group the weighted sentiment by Date for matching the stock daily change

# Convert Date to datetime to match with the stock daily change later
appleTweetsFiltered = appleTweetsFiltered.assign(Date=pd.to_datetime(appleTweetsFiltered['Date']))

# Select the numeric column before summing, so the text columns are not
# summed as well (slow, and an error on mixed-type columns)
aggregateSentiments = appleTweetsFiltered.groupby('Date')[['sentiment_weighted']].sum()

# Show the per-date totals produced by this cell
aggregateSentiments

In [0]:
# get stocks daily data (OHLCV) from Yahoo
import pandas_datareader.data as web
from datetime import datetime

# Date window passed to the reader
start = datetime(2016, 4, 2)
end = datetime(2016, 4, 30)

# NOTE(review): depends on the 'yahoo' reader still being functional - confirm
stock = web.DataReader(name='AAPL', data_source='yahoo', start=start, end=end)
stock

In [0]:
# calculate the stock daily change: (Close - Open) relative to Open
stock['change'] = stock['Close'].sub(stock['Open']).div(stock['Open'])
stock

In [0]:
# Merge the daily stock price info with the sentiments
# (left join: every row of `stock` is kept, with or without a sentiment match)
merged = pd.merge(stock, aggregateSentiments, how='left', on='Date')
merged = merged[['change', 'sentiment_weighted']]
merged

In [0]:
# Scale the unit to -1 to 1
# fit_transform refits the scaler on each call, so each column is scaled
# by its own minimum and maximum.
scaler = MinMaxScaler((-1, 1))
merged['changes'] = scaler.fit_transform(merged[['change']])
merged['sentiments'] = scaler.fit_transform(merged[['sentiment_weighted']])

# Take an explicit copy so `scaled` is an independent frame: columns are added
# to it later, which should not be an assignment onto a slice of `merged`.
scaled = merged[['changes', 'sentiments']].copy()
scaled

In [0]:
# Line chart of the scaled price changes and sentiments
scaled.plot.line(figsize=(15, 8))

In [0]:
# shows the pairwise (Pearson) correlation between the scaled columns
scaled.corr(method='pearson')

In [0]:
# Try sentiments with different date lags

# shift(-1) pulls the NEXT day's sentiment onto the current row
# -> next-day sentiment is compared with the current day's price change (reactive)
scaled['sentiment-1'] = merged['sentiments'].shift(-1)

# shift(1) pushes the PREVIOUS day's sentiment onto the current row
# -> previous-day sentiment is compared with the current day's price change (predictive)
scaled['sentiment+1'] = merged['sentiments'].shift(1)
scaled

In [0]:
# Pairwise (Pearson) correlation, now including the lagged sentiment columns
scaled.corr(method='pearson')

In [0]:
# Try different dataset http://followthehashtag.com/datasets/nasdaq-100-companies-free-twitter-dataset/
# Filter contents with different conditions
# Take sentiment subjectivity into account
# Use a better sentiment analysis engine (e.g. one tailor-made for social media content)
# Try different date lags and compare close-to-close price changes rather than open-to-close changes
# Use sentiment moving average or long/short term sentiment cross over
# Combine with other technical indicators such as stock price moving average
# Combine with other Machine Learned signals or trends
