In [0]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import spacy
from spacy import displacy

import nltk
from nltk.corpus import treebank

# for Sentiment Analysis
from textblob import TextBlob

# --- One-time setup, kept after the imports so the dependencies read as one list ---

# Load the English pipeline by its package name. The 'en' shortcut link that was
# used here before no longer exists in spaCy v3 (spacy.load('en') fails there);
# the full package name works on both v2 and v3.
# Install it once with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

# Fetch the corpus read by the Tree Bank cell (treebank.words / parsed_sents)
nltk.download('treebank')

In [0]:
# Sample sentence for the demos: `doc` (the spaCy parse) feeds the POS, NER and
# dependency cells, and the raw `sentence` feeds the TextBlob sentiment cell
sentence = 'Apple is a good company but Tim Cook is not innovative'
doc = nlp(sentence)

In [0]:
# Part-of-speech tagging: print one (text, pos_) pair per token of the parsed sentence
# Tag scheme reference: https://spacy.io/api/annotation#pos-tagging
print(list((tok.text, tok.pos_) for tok in doc))

In [0]:
# NER (Named Entity Recognition)
# style='ent' highlights the entities found in `doc`; jupyter=True renders inline in the notebook
displacy.render(doc, style='ent', jupyter=True)

In [0]:
# Dependency Parsing
# style='dep' draws the dependency arcs between tokens; 'distance' is passed through
# as a displaCy layout option (presumably the spacing between words — confirm in the displaCy docs)
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

In [0]:
# Tree Bank: look at the NLTK treebank corpus as a flat word list and as a parse tree
words = treebank.words()
parsed = treebank.parsed_sents()[0]  # parse tree of the first sentence

# Corpus size, then the first 17 tokens as a preview
print("Word Count", len(words))
print(words[:17])

# Bracketed tree of the first sentence
print(parsed)

In [0]:
# Sentiment analysis
# polarity is a float within the range [-1.0, 1.0] where -1.0 is very negative and 1.0 is very positive
tb = TextBlob(sentence)
tb.polarity

In [0]:
# sentiment_assessments shows polarity and subjectivity together with the individual
# word/phrase assessments they were computed from
# subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective
tb.sentiment_assessments

In [0]:
# Use the Apple Tweets to predict the Apple stock price daily change
# The workbook is read directly from GitHub, so this cell needs network access
url = 'https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/appleTweetsCleaned.xlsx'
appleTweets = pd.read_excel(url)
appleTweets

In [0]:
# List the available columns before choosing which ones to filter on and keep
appleTweets.columns

In [0]:
# Filter the content by language and hashtag
LANG = 'en'
HASH_TAGS = '#Apple'

# Row mask 1: tweets written in the requested language
languageMatches = appleTweets['Tweet language (ISO 639-1)'] == LANG

# Row mask 2: substring match rather than equality (presumably because the
# Hashtags cell can hold several tags per tweet — confirm against the data).
#   regex=False -> HASH_TAGS is matched as literal text, never as a pattern
#   na=False    -> tweets without any hashtag count as "no match" instead of
#                  leaking NaN into the boolean mask
hashtagMatches = appleTweets['Hashtags'].str.contains(HASH_TAGS, regex=False, na=False)

# reset_index is chained (not inplace) so every run of the cell rebuilds a fresh,
# independent frame with a clean 0..n-1 index
appleTweetsFiltered = appleTweets[languageMatches & hashtagMatches].reset_index(drop=True)
appleTweetsFiltered

In [0]:
# Look at the subset of useful columns for the sentiment trading
COLUMNS = ['Date', 'cleaned', 'Following']

# .copy() turns the column selection into an independent frame. Without it the
# column writes on appleTweetsSubset target a slice of appleTweetsFiltered and
# pandas emits SettingWithCopyWarning.
appleTweetsSubset = appleTweetsFiltered[COLUMNS].copy()

# Convert Date string to datetime to match with the stock daily change later
appleTweetsSubset['Date'] = pd.to_datetime(appleTweetsSubset['Date'])
appleTweetsSubset

In [0]:
# Take an independent copy first, so the two column writes below never target a
# slice of appleTweetsFiltered (that is what triggers SettingWithCopyWarning)
appleTweetsSubset = appleTweetsSubset.copy()

# Use TextBlob to run the tweets sentiment polarity
appleTweetsSubset['sentiment'] = appleTweetsSubset['cleaned'].apply(lambda tweet: TextBlob(tweet).polarity)

# Weight the tweets sentiment importance by the number of followings
# NOTE(review): 'Following' looks like the number of accounts the author follows,
# not the size of the author's audience — confirm this is the intended weight
appleTweetsSubset['sentiment_weighted'] = appleTweetsSubset['sentiment'] * appleTweetsSubset['Following']
appleTweetsSubset

In [0]:
# Spot-check a single tweet: print its cleaned text, then display the
# TextBlob assessment for that same text
INDEX = 0

# Positional lookup: row number INDEX of the 'cleaned' column
text = appleTweetsSubset['cleaned'].iloc[INDEX]
print(text)

TextBlob(text).sentiment_assessments

In [0]:
# Group the sentiment by Date
# numeric_only=True restricts the sum to the numeric columns. Without it, pandas 2.x
# also "sums" the text column 'cleaned' by concatenating every tweet of the day
# (older pandas dropped such columns silently), so this pins one behaviour.
# NOTE(review): rows are grouped on the exact 'Date' value — if it carries a
# time-of-day part the groups will not be daily; confirm against the data.
aggregateSentiments = appleTweetsSubset.groupby(['Date']).sum(numeric_only=True)
aggregateSentiments

In [0]:
# get stocks daily data (OHLCV) from Yahoo
# NOTE(review): these two imports sit mid-notebook; notebook convention is to keep
# all imports in the first cell
import pandas_datareader.data as web
from datetime import datetime

start = datetime(2016, 4, 2)  # first day of the requested window
end = datetime(2016, 4, 30)  # last day of the requested window
# Needs network access.
# NOTE(review): the 'yahoo' source of pandas-datareader has reportedly been
# unreliable in recent releases — confirm this call still returns data
stock= web.DataReader('AAPL', 'yahoo', start=start, end=end)
stock

In [0]:
# Daily change = move from Open to Close, expressed as a fraction of the Open
# (an intraday return, not a close-to-close one)
stock['change'] = stock['Close'].sub(stock['Open']).div(stock['Open'])
stock

In [0]:
# Attach the per-day sentiment to the stock rows and keep only the two series to compare.
# how='left' keeps every stock row; days without matching sentiment get NaN.
merged = stock.merge(
    aggregateSentiments,
    how='left',
    on='Date',
)[['change', 'sentiment_weighted']]
merged

In [0]:
# Rescale each series to the range -1..1 so both fit on one axis.
# The same scaler object is re-fitted for every column, so each column is
# scaled against its own min/max.
scaler = MinMaxScaler((-1, 1))
for column in ['change', 'sentiment_weighted']:
    merged[column] = scaler.fit_transform(merged[[column]])
merged

In [0]:
# Plot the columns of `merged` together on one 15x8 chart for a visual comparison
merged.plot(figsize=(15, 8))

In [0]:
# Show the pairwise correlation between the columns of `merged`
# (stock daily change vs. weighted tweet sentiment)
merged.corr()

In [0]:
# Try different date lags
# Use a moving average of the sentiment
# Filter less contents for sentiment
# Use a better sentiment analysis engine (e.g. one tailor-made for social media content)
# Add other technical indicators
#merged.change = merged.change.shift(-1)