In [1]:
import datetime
import logging
import os

import numpy as np
import pandas as pd
import torch

In [2]:
# Ticker whose prepared posts are scored; it is interpolated into the input,
# log and output file names in the cells that follow.
symbol = "aapl"

In [3]:
# The file handler cannot create missing parent directories, so make sure
# `log/` exists before configuring logging (mirrors the `sentiment/` guard
# used for the output files).
os.makedirs('log', exist_ok=True)

# Progress messages from the scoring functions go to a per-symbol log file.
logging.basicConfig(filename=f'log/{symbol}.log', level=logging.INFO)

In [4]:
# Read the prepared posts for the selected symbol and discard the
# `stock_symbol` column before previewing the first rows.
df = pd.read_csv(f'prepared/{symbol}.csv').drop(columns=['stock_symbol'])
df.head()

Unnamed: 0,id,created_utc,subreddit,author,title,selftext,permalink,url,combined,is_removed,is_deleted
0,7nca41,2018-01-01 01:04:25,wallstreetbets,alreadyinuse5000,Blowing versus sucking,AAPL just entered a contract to purchase 51 of...,/r/wallstreetbets/comments/7nca41/blowing_vers...,https://www.reddit.com/r/wallstreetbets/commen...,Blowing versus sucking AAPL just entered a con...,False,False
1,7nfzp3,2018-01-01 16:37:41,RobinHood,CardinalNumber,The 2018 /r/Robinhood Stock Picking Game,# tl;dr\n\n - Stock picking game will last all...,/r/RobinHood/comments/7nfzp3/the_2018_rrobinho...,https://www.reddit.com/r/RobinHood/comments/7n...,The 2018 /r/Robinhood Stock Picking Game # tl;...,False,False
2,7nhc2r,2018-01-01 20:21:33,investing,InvestingLifeSavings,Hesitant to invest in $AAPL,Looking at AAPLs fundamentals and the pile of ...,/r/investing/comments/7nhc2r/hesitant_to_inves...,https://www.reddit.com/r/investing/comments/7n...,Hesitant to invest in $AAPL Looking at AAPLs f...,False,False
3,7nhwud,2018-01-01 21:54:14,stocks,Djg35,Stock mix help required (ETF),I’ve decided I’m most likely interested in jus...,/r/stocks/comments/7nhwud/stock_mix_help_requi...,https://www.reddit.com/r/stocks/comments/7nhwu...,Stock mix help required (ETF) I’ve decided I’m...,False,False
4,7ni0o7,2018-01-01 22:12:01,personalfinance,Djg35,ETF advice please!,I’ve decided I’m most likely interested in jus...,/r/personalfinance/comments/7ni0o7/etf_advice_...,https://www.reddit.com/r/personalfinance/comme...,ETF advice please! I’ve decided I’m most likel...,False,False


In [5]:
from tqdm import tqdm
# Register tqdm's pandas integration; the `progress_apply` calls used by the
# scoring cells later in this notebook depend on it.
tqdm.pandas()

In [None]:
# Output directory for the scored CSVs. `exist_ok=True` makes the call
# idempotent and avoids the check-then-create race of a separate
# `os.path.exists` test.
os.makedirs('sentiment', exist_ok=True)

## Get sentiment using vader

In [6]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [7]:
# Fetch the `vader_lexicon` resource for NLTK's VADER analyzer; when the
# package is already present NLTK only reports that it is up to date.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [8]:
# State shared with get_sentiment for progress logging: number of calls made
# so far and the number of rows that will be scored.
counter = 0
total = len(df['combined'])
# A single analyzer instance is reused for every row.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Return the VADER ``compound`` score for ``text``.

    Every call increments the module-level ``counter``; each time it reaches
    a multiple of 1000 a progress line is written to the log.
    """
    global counter

    counter += 1
    at_checkpoint = (counter % 1000 == 0)
    if at_checkpoint:
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y-%m-%d %H:%M:%S')
        logging.info(f"vader:{timestamp}: {counter}/{total}, {counter/total*100}%")

    return analyzer.polarity_scores(text)['compound']

In [9]:
# Score every combined title/body text with VADER, showing a tqdm progress bar.
df['polarity_vader'] = (
    df['combined']
    .progress_apply(get_sentiment)
)

100%|██████████| 39306/39306 [02:28<00:00, 264.10it/s] 


### Export file

In [10]:
# Checkpoint the frame (without the row index) so the VADER scores survive
# a failure in the later, slower stages.
df.to_csv(f'sentiment/{symbol}_sentiment.csv', index=False)

## Get sentiment using pattern

In [11]:
from pattern.en import sentiment

In [12]:
# Restart the progress counter for the pattern pass.
counter = 0

def get_sentiment(text):
    """Return the result of pattern's ``sentiment`` for ``text``.

    Every call increments the module-level ``counter``; each time it reaches
    a multiple of 1000 a progress line is written to the log.
    """
    global counter

    counter += 1
    at_checkpoint = (counter % 1000 == 0)
    if at_checkpoint:
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y-%m-%d %H:%M:%S')
        logging.info(f"pattern:{timestamp}: {counter}/{total}, {counter/total*100}%")

    result = sentiment(text)
    return result

In [13]:
# Store pattern's raw result per row in a temporary column; it is split
# into separate columns in the next cell.
df['pattern_temp'] = (
    df['combined']
    .progress_apply(get_sentiment)
)

100%|██████████| 39306/39306 [02:28<00:00, 265.39it/s]


In [14]:
# Split the two-element results in `pattern_temp` into their own columns
# (first element -> polarity_pattern, second -> subjectivity_pattern).
# Building one DataFrame from the list of results avoids constructing a
# pd.Series per row, which is what `.apply(pd.Series)` does; the index is
# passed explicitly so the new columns stay aligned with `df`.
df[['polarity_pattern', 'subjectivity_pattern']] = pd.DataFrame(
    df['pattern_temp'].tolist(), index=df.index
)
# The temporary column is no longer needed once it has been expanded.
df = df.drop(columns=['pattern_temp'])

### Export file

In [15]:
# Overwrite the checkpoint so it now also contains the pattern columns
# (row index omitted).
df.to_csv(f'sentiment/{symbol}_sentiment.csv', index=False)

## Get sentiment using CardiffNLP

In [16]:
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from scipy.special import softmax

In [17]:
# Checkpoint name passed to every `from_pretrained` call; the same string is
# reused as the directory for `save_pretrained`.
model_src = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'

# Load the tokenizer, the sequence-classification model and its config.
tokenizer = AutoTokenizer.from_pretrained(model_src)
nlp = AutoModelForSequenceClassification.from_pretrained(model_src)
config = AutoConfig.from_pretrained(model_src)

# Write the tokenizer first, then the model, under the `model_src` path.
for artifact in (tokenizer, nlp):
    artifact.save_pretrained(model_src)

In [18]:
# Restart the progress counter for the transformer pass.
counter = 0

def get_sentiment(text):
    """Return a signed polarity for ``text`` from the CardiffNLP model.

    The model's logits are turned into probabilities with softmax and the
    result is ``scores[2] - scores[0]``, i.e. index 2 is treated as the
    positive class and index 0 as the negative class (NOTE(review): confirm
    this ordering against ``config.id2label``).

    Every call increments the module-level ``counter``; each time it reaches
    a multiple of 1000 a progress line is written to the log.

    Parameters
    ----------
    text : str
        Text to score. Only the first 512 characters are used.

    Returns
    -------
    numpy floating scalar
        Value in [-1, 1]; higher means more positive.
    """
    global counter

    counter += 1
    if counter % 1000 == 0:
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        logging.info(f"cardiff:{timestamp}: {counter}/{total}, {counter/total*100}%")

    # Keep the 512-character pre-cut so previously computed scores stay
    # reproducible (slicing a shorter string is a no-op).
    text = text[:512]

    # Characters are not tokens: a 512-character string can still tokenize to
    # more entries than the model accepts once special tokens are added, so
    # the limit is also enforced at token level.
    token = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        output = nlp(**token)

    # output[0] holds the logits; [0] selects the single item in the batch.
    scores = softmax(output[0][0].numpy())

    return scores[2] - scores[0]

In [19]:
# Score every combined text with the CardiffNLP model; this is by far the
# slowest pass (about 54 minutes in the recorded run).
df['polarity_bert'] = (
    df['combined']
    .progress_apply(get_sentiment)
)

100%|██████████| 39306/39306 [54:32<00:00, 12.01it/s]  


### Export file

In [20]:
# Final export: the file now holds the VADER, pattern and CardiffNLP columns
# (row index omitted).
df.to_csv(f'sentiment/{symbol}_sentiment.csv', index=False)

## Get sentiment using textblob

In [21]:
# from textblob import TextBlob

In [22]:
# counter = 0

# def get_sentiment(text):
#     global counter
    
#     counter += 1
#     if counter % 1000 == 0:
#         timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
#         logging.info(f"textblob:{timestamp}: {counter}/{total}, {counter/total*100}%")
    
#     return TextBlob(text).sentiment

In [23]:
# df['textblob_temp'] = df['combined'].progress_apply(get_sentiment)

In [24]:
# df[['polarity_textblob', 'subjectivity_textblob']] = df['textblob_temp'].apply(pd.Series)
# df = df.drop(['textblob_temp'], axis=1)

### Export file

In [25]:
# df.to_csv(f'sentiment/{symbol}_sentiment.csv', index=None)

## Get sentiment using flair

In [26]:
# from flair.data import Sentence
# from flair.nn import Classifier

In [27]:
# counter = 0

# def get_sentiment(text):
#     global counter
#     sentence = Sentence(text)
#     tagger = Classifier.load('sentiment-fast')
#     tagger.predict(sentence)

#     label = sentence.labels[0].value
#     score = sentence.labels[0].score
    
#     counter += 1
#     if counter % 1000 == 0:
#         timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
#         logging.info(f"flair:{timestamp}: {counter}/{total}, {counter/total*100}%")
    
#     return score if label == 'POSITIVE' else -score

In [28]:
# df['sentiment_flair'] = df['combined'].apply(get_sentiment)
# df['sentiment_flair'] = df['combined'].progress_apply(get_sentiment)

### Export file

In [29]:
# df.to_csv(f'sentiment/{symbol}_sentiment.csv', index=None)