In [1]:
import pandas as pd
import numpy as np
import datetime
import logging
import os

In [2]:
files = [f for f in os.listdir('prepared') if f.endswith('.csv')]

In [3]:
if not os.path.exists('sentiment'):
    os.makedirs('sentiment')

In [4]:
from tqdm import tqdm
tqdm.pandas()

In [5]:
if not os.path.exists('log'):
    os.makedirs('log')

## Load sentiment analysis libraries

In [6]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [7]:
def get_sentiment_vader(text):
    global counter
    
    counter += 1
    if counter % 1000 == 0:
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        logging.info(f"vader:{timestamp}: {counter}/{total}, {counter/total*100}%")
    
    score = analyzer.polarity_scores(text)
    return score['compound']

In [8]:
from pattern.en import sentiment

In [9]:
def get_sentiment_pattern(text):
    global counter
    
    counter += 1
    if counter % 1000 == 0:
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        logging.info(f"pattern:{timestamp}: {counter}/{total}, {counter/total*100}%")
    
    return sentiment(text)

In [10]:
import torch
# Select GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [11]:
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from scipy.special import softmax

model_src = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'

nlp = AutoModelForSequenceClassification.from_pretrained(model_src).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_src)
config = AutoConfig.from_pretrained(model_src)

tokenizer.save_pretrained(model_src)
nlp.save_pretrained(model_src)

In [12]:
def get_sentiment_xlm(text):    
    global counter
    
    counter += 1
    if counter % 1000 == 0:
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        logging.info(f"xlm-t:{timestamp}: {counter}/{total}, {counter/total*100}%")
    
    if len(text) > 512: text = text[:512]
    
    token = tokenizer(text, return_tensors='pt').to(device)
    output = nlp(**token)
    scores = softmax(output[0][0].detach().cpu().numpy())

    return (scores[0] * -1) + scores[2]

## Get sentiment scores

In [None]:
for f in files:
    symbol = f.split('.')[0]
    logging.basicConfig(filename=f'log/{symbol}.log', level=logging.INFO)
    df = pd.read_csv(f'prepared/{symbol}.csv').drop(['stock_symbol'], axis=1)
    
    print(f'Calculating: {symbol.upper()}')
    
    # Vader
    counter = 0
    total = len(df['combined'])
    analyzer = SentimentIntensityAnalyzer()
    df['polarity_vader'] = df['combined'].progress_apply(get_sentiment_vader)
    
    # Pattern
    counter = 0
    df['pattern_temp'] = df['combined'].progress_apply(get_sentiment_pattern)
    df[['polarity_pattern', 'subjectivity_pattern']] = df['pattern_temp'].apply(pd.Series)
    df = df.drop(['pattern_temp'], axis=1)
    
    # XLM-T
    counter = 0
    df['polarity_xlm-t'] = df['combined'].progress_apply(get_sentiment_xlm)
    
    df.to_csv(f'sentiment/{symbol}_sentiment.csv', index=None)

Calculating: MSFT


100%|██████████| 30137/30137 [02:05<00:00, 240.39it/s]
100%|██████████| 30137/30137 [01:56<00:00, 257.70it/s]
100%|██████████| 30137/30137 [07:17<00:00, 68.81it/s]


Calculating: AAPL


100%|██████████| 39306/39306 [02:48<00:00, 233.20it/s]
100%|██████████| 39306/39306 [02:45<00:00, 237.36it/s]
100%|██████████| 39306/39306 [09:37<00:00, 68.04it/s]


Calculating: MCD


100%|██████████| 19721/19721 [01:00<00:00, 328.04it/s]
100%|██████████| 19721/19721 [00:59<00:00, 332.11it/s]
100%|██████████| 19721/19721 [04:36<00:00, 71.28it/s]


Calculating: NVDA


100%|██████████| 17070/17070 [01:38<00:00, 172.88it/s]
100%|██████████| 17070/17070 [01:28<00:00, 193.63it/s]
100%|██████████| 17070/17070 [03:56<00:00, 72.17it/s]


Calculating: TSLA


100%|██████████| 73272/73272 [03:29<00:00, 349.47it/s] 
100%|██████████| 73272/73272 [03:25<00:00, 357.19it/s] 
100%|██████████| 73272/73272 [17:42<00:00, 68.95it/s]


Calculating: NFLX


100%|██████████| 10263/10263 [01:06<00:00, 155.11it/s]
100%|██████████| 10263/10263 [01:00<00:00, 170.09it/s]
100%|██████████| 10263/10263 [02:27<00:00, 69.39it/s]


Calculating: GME


100%|██████████| 570328/570328 [13:14<00:00, 717.66it/s] 
100%|██████████| 570328/570328 [13:41<00:00, 694.51it/s] 
100%|██████████| 570328/570328 [2:23:57<00:00, 66.03it/s]  
