In [1]:
import pandas as pd
import datetime
import logging

In [2]:
# Ticker being processed. It is interpolated into the input path
# (prepared/<symbol>.csv), the log file name and the output path
# (sentiment/<symbol>_sentiment.csv), so change it here only.
symbol = 'nvda'

In [3]:
# Route INFO-level progress messages to a per-symbol log file.
# force=True replaces any handlers already attached to the root logger;
# without it basicConfig() silently does nothing when the hosting environment
# (or an earlier run of this cell with another symbol) has already configured
# logging, and the progress lines would go to the wrong place or nowhere.
logging.basicConfig(
    filename=f'{symbol}_log_file.log',
    level=logging.INFO,
    force=True,
)

In [4]:
# Load the prepared posts for this symbol and discard the ticker column.
# NOTE(review): the preview saved with this notebook shows McDonald's (MCD)
# related posts although symbol is 'nvda' -- confirm the stored outputs were
# produced by a run with this symbol.
df = (
    pd.read_csv(f'prepared/{symbol}.csv')
    .drop(columns=['stock_symbol'])
)
df.head()

Unnamed: 0,id,created_utc,subreddit,author,title,selftext,permalink,url,combined,is_removed,is_deleted
0,7ndtyy,2018-01-01 06:53:13,TIMESINDIAauto,AutoNewsAdmin,[Business] - New year looks grim for McD’s staff,,/r/TIMESINDIAauto/comments/7ndtyy/business_new...,https://timesofindia.indiatimes.com/business/i...,[Business] - New year looks grim for McD’s staff,False,False
1,7ndu9l,2018-01-01 06:55:27,AutoNewspaper,AutoNewspaperAdmin,[Business] - New year looks grim for McD’s sta...,,/r/AutoNewspaper/comments/7ndu9l/business_new_...,https://timesofindia.indiatimes.com/business/i...,[Business] - New year looks grim for McD’s sta...,False,False
2,7nfc46,2018-01-01 14:22:18,OnlineBargains,dfslol,[McDonalds] Any size McCafé® Brewed Coffee for...,,/r/OnlineBargains/comments/7nfc46/mcdonalds_an...,http://forums.redflagdeals.com/mcdonalds-any-s...,[McDonalds] Any size McCafé® Brewed Coffee for...,False,False
3,7ng02z,2018-01-01 16:39:39,superhot,[deleted],Some suggestions upon playing a bit of MCD,[deleted],/r/superhot/comments/7ng02z/some_suggestions_u...,https://www.reddit.com/r/superhot/comments/7ng...,Some suggestions upon playing a bit of MCD,False,True
4,7nghsp,2018-01-01 18:03:39,GameDeals,gamedealsmod,Steam Winter Sale : Day 12,**Steam Winter Sale 2017** - [Day 1](https://r...,/r/GameDeals/comments/7nghsp/steam_winter_sale...,https://www.reddit.com/r/GameDeals/comments/7n...,Steam Winter Sale : Day 12 **Steam Winter Sale...,False,False


In [5]:
from tqdm import tqdm
# Register tqdm with pandas so the `progress_apply` calls used below are available.
tqdm.pandas()

## Get sentiment using vader

In [6]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [7]:
# Fetch the VADER lexicon used by SentimentIntensityAnalyzer; when the package
# is already present nltk reports it as up-to-date and the call returns True.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [8]:
# Module-level state shared by the get_sentiment helpers:
#   analyzer - one VADER analyzer instance reused for every row
#   total    - number of texts to score (denominator of the progress log)
#   counter  - rows processed so far in the current pass
analyzer = SentimentIntensityAnalyzer()
total = len(df['combined'])
counter = 0

def get_sentiment(text):
    """Return the VADER ``compound`` polarity score for ``text``.

    Every call increments the module-level ``counter``; on each 1000th call a
    progress line (timestamp, rows done out of ``total``, percentage) is
    written via ``logging.info``.
    """
    global counter
    counter += 1

    # Progress heartbeat for the log file, once per 1000 rows.
    if not counter % 1000:
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y-%m-%d %H:%M:%S')
        logging.info(f"vader:{timestamp}: {counter}/{total}, {counter/total*100}%")

    return analyzer.polarity_scores(text)['compound']

In [9]:
# Score every combined text with VADER (progress bar via tqdm) and store the
# compound polarity as a new column.
vader_scores = df['combined'].progress_apply(get_sentiment)
df['polarity_vader'] = vader_scores

100%|██████████| 19721/19721 [00:51<00:00, 382.59it/s]


## Export file

In [10]:
# Checkpoint: write the frame (now including the VADER column) without the
# row index. Later export cells overwrite the same file.
sentiment_path = f'sentiment/{symbol}_sentiment.csv'
df.to_csv(sentiment_path, index=False)

## Get sentiment using pattern

In [11]:
from pattern.en import sentiment

In [12]:
# Restart the shared progress counter before the pattern pass.
counter = 0

def get_sentiment(text):
    """Return whatever ``pattern.en.sentiment`` yields for ``text``.

    The result is later unpacked into polarity and subjectivity columns.
    Every call increments the module-level ``counter``; on each 1000th call a
    progress line is written via ``logging.info``.
    """
    global counter
    counter += 1

    log_due = counter % 1000 == 0
    if log_due:
        # Progress heartbeat for the log file.
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y-%m-%d %H:%M:%S')
        logging.info(f"pattern:{timestamp}: {counter}/{total}, {counter/total*100}%")

    result = sentiment(text)
    return result

In [13]:
# Score every combined text with pattern; the raw per-row result is parked in
# a temporary column and split into separate columns in the next cell.
pattern_results = df['combined'].progress_apply(get_sentiment)
df['pattern_temp'] = pattern_results

100%|██████████| 19721/19721 [00:51<00:00, 381.57it/s]


In [14]:
# Split the (polarity, subjectivity) pairs held in `pattern_temp` into two
# columns. Building one DataFrame from the list of pairs avoids
# `.apply(pd.Series)`, which constructs a Series object per row and is very
# slow on large frames. The explicit index keeps rows aligned with `df`.
pattern_scores = pd.DataFrame(
    df['pattern_temp'].tolist(),
    index=df.index,
    columns=['polarity_pattern', 'subjectivity_pattern'],
)
df[['polarity_pattern', 'subjectivity_pattern']] = pattern_scores

# The temporary column has served its purpose.
df = df.drop(columns=['pattern_temp'])

## Export file

In [15]:
# Checkpoint: overwrite the export so it also carries the pattern columns;
# the row index is not written.
sentiment_path = f'sentiment/{symbol}_sentiment.csv'
df.to_csv(sentiment_path, index=False)

## Get sentiment using BERT

In [16]:
from transformers import pipeline, AutoTokenizer
# Build the Hugging Face sentiment-analysis pipeline once; get_sentiment
# below calls it for every row.
nlp = pipeline(
    task='sentiment-analysis',
    model='nlptown/bert-base-multilingual-uncased-sentiment',
)

2023-08-05 13:38:20.739248: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-05 13:38:20.788808: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
# Restart the shared progress counter before the BERT pass.
counter = 0

def get_sentiment(text, max_chars=512):
    """Return a BERT-based polarity for ``text`` as an integer.

    The text is cut to ``max_chars`` characters and classified by the
    module-level ``nlp`` pipeline. The leading digit of the predicted label is
    shifted by -3, so a label starting with ``3`` maps to 0 (assumes labels
    look like "1 star" .. "5 stars", giving -2 .. 2 -- confirm against the
    model card).

    Every call increments the module-level ``counter``; on each 1000th call a
    progress line is written via ``logging.info``.

    Parameters
    ----------
    text : str
        Text to classify.
    max_chars : int, default 512
        Character budget applied before tokenisation (the value the notebook
        has always used, kept so existing scores stay comparable).

    Returns
    -------
    int
        Leading digit of the predicted label minus 3.
    """
    global counter

    counter += 1
    if counter % 1000 == 0:
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        logging.info(f"bert:{timestamp}: {counter}/{total}, {counter/total*100}%")

    # The former if/else on len(text) ran the same statement in both branches,
    # so a single call is equivalent. Slicing by characters does not bound the
    # number of *tokens*, so also ask the tokenizer to truncate to the model's
    # 512-token limit instead of failing on token-dense text.
    result = nlp(text[:max_chars], truncation=True, max_length=512)

    return int(result[0]['label'][:1]) - 3

In [18]:
# Score every combined text with the BERT pipeline (the slowest pass) and
# store the integer polarity as a new column.
bert_scores = df['combined'].progress_apply(get_sentiment)
df['polarity_bert'] = bert_scores

100%|██████████| 19721/19721 [23:47<00:00, 13.82it/s]


## Export file

In [19]:
# Final export: overwrite the file so it also carries the BERT column; the
# row index is not written.
sentiment_path = f'sentiment/{symbol}_sentiment.csv'
df.to_csv(sentiment_path, index=False)

## Get sentiment using textblob

In [20]:
# from textblob import TextBlob

In [21]:
# counter = 0

# def get_sentiment(text):
#     global counter
    
#     counter += 1
#     if counter % 1000 == 0:
#         timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
#         logging.info(f"textblob:{timestamp}: {counter}/{total}, {counter/total*100}%")
    
#     return TextBlob(text).sentiment

In [22]:
# df['textblob_temp'] = df['combined'].progress_apply(get_sentiment)

In [23]:
# df[['polarity_textblob', 'subjectivity_textblob']] = df['textblob_temp'].apply(pd.Series)
# df = df.drop(['textblob_temp'], axis=1)

## Export file

In [24]:
# df.to_csv(f'sentiment/{symbol}_sentiment.csv', index=None)

## Get sentiment using flair

In [25]:
# from flair.data import Sentence
# from flair.nn import Classifier

In [26]:
# counter = 0

# def get_sentiment(text):
#     global counter
#     sentence = Sentence(text)
#     tagger = Classifier.load('sentiment-fast')
#     tagger.predict(sentence)

#     label = sentence.labels[0].value
#     score = sentence.labels[0].score
    
#     counter += 1
#     if counter % 1000 == 0:
#         timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
#         logging.info(f"flair:{timestamp}: {counter}/{total}, {counter/total*100}%")
    
#     return score if label == 'POSITIVE' else -score

In [27]:
# df['sentiment_flair'] = df['combined'].apply(get_sentiment)
# df['sentiment_flair'] = df['combined'].progress_apply(get_sentiment)

## Export file

In [28]:
# df.to_csv(f'sentiment/{symbol}_sentiment.csv', index=None)