In [2]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import flair
from flair.data import Sentence
from flair.nn import Classifier
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
from scipy.special import softmax
import seaborn as sns
import matplotlib as plt

In [62]:
# Dataset of Reddit comments; the `category` column holds the reference
# sentiment label (1, 0 or -1).
df = pd.read_csv('Reddit_Data.csv')

# Hugging Face: tokenizer, config (its id2label maps class ids to label
# names) and the PyTorch sequence-classification model.
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Flair: pre-trained sentiment classifier.
tagger = Classifier.load('sentiment-fast')

# NLTK VADER: only download the lexicon when it is not already installed, so
# re-running this cell does not contact the download server every time.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()



2024-05-06 15:14:54,562 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-fasttext-rnn/sentiment-en-mix-ft-rnn_v8.pt not found in cache, downloading to /var/folders/bt/llk01x0n1ds62ww5wg13kz8m0000gn/T/tmpbhq8feii


100%|██████████| 1.16G/1.16G [04:06<00:00, 5.03MB/s]

2024-05-06 15:19:01,984 copying /var/folders/bt/llk01x0n1ds62ww5wg13kz8m0000gn/T/tmpbhq8feii to cache at /Users/shasankpatel/.flair/models/sentiment-en-mix-ft-rnn_v8.pt





2024-05-06 15:19:02,211 removing temp file /var/folders/bt/llk01x0n1ds62ww5wg13kz8m0000gn/T/tmpbhq8feii


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/shasankpatel/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [63]:
# Display the loaded DataFrame as the cell output.
df

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
37244,jesus,0
37245,kya bhai pure saal chutiya banaya modi aur jab...,1
37246,downvote karna tha par upvote hogaya,0
37247,haha nice,1


In [64]:
# First 3000 comments, cast to str so every entry is a string.
test_df = df['clean_comment'].iloc[:3000].astype(str)

In [65]:
# Test flair on the first 30 comments.
# Slice first, then copy: only 30 rows are copied and the column assignments
# below act on an independent frame rather than on a slice.
df_f = df.head(30).copy()
# Cast to str so non-string entries can be wrapped in a Sentence.
df_f['clean_comment'] = df_f['clean_comment'].astype(str)

# Convert each comment to a Flair Sentence and classify them in one batched
# call; predict() attaches the labels to the Sentence objects in place.
flair_sentences = [Sentence(text) for text in df_f['clean_comment']]
tagger.predict(flair_sentences)

# Take the first label of each sentence; a sentence that received no label
# yields None / NaN instead of raising IndexError.
first_labels = [s.labels[0] if s.labels else None for s in flair_sentences]
df_f['sentiment_result'] = [lbl.value if lbl is not None else None for lbl in first_labels]
df_f['sentiment_score'] = [lbl.score if lbl is not None else float('nan') for lbl in first_labels]

# Keep only the id, reference category and Flair results.
df_f = df_f.drop(columns=['clean_comment'])
df_f

Unnamed: 0,category,sentiment_result,sentiment_score
0,1,POSITIVE,0.998648
1,1,POSITIVE,0.999899
2,-1,NEGATIVE,0.865275
3,0,POSITIVE,0.853202
4,1,POSITIVE,0.999228
5,-1,POSITIVE,0.691604
6,1,POSITIVE,0.999816
7,0,NEGATIVE,0.76339
8,-1,POSITIVE,0.99911
9,1,NEGATIVE,0.582793


In [66]:
# Test hugging face on the first 30 comments.
# TODO: text preprocessing before tokenization is still an open question.
df_hf = df.head(30).copy()
# Cast to str (same as the Flair cell) so non-string entries do not reach the
# tokenizer.
hf_texts = df_hf['clean_comment'].astype(str)

# Score one comment at a time and keep only the three values that end up in
# the result table; tensors and model outputs never enter the DataFrame.
hf_labels, hf_top_ids, hf_top_scores = [], [], []
for text in hf_texts:
    # tokenize (inputs longer than 512 tokens are truncated)
    encoded = tokenizer(text, padding=True, truncation=True,
                        max_length=512, return_tensors='pt')
    # first element of the model output is the logits; [0] selects the single
    # sequence in the batch
    logits = model(**encoded)[0][0].detach().numpy()
    probs = softmax(logits)
    # class ids ordered from most to least probable
    ranking = probs.argsort()[::-1]
    best = ranking[0]
    hf_labels.append(config.id2label[best])
    hf_top_ids.append(best)
    hf_top_scores.append(probs[best])

df_hf['sentiment_result'] = hf_labels
df_hf['first_ranking'] = hf_top_ids
df_hf['sentiment_score'] = hf_top_scores

# Keep only the id, reference category and Hugging Face results.
df_hf = df_hf.drop(columns=['clean_comment'])
df_hf

Unnamed: 0,category,sentiment_result,first_ranking,sentiment_score
0,1,neutral,1,0.724173
1,1,neutral,1,0.566622
2,-1,negative,0,0.734263
3,0,neutral,1,0.799716
4,1,neutral,1,0.674569
5,-1,neutral,1,0.857794
6,1,positive,2,0.692141
7,0,neutral,1,0.890323
8,-1,neutral,1,0.577485
9,1,neutral,1,0.612292


In [67]:
# Test NLTK VADER on the first 30 comments.
df_v = df.head(30).copy()


def _vader_label(compound, threshold=0.05):
    """Map a VADER compound score to a sentiment label.

    Returns 'positive' when compound >= threshold, 'negative' when
    compound <= -threshold, and 'neutral' otherwise.
    """
    if compound >= threshold:
        return 'positive'
    if compound <= -threshold:
        return 'negative'
    return 'neutral'


# Cast to str (same as the Flair cell) so non-string entries are not passed
# to polarity_scores; keep only the 'compound' entry of the returned dict.
df_v['sentiment_score'] = (
    df_v['clean_comment']
    .astype(str)
    .apply(lambda text: sia.polarity_scores(text)['compound'])
)
df_v['sentiment_result'] = df_v['sentiment_score'].apply(_vader_label)

# Keep only the id, reference category and VADER results.
df_v = df_v.drop(columns=['clean_comment'])
df_v

Unnamed: 0,category,sentiment_score,sentiment_result
0,1,0.9349,positive
1,1,0.9953,positive
2,-1,0.875,positive
3,0,0.0772,positive
4,1,0.8907,positive
5,-1,0.3626,positive
6,1,0.9806,positive
7,0,0.0,neutral
8,-1,0.8625,positive
9,1,0.6131,positive


In [None]:
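# Flair - sentiment_score is the confidence of the predicted label, range 0 to 1 (polarity comes from the POSITIVE/NEGATIVE label)
# HF - sentiment_score is the softmax probability of the top-ranked class, range 0 to 1 (polarity comes from the negative/neutral/positive label)
# Vader - sentiment_score is the compound score, range -1 to 1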