# Comparing between different Sentimental Analaysis Models

Goal: To determine the best model on hand

Description:  I have a csv of headliners, and fed it to GrokAI to generate a list of sentimental Scores. This will be used as the benchmark in the comparison.

Steps outlined:
1. Setup the file "with_sentiment_100.csv" for comparison and briefly screen through the list for outliers
2. Run through the different models and run the data through them (We are interested in a score/number)
3. Compare using SST. Low=best High=worst

## 1. Setup testing file

In [None]:
import pandas as pd
df = pd.read_csv("data/with_sentiment_100.csv")
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
headline = list(df['Headline'])
headline[:10]

## 2. Run the list through different models

In [None]:
# set up

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    if not isinstance(text, str) or text is None:
        return ""
    tokens = word_tokenize(text.lower())  # Tokenize and lowercase
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    test_sentence = " ".join(cleaned_tokens)
    return test_sentence

processed_headine = list(map(preprocess, headline))

In [None]:
# 1. Prebuilt Vader sentiment package (NaiveBayes model)

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
result_vader = []
analyzer = SentimentIntensityAnalyzer()
for sentence in processed_headine:
    temp = analyzer.polarity_scores(sentence)
    result_vader.append(temp['pos']-temp['neg'])
result_vader[:10]
pd.DataFrame(result_vader).to_csv("data/vader.csv")

In [None]:
# 2. Prebuilt Textblob sentiment package

from textblob import TextBlob
result_tb = []
for sentence in processed_headine:
    result_tb.append(TextBlob(sentence).sentiment.polarity)
result_tb[:10]
pd.DataFrame(result_tb).to_csv("data/tb.csv")

In [None]:
# 3. Prebuilt Flair sentiment package/Model

from flair.data import Sentence
from flair.nn import Classifier
result_flair = []
tagger = Classifier.load('sentiment')
for sentence in processed_headine:
    sentence = Sentence(sentence)
    tagger.predict(sentence)
    score = sentence.labels[0].score
    result_flair.append(score)
result_flair[:10]
pd.DataFrame(result_flair).to_csv("data/flair.csv")

In [None]:
# setup for HuggingFace Transformers

from transformers import pipeline, set_seed
set_seed(999)

In [None]:
# RoBERTa

classifier = pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-sentiment-latest')
result_roberta = []
for sentence in processed_headine:
    score = 0
    temp = classifier(sentence)
    if temp['label'] == 'positive': score = temp[0]['score']
    elif temp['label'] == 'negative': score = -temp[0]['score']
    result_roberta.append(score)
result_roberta[:10]
pd.DataFrame(result_roberta).to_csv("data/roberta.csv")

In [None]:
# distilBERT

classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')
result_dis = []

#test this

for sentence in processed_headine:
    score = 0
    temp = classifier(sentence)
    if temp['label'] == 'positive': score = temp[0]['score']
    elif temp['label'] == 'negative': score = -temp[0]['score']
    result_roberta.append(score)
result_dis[:10]
pd.DataFrame(result_dis).to_csv("data/dis.csv")

In [None]:
# Google Flan t5

classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')
result = classifier(test_sentence)
print(result)

#test this

for sentence in processed_headine:
    score = 0
    temp = classifier(sentence)
    if temp['label'] == 'positive': score = temp[0]['score']
    elif temp['label'] == 'negative': score = -temp[0]['score']
    result_roberta.append(score)
result_dis[:10]
pd.DataFrame(result_dis).to_csv("data/dis.csv")

In [29]:
# LLM QWEN 8gb

from openai import OpenAI

client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key="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",
)

result_ai = []

for sentence in processed_headine:
    completion = client.chat.completions.create(
      extra_body={}, model="deepseek/deepseek-r1-0528-qwen3-8b:free",
      messages=[{ "role": "user",
          "content": f"Only give me a float sentimental analysis score from -1(negative) to 1(positive) for the following sentence {sentence}"
    }])
    result_ai.append(completion.choices[0].message.content)
result_ai[:10]
pd.DataFrame(result_ai).to_csv("data/ai.csv")

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit exceeded: free-models-per-day. Add 10 credits to unlock 1000 free model requests per day', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '50', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1750204800000'}, 'provider_name': None}}, 'user_id': 'user_2ycwzV1AAeq9Nv8nUZlqVtkXzHr'}