# RoBERTa-base-sentiment vs. vaderSentiment

In [8]:
# Load Packages
import pandas as pd
from transformers import TFAutoModelForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from scipy.special import softmax
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

In [9]:
# Load the input table; the sentiment functions defined below read its `comment_en` column.
df = pd.read_csv('output.csv')

In [10]:
# Class index -> sentiment name; the index is the argmax position in a
# (negative, neutral, positive) score vector.
LABELS = dict(enumerate(('negative', 'neutral', 'positive')))

## RoBERTa-base-sentiment analysis

In [11]:
# Checkpoint identifier passed to `from_pretrained` for both the model and the tokenizer.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

In [12]:
def roberta_get_sentiment_scores(text: str, tokenizer, model):
    """Return the softmax-normalised class scores the model assigns to ``text``.

    Args:
        text: The string to classify.
        tokenizer: Callable that encodes ``text`` into keyword arguments for
            ``model``. It is called with ``return_tensors='pt'``,
            ``max_length=512``, ``truncation=True`` and ``padding=True``.
        model: Callable accepting the encoded inputs as keyword arguments.
            Element ``[0][0]`` of its output is taken as the score vector for
            the single input (assumed to be the logits of the first sequence).

    Returns:
        1-D NumPy array of probabilities (softmax of the raw scores).
    """
    encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every comment that gets scored.
    with torch.no_grad():
        output = model(**encoded)
    logits = output[0][0].detach().numpy()
    return softmax(logits)

def roberta_analyze_sentiment(df: pd.DataFrame) -> pd.DataFrame:
    """Score every ``comment_en`` entry with the RoBERTa model named by ``MODEL``.

    The model and tokenizer are loaded from ``MODEL`` on every call. Each
    comment is converted with ``str()`` before scoring, so non-string values
    are scored by their string representation.

    Args:
        df: Frame with a ``comment_en`` column. It is modified in place.

    Returns:
        The same ``df`` object with four columns added or overwritten:
        ``roberta_negative``, ``roberta_neutral``, ``roberta_positive`` and
        ``roberta_label`` (the ``LABELS`` name of the highest-scoring class).
    """
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    prob_columns = ['roberta_negative', 'roberta_neutral', 'roberta_positive']
    score_rows = [
        roberta_get_sentiment_scores(str(comment), tokenizer, model)
        for comment in df['comment_en']
    ]

    # Stack once into an (n_rows, n_classes) matrix instead of building one
    # pd.Series per row; stacking also keeps the dtype of the score vectors.
    # An empty frame still yields a well-formed (0, 3) matrix.
    if score_rows:
        prob_matrix = np.stack(score_rows)
    else:
        prob_matrix = np.empty((0, len(prob_columns)), dtype=np.float32)

    df[prob_columns] = pd.DataFrame(prob_matrix, index=df.index, columns=prob_columns)
    df['roberta_label'] = [LABELS[int(i)] for i in prob_matrix.argmax(axis=1)]
    return df

In [13]:
# Adds the roberta_negative / roberta_neutral / roberta_positive / roberta_label columns.
# The model and tokenizer are loaded inside the call.
df = roberta_analyze_sentiment(df)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Collapse the three RoBERTa class probabilities into one signed score per row:
# weight negative / neutral / positive by -1 / 0 / +1, sum, then squash with tanh.
polarity_weights = torch.tensor([-1, 0, 1])
probs = torch.tensor(
    df[["roberta_negative", "roberta_neutral", "roberta_positive"]].to_numpy()
)
polarity = torch.sum(probs * polarity_weights, dim=-1)
polarity_scaled = torch.tanh(polarity)
df["roberta_polarity"] = polarity_scaled.numpy()

## vaderSentiment

In [15]:
def vader_analyze_sentiment(df: pd.DataFrame) -> pd.DataFrame:
    """Score every ``comment_en`` entry with VADER and append the results.

    Each comment is converted with ``str()`` before scoring. The input frame
    is not modified: no temporary column is written to it, so an existing
    ``Score`` column survives untouched.

    Args:
        df: Frame with a ``comment_en`` column.

    Returns:
        A new frame holding the columns of ``df`` followed by
        ``vader_negative``, ``vader_neutral``, ``vader_positive``,
        ``compound`` and ``vader_label`` (the ``LABELS`` name of the largest
        of the neg/neu/pos scores). Columns of ``df`` that already carry one
        of these output names are replaced rather than duplicated, so the
        function can be re-run on its own output.
    """
    analyzer = SentimentIntensityAnalyzer()

    # Keys of the dict returned by polarity_scores, per the VADER documentation;
    # naming them explicitly keeps the frame well-formed when there are no rows.
    raw_keys = ['neg', 'neu', 'pos', 'compound']
    renames = {'neg': 'vader_negative',
               'neu': 'vader_neutral',
               'pos': 'vader_positive'}

    records = [analyzer.polarity_scores(str(comment)) for comment in df['comment_en']]
    df_scores = pd.DataFrame(records, index=df.index, columns=raw_keys)

    # Vectorised argmax over (neg, neu, pos); ties resolve to the first maximum.
    class_scores = df_scores[['neg', 'neu', 'pos']].to_numpy(dtype=float)
    df_scores['vader_label'] = [LABELS[int(i)] for i in class_scores.argmax(axis=1)]
    df_scores = df_scores.rename(columns=renames)

    # Drop stale outputs from a previous run so column names stay unique.
    base = df.drop(columns=list(df_scores.columns), errors='ignore')
    return pd.concat([base, df_scores], axis=1)

In [16]:
# Appends the vader_negative / vader_neutral / vader_positive / vader_label columns
# (plus any other score keys VADER returns) to the frame.
df = vader_analyze_sentiment(df)

In [17]:
# Collapse the three VADER class scores into one signed score per row:
# weight negative / neutral / positive by -1 / 0 / +1, sum, then squash with tanh.
polarity_weights = torch.tensor([-1, 0, 1])
probs = torch.tensor(
    df[["vader_negative", "vader_neutral", "vader_positive"]].to_numpy()
)
polarity = torch.sum(probs * polarity_weights, dim=-1)
polarity_scaled = torch.tanh(polarity)
df["vader_polarity"] = polarity_scaled.numpy()

In [None]:
# Persist the frame with all sentiment columns; UTF-8 encoded, row index omitted.
df.to_csv('output_sentiment.csv', encoding='utf-8', index=False)
