## Computing raw scores produced by different models

In [14]:
import warnings
# Blanket filter: from this point on every warning category is suppressed,
# which keeps the long-running scoring cells' output limited to their own prints.
warnings.filterwarnings('ignore')

In [43]:
import pandas as pd
import pickle
import time
from datasets import Dataset
from transformers import pipeline

In [16]:
def save_raw(model_name, data):
    """Pickle ``data`` to ``data/raw_scores_<model_name>.pkl``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten. The ``data/`` directory is not created here
    and must already exist. Returns ``None``.
    """
    target_path = f'data/raw_scores_{model_name}.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(data, handle)

In [45]:
# Unified function for hugging face models
def compute_hf(model_name, short, df, device='mps'):
    """Score every text in ``df`` with a Hugging Face text-classification model.

    The per-row results are collected into a dict keyed by ``df['id']`` and
    written to disk through ``save_raw(short, ...)``. Nothing is returned, so
    calling this as the last expression of a cell does not echo the scores.

    Parameters
    ----------
    model_name : str
        Model identifier handed to ``transformers.pipeline``.
    short : str
        Short name used by ``save_raw`` to build the output file name.
    df : pandas.DataFrame
        Needs an ``id`` column (dict keys) and a ``text`` column (model input).
    device : str, optional
        Device handed to the pipeline. Defaults to ``'mps'``, the value that
        used to be hard-coded, so existing calls behave as before.

    Raises
    ------
    ValueError
        If the pipeline returns a different number of results than ``df``
        has rows; nothing is saved in that case.
    """
    # The pipeline only needs the raw strings, so hand it a plain list.
    texts = list(df['text'])

    # top_k=None requests the score of every label instead of only the best one.
    model = pipeline("text-classification", model=model_name, top_k=None, device=device)

    # Timing starts after the model is loaded: only inference (plus building
    # the result dict) is measured.
    s = time.time()
    scores = model(texts)

    if len(scores) != len(df):
        raise ValueError(f"Length mismatch: df has {len(df)} rows but scores has {len(scores)} elements")

    scores = dict(zip(df['id'], scores))

    total = time.time() - s
    print('Time', round(total))

    save_raw(short, scores)

## NRC

In [61]:
from nrclex import NRCLex

# NRC lexicon: build one NRCLex object per text and keep its `top_emotions`,
# keyed by the row's id. The result is pickled via save_raw under 'nrc'.
model_name = 'nrc'
print('NRC')

df = pd.read_csv('data/dataset.csv', quoting=1, escapechar='\\', doublequote=True)
#df = df.head(100000)
s = time.time()

# Walk the two needed columns in lockstep instead of df.iterrows(): iterrows
# builds a Series for every row and may change dtypes, whereas zip keeps the
# id values as stored and matches how the other cells pair ids with scores.
scores = {
    text_id: NRCLex(text).top_emotions
    for text_id, text in zip(df['id'], df['text'])
}

total = time.time() - s
print('Time', total)

save_raw(model_name, scores)

NRC
Time 280.5289590358734


## Vader

In [62]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
vader_model = SentimentIntensityAnalyzer()

# VADER: one analyzer instance is reused for every text; the result of
# polarity_scores() is stored per id and pickled via save_raw under 'vader'.
model_name = 'vader'
print('Vader')

df = pd.read_csv('data/dataset.csv', quoting=1, escapechar='\\', doublequote=True)
#df = df.head(100000)
s = time.time()

# Walk the two needed columns in lockstep instead of df.iterrows(): iterrows
# builds a Series for every row and may change dtypes, whereas zip keeps the
# id values as stored and matches how the other cells pair ids with scores.
scores = {
    text_id: vader_model.polarity_scores(text)
    for text_id, text in zip(df['id'], df['text'])
}

total = time.time() - s
print('Time', total)

save_raw(model_name, scores)

Vader
Time 154.49059987068176


## Pysentimiento

In [30]:
from pysentimiento import create_analyzer
from pysentimiento.preprocessing import preprocess_tweet

# Build both English analyzers once, one per task, so the scoring cells
# can reuse them without re-creating the models.
senti_model = create_analyzer(task="sentiment", lang="en")
emo_model = create_analyzer(task="emotion", lang="en")

In [35]:
# Pysentimiento sentiment analyzer on the first 100,000 rows; results are
# keyed by id and pickled via save_raw under 'pysentimiento_senti'.
model_name = 'pysentimiento_senti'
print('Pysentimiento (senti)')

df = pd.read_csv('data/dataset.csv', quoting=1, escapechar='\\', doublequote=True)
df = df.head(100000)

s = time.time()
senti_scores = senti_model.predict(df['text'])

# zip() stops at the shorter input, so a short result would silently drop
# rows from the saved file. Fail loudly instead, as compute_hf does.
if len(senti_scores) != len(df):
    raise ValueError(f"Length mismatch: df has {len(df)} rows but senti_scores has {len(senti_scores)} elements")

scores = dict(zip(df['id'], senti_scores))

total = time.time() - s
print('Time', round(total))
save_raw(model_name, scores)

Pysentimiento (senti)


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Time 669


In [36]:
# Pysentimiento emotion analyzer on the first 100,000 rows; results are
# keyed by id and pickled via save_raw under 'pysentimiento_emo'.
model_name = 'pysentimiento_emo'
print('Pysentimiento (emo)')

df = pd.read_csv('data/dataset.csv', quoting=1, escapechar='\\', doublequote=True)
df = df.head(100000)

s = time.time()
emo_scores = emo_model.predict(df['text'])

# zip() stops at the shorter input, so a short result would silently drop
# rows from the saved file. Fail loudly instead, as compute_hf does.
if len(emo_scores) != len(df):
    raise ValueError(f"Length mismatch: df has {len(df)} rows but emo_scores has {len(emo_scores)} elements")

scores = dict(zip(df['id'], emo_scores))

total = time.time() - s
print('Time', round(total))
save_raw(model_name, scores)

Pysentimiento (emo)


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Time 679


## Hartmann

In [57]:
# Score the first 100,000 rows with the Hartmann model; compute_hf saves the
# result under the short name 'hartmann'.
print('Hartmann')
model_name = 'j-hartmann/emotion-english-distilroberta-base'

df = pd.read_csv(
    'data/dataset.csv',
    quoting=1,
    escapechar='\\',
    doublequote=True,
).head(100000)
compute_hf(model_name, 'hartmann', df)

Hartmann
Time 681


## Cardiff

In [58]:
# Score the first 100,000 rows with the Cardiff model.
# NOTE(review): the short name is spelled 'cardif' (single "f"), so the output
# lands in data/raw_scores_cardif.pkl. Kept as is; rename only together with
# whatever reads that file.
print('Cardiff')
model_name = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

df = pd.read_csv(
    'data/dataset.csv',
    quoting=1,
    escapechar='\\',
    doublequote=True,
).head(100000)
compute_hf(model_name, 'cardif', df)

Cardiff
Time 844


## Siebert

In [59]:
# Score the first 100,000 rows with the Siebert model; compute_hf saves the
# result under the short name 'siebert'.
print('Siebert')
model_name = 'siebert/sentiment-roberta-large-english'

df = pd.read_csv(
    'data/dataset.csv',
    quoting=1,
    escapechar='\\',
    doublequote=True,
).head(100000)
compute_hf(model_name, 'siebert', df)

Siebert
Time 1934


## LEIA

In [60]:
# Score the first 100,000 rows with the LEIA model; compute_hf saves the
# result under the short name 'leia'.
print('LEIA')
model_name = 'LEIA/LEIA-large'

df = pd.read_csv(
    'data/dataset.csv',
    quoting=1,
    escapechar='\\',
    doublequote=True,
).head(100000)
compute_hf(model_name, 'leia', df)

LEIA
Time 1826


## LIWC

Done via LIWC software

## LLMs

Results provided by Segun