## Computing raw scores produced by different models

In [1]:
import warnings
# Suppress every Python warning for the rest of the kernel session
# (presumably to keep the long-running cells' output readable).
# NOTE(review): this also hides deprecation/runtime warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import pickle
import time
from datasets import Dataset
from transformers import pipeline

In [3]:
def save_raw(model_name, data):
    """Pickle `data` to ``data/raw_scores_{model_name}.pkl``.

    Parameters
    ----------
    model_name : str
        Short identifier embedded in the output file name.
    data : object
        Any picklable object; in this notebook a dict mapping row id to the
        raw model output.

    Notes
    -----
    The target directory is created when missing, so the first save on a
    fresh checkout does not fail with ``FileNotFoundError``. An existing file
    with the same name is overwritten.
    """
    import os  # local import: `os` is not part of the notebook-level import cell

    path = f'data/raw_scores_{model_name}.pkl'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(data, f)

In [4]:
# Unified function for Hugging Face models
def compute_hf(model_name, short, df, device='mps'):
    """Score every row of `df` with a text-classification pipeline and save the result.

    Parameters
    ----------
    model_name : str
        Model identifier handed to `transformers.pipeline`.
    short : str
        Short name forwarded to `save_raw`; it determines the output file name.
    df : pandas.DataFrame
        Must provide the columns 'id' and 'text'.
    device : str, default 'mps'
        Device forwarded to the pipeline. The default keeps the previous
        hard-coded behaviour; pass e.g. 'cpu' on other machines.

    Raises
    ------
    ValueError
        If the pipeline returns a different number of results than `df` has rows.

    Returns nothing; the ``{id: pipeline output}`` dict is written via `save_raw`.
    """
    dataset = Dataset.from_pandas(df.reset_index()[['text']])

    # top_k=None requests the scores of all labels instead of only the best one.
    model = pipeline("text-classification", model=model_name, top_k=None, device=device)

    # The timer covers inference and building the id -> score mapping, not model loading.
    s = time.time()
    scores = model(dataset['text'])

    if len(scores) != len(df):
        raise ValueError(f"Length mismatch: df has {len(df)} rows but scores has {len(scores)} elements")

    # Pair each row id with its result by position.
    scores = dict(zip(df['id'], scores))

    total = time.time() - s
    print('Time', round(total))

    save_raw(short, scores)

In [5]:
# Chunk-level variant: scores one slice with an already-loaded pipeline and returns the result
def compute_hf_chunk(model, chunk_df):
    """Run `model` on the 'text' column of `chunk_df` and key the outputs by 'id'.

    Parameters
    ----------
    model : callable
        Called with the chunk's texts; must return one result per text.
    chunk_df : pandas.DataFrame
        Slice providing the columns 'id' and 'text'.

    Returns
    -------
    dict
        Maps each value of ``chunk_df['id']`` to the model result at the same position.

    Raises
    ------
    ValueError
        If the number of results differs from the number of rows.
    """
    text_frame = chunk_df.reset_index()[['text']]
    dataset = Dataset.from_pandas(text_frame)

    # Timing starts right before inference and ends after the mapping is built.
    started = time.time()
    scores = model(dataset['text'])

    if not len(scores) == len(chunk_df):
        raise ValueError(f"Length mismatch: chunk_df has {len(chunk_df)} rows but scores has {len(scores)} elements")

    by_id = dict(zip(chunk_df['id'], scores))

    elapsed = time.time() - started
    print('Time', round(elapsed))

    return by_id

def compute_hf_chunks(model_name, short, df, chunk_size=100000, device='mps'):
    """Score `df` chunk by chunk with a text-classification pipeline.

    Every chunk is saved on its own as ``{short}_chunk_{k}`` (k starting at 1)
    as soon as it is finished, and the merged result is saved as ``short`` at
    the end, both through `save_raw`.

    Parameters
    ----------
    model_name : str
        Model identifier handed to `transformers.pipeline`.
    short : str
        Short name used to build the output file names.
    df : pandas.DataFrame
        Must provide the columns 'id' and 'text'.
    chunk_size : int, default 100000
        Number of rows per chunk; must be positive.
    device : str, default 'mps'
        Device forwarded to the pipeline. The default keeps the previous
        hard-coded behaviour.

    Raises
    ------
    ValueError
        If `chunk_size` is not positive (previously 0 raised ZeroDivisionError
        and a negative value silently produced an empty result).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    total_rows = len(df)
    total_chunks = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division
    all_scores = {}

    # Load the model once and reuse it for every chunk.
    model = pipeline("text-classification", model=model_name, top_k=None, device=device)

    # Process data in chunks; chunk numbers are 1-based in messages and file names.
    for chunk_number, start_idx in enumerate(range(0, total_rows, chunk_size), start=1):
        end_idx = min(start_idx + chunk_size, total_rows)

        print(f"Processing chunk {chunk_number}/{total_chunks} (rows {start_idx} to {end_idx-1})")

        # Positional slice, independent of the frame's index labels.
        chunk_df = df.iloc[start_idx:end_idx]
        chunk_scores = compute_hf_chunk(model, chunk_df)
        all_scores.update(chunk_scores)

        # Persist the finished chunk right away.
        save_raw(f"{short}_chunk_{chunk_number}", chunk_scores)

    # Save the complete results
    save_raw(short, all_scores)

## NRC

In [61]:
from nrclex import NRCLex

model_name = 'nrc'
print('NRC')

# quoting=1 is csv.QUOTE_ALL
df = pd.read_csv('data/dataset.csv', quoting=1, escapechar='\\', doublequote=True)
#df = df.head(100000)
s = time.time()

# Iterate over the two needed columns directly instead of df.iterrows():
# iterrows builds a Series per row (slow on a large frame) and does not
# preserve column dtypes, whereas zip keys the dict exactly like compute_hf does.
scores = {
    row_id: NRCLex(text).top_emotions
    for row_id, text in zip(df['id'], df['text'])
}

total = time.time() - s
print('Time', total)

save_raw(model_name, scores)

NRC
Time 280.5289590358734


## Vader

In [62]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
vader_model = SentimentIntensityAnalyzer()

model_name = 'vader'
print('Vader')

# quoting=1 is csv.QUOTE_ALL
df = pd.read_csv('data/dataset.csv', quoting=1, escapechar='\\', doublequote=True)
#df = df.head(100000)
s = time.time()

# Iterate over the two needed columns directly instead of df.iterrows():
# iterrows builds a Series per row (slow on a large frame) and does not
# preserve column dtypes, whereas zip keys the dict exactly like compute_hf does.
scores = {
    row_id: vader_model.polarity_scores(text)
    for row_id, text in zip(df['id'], df['text'])
}

total = time.time() - s
print('Time', total)

save_raw(model_name, scores)

Vader
Time 154.49059987068176


## Pysentimiento

In [5]:
from pysentimiento.preprocessing import preprocess_tweet
from pysentimiento import create_analyzer

# Both analyzers are created here: the sentiment cell below calls
# senti_model.predict(...) and the emotion cell calls emo_model.predict(...).
# With the sentiment analyzer commented out, a fresh-kernel run failed with
# NameError on `senti_model`.
senti_model = create_analyzer(task="sentiment", lang='en')
emo_model = create_analyzer(task="emotion", lang="en")

In [65]:
# Sentiment scores from pysentimiento; `senti_model` must already exist in the kernel.
model_name = 'pysentimiento_senti'
print('Pysentimiento (senti)')

df = pd.read_csv(
    'data/dataset.csv',
    quoting=1,
    escapechar='\\',
    doublequote=True,
)
#df = df.head(100000)

# One timer for prediction plus building the id -> prediction mapping.
s = time.time()
senti_scores = senti_model.predict(df['text'])

scores = dict(zip(df['id'], senti_scores))

total = time.time() - s
print('Time', round(total))
save_raw(model_name, scores)

Pysentimiento (senti)


Map:   0%|          | 0/1711514 [00:00<?, ? examples/s]

Time 14852


In [None]:
# Emotion scores from pysentimiento, using the `emo_model` analyzer created above.
model_name = 'pysentimiento_emo'
print('Pysentimiento (emo)')

df = pd.read_csv(
    'data/dataset.csv',
    quoting=1,
    escapechar='\\',
    doublequote=True,
)
#df = df.head(100000)

# One timer for prediction plus building the id -> prediction mapping.
s = time.time()
emo_scores = emo_model.predict(df['text'])

scores = dict(zip(df['id'], emo_scores))

total = time.time() - s
print('Time', round(total))
save_raw(model_name, scores)

Pysentimiento (emo)


Map:   0%|          | 0/1711514 [00:00<?, ? examples/s]

## Hartmann

In [57]:
# Select the Hugging Face checkpoint for this section and (re)load the full dataset.
print('Hartmann')
model_name = 'j-hartmann/emotion-english-distilroberta-base'
df = pd.read_csv(
    'data/dataset.csv',
    quoting=1,
    escapechar='\\',
    doublequote=True,
)

Hartmann
Time 681


In [None]:
# Trial run on the first 100,000 rows only. The subset gets its own name so that
# `df` remains the full dataset for the chunked run in the next cell.
# Note: compute_hf saves under the same short name as the chunked run, so the
# full run later overwrites this file.
df_sample = df.head(100000)
compute_hf(model_name, 'hartmann', df_sample)

In [11]:
# Full run over `df` in chunks (default chunk size), with per-chunk saves.
compute_hf_chunks(model_name=model_name, short='hartmann', df=df)

Hartmann
Processing chunk 1/18 (rows 0 to 99999)
Time 555
Processing chunk 2/18 (rows 100000 to 199999)
Time 2288
Processing chunk 3/18 (rows 200000 to 299999)
Time 503
Processing chunk 4/18 (rows 300000 to 399999)
Time 626
Processing chunk 5/18 (rows 400000 to 499999)
Time 505
Processing chunk 6/18 (rows 500000 to 599999)
Time 519
Processing chunk 7/18 (rows 600000 to 699999)
Time 510
Processing chunk 8/18 (rows 700000 to 799999)
Time 509
Processing chunk 9/18 (rows 800000 to 899999)
Time 513
Processing chunk 10/18 (rows 900000 to 999999)
Time 509
Processing chunk 11/18 (rows 1000000 to 1099999)
Time 532
Processing chunk 12/18 (rows 1100000 to 1199999)
Time 517
Processing chunk 13/18 (rows 1200000 to 1299999)
Time 499
Processing chunk 14/18 (rows 1300000 to 1399999)
Time 506
Processing chunk 15/18 (rows 1400000 to 1499999)
Time 509
Processing chunk 16/18 (rows 1500000 to 1599999)
Time 509
Processing chunk 17/18 (rows 1600000 to 1699999)
Time 504
Processing chunk 18/18 (rows 1700000 to

## Cardiff

In [6]:
# Select the Hugging Face checkpoint for this section and (re)load the full dataset.
print('Cardiff')
model_name = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
df = pd.read_csv(
    'data/dataset.csv',
    quoting=1,
    escapechar='\\',
    doublequote=True,
)

Cardiff


In [None]:
# Trial run on the first 100,000 rows only. The subset gets its own name so that
# `df` remains the full dataset for the chunked run in the next cell.
# The short name 'cardif' (sic) is kept unchanged because it determines the output file name.
df_sample = df.head(100000)
compute_hf(model_name, 'cardif', df_sample)

In [7]:
# Full run over `df` in chunks (default chunk size), with per-chunk saves.
compute_hf_chunks(model_name=model_name, short='cardif', df=df)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing chunk 1/18 (rows 0 to 99999)
Time 899
Processing chunk 2/18 (rows 100000 to 199999)
Time 863
Processing chunk 3/18 (rows 200000 to 299999)
Time 843
Processing chunk 4/18 (rows 300000 to 399999)
Time 826
Processing chunk 5/18 (rows 400000 to 499999)
Time 840
Processing chunk 6/18 (rows 500000 to 599999)
Time 835
Processing chunk 7/18 (rows 600000 to 699999)
Time 838
Processing chunk 8/18 (rows 700000 to 799999)
Time 834
Processing chunk 9/18 (rows 800000 to 899999)
Time 2188
Processing chunk 10/18 (rows 900000 to 999999)
Time 818
Processing chunk 11/18 (rows 1000000 to 1099999)
Time 854
Processing chunk 12/18 (rows 1100000 to 1199999)
Time 887
Processing chunk 13/18 (rows 1200000 to 1299999)
Time 892
Processing chunk 14/18 (rows 1300000 to 1399999)
Time 854
Processing chunk 15/18 (rows 1400000 to 1499999)
Time 857
Processing chunk 16/18 (rows 1500000 to 1599999)
Time 972
Processing chunk 17/18 (rows 1600000 to 1699999)
Time 849
Processing chunk 18/18 (rows 1700000 to 1711513)

## Siebert

In [6]:
# Select the Hugging Face checkpoint for this section and (re)load the full dataset.
print('Siebert')
model_name = 'siebert/sentiment-roberta-large-english'
df = pd.read_csv(
    'data/dataset.csv',
    quoting=1,
    escapechar='\\',
    doublequote=True,
)

Siebert


In [None]:
# Trial run on the first 100,000 rows only. The subset gets its own name so that
# `df` remains the full dataset for the chunked run in the next cell.
df_sample = df.head(100000)
compute_hf(model_name, 'siebert', df_sample)

In [7]:
# Full run over `df` in chunks (default chunk size), with per-chunk saves.
compute_hf_chunks(model_name=model_name, short='siebert', df=df)

Processing chunk 1/18 (rows 0 to 99999)
Time 1891
Processing chunk 2/18 (rows 100000 to 199999)
Time 1873
Processing chunk 3/18 (rows 200000 to 299999)
Time 29154
Processing chunk 4/18 (rows 300000 to 399999)
Time 1781
Processing chunk 5/18 (rows 400000 to 499999)
Time 1797
Processing chunk 6/18 (rows 500000 to 599999)
Time 16084
Processing chunk 7/18 (rows 600000 to 699999)
Time 16484
Processing chunk 8/18 (rows 700000 to 799999)
Time 12824
Processing chunk 9/18 (rows 800000 to 899999)
Time 5783
Processing chunk 10/18 (rows 900000 to 999999)
Time 12954
Processing chunk 11/18 (rows 1000000 to 1099999)
Time 16190
Processing chunk 12/18 (rows 1100000 to 1199999)
Time 4335
Processing chunk 13/18 (rows 1200000 to 1299999)
Time 6724
Processing chunk 14/18 (rows 1300000 to 1399999)
Time 14920
Processing chunk 15/18 (rows 1400000 to 1499999)
Time 14647
Processing chunk 16/18 (rows 1500000 to 1599999)
Time 7155
Processing chunk 17/18 (rows 1600000 to 1699999)
Time 1841
Processing chunk 18/18 (

## LEIA

In [8]:
# Select the Hugging Face checkpoint for this section and (re)load the full dataset.
print('LEIA')
model_name = 'LEIA/LEIA-large'
df = pd.read_csv(
    'data/dataset.csv',
    quoting=1,
    escapechar='\\',
    doublequote=True,
)

LEIA


In [None]:
# Trial run on the first 100,000 rows only. The subset gets its own name so that
# `df` remains the full dataset for the chunked run in the next cell.
df_sample = df.head(100000)
compute_hf(model_name, 'leia', df_sample)

In [9]:
# Full run over `df` in chunks (default chunk size), with per-chunk saves.
compute_hf_chunks(model_name=model_name, short='leia', df=df)

Processing chunk 1/18 (rows 0 to 99999)
Time 1772
Processing chunk 2/18 (rows 100000 to 199999)
Time 1775
Processing chunk 3/18 (rows 200000 to 299999)
Time 1810
Processing chunk 4/18 (rows 300000 to 399999)
Time 1759
Processing chunk 5/18 (rows 400000 to 499999)
Time 1756
Processing chunk 6/18 (rows 500000 to 599999)
Time 1772
Processing chunk 7/18 (rows 600000 to 699999)
Time 1747
Processing chunk 8/18 (rows 700000 to 799999)
Time 1762
Processing chunk 9/18 (rows 800000 to 899999)
Time 1817
Processing chunk 10/18 (rows 900000 to 999999)
Time 4524
Processing chunk 11/18 (rows 1000000 to 1099999)
Time 1853
Processing chunk 12/18 (rows 1100000 to 1199999)
Time 1856
Processing chunk 13/18 (rows 1200000 to 1299999)
Time 2867
Processing chunk 14/18 (rows 1300000 to 1399999)
Time 1812
Processing chunk 15/18 (rows 1400000 to 1499999)
Time 1842
Processing chunk 16/18 (rows 1500000 to 1599999)
Time 1864
Processing chunk 17/18 (rows 1600000 to 1699999)
Time 1825
Processing chunk 18/18 (rows 170

## LIWC

Scores were computed outside this notebook using the LIWC software.

## LLMs

Results provided by Segun