# Reshape dataset to fit Llama 3.2 format

In [None]:
!python -m spacy download en_core_web_sm

In [1]:
import json
import re
import numpy as np
import pandas as pd

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

import spacy
nlp = spacy.load("en_core_web_sm")

from data import go_emotions, uk2us, nrc_emotions, emotag
#from functions import americanize

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Convert UK to US Spelling

In [4]:
# Function to replace British spellings with American spellings
def americanize(string):
    for british_spelling, american_spelling in uk2us.items():
        string = re.sub(f'(?<![a-zA-Z]){british_spelling}(?![a-zA-Z])', american_spelling, string)
    return string

In [None]:
df_train = go_emotions['train']
df_test = go_emotions['test']
df_val = go_emotions['val']

In [126]:
df_train['us_text'] = df_train['text'].parallel_apply(americanize)
df_test['us_text'] = df_test['text'].parallel_apply(americanize)
df_val['us_text'] = df_val['text'].parallel_apply(americanize)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5427), Label(value='0 / 5427'))), …

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()


In [None]:
df_train.to_parquet("output/goemotions_train_us.parquet", index=False)
df_test.to_parquet("output/goemotions_test_us.parquet", index=False)
df_val.to_parquet("output/goemotions_val_us.parquet", index=False)

In [2]:
df_train = pd.read_parquet("output/goemotions_train_us.parquet")
df_test = pd.read_parquet("output/goemotions_test_us.parquet")
df_val = pd.read_parquet("output/goemotions_val_us.parquet")

## Create N-Grams for Dictionary-Based Labelling

In [9]:
# Create a dictionary mapping words to their associated emotions
nrc_emotions_grouped = nrc_emotions[nrc_emotions['association']!=0].groupby('word')['emotion'].apply(list).reset_index()
d_nrc_emotions = {i['word']:i['emotion'] for i in nrc_emotions_grouped.to_dict(orient='records')}
# Save to json
with open("data/nrc_lexicon/nrc_emotions.json", "w") as f:
    json.dump(d_nrc_emotions, f, indent=4)

In [4]:
def create_ngrams(text, n=2, dic=d_nrc_emotions):
    l_ngrams = []
    l_emotions = []
    for w, e in dic.items():
        if w in text:
            # Tokenize the text into words
            words = text.split()
            
            ngrams = []
            emotions = []
            
            # Loop through the words to find the specified word
            for i in range(len(words)):
                if words[i].lower() == w:
                    # Collect n-grams centered around the specified word
                    start = max(0, i - (n // 2))
                    end = min(len(words), i + (n // 2) + 1)
                    ngram = " ".join(words[start:end])

                    # Check for negation
                    doc = nlp(ngram)
                    if all(token.dep_ != "neg" for token in doc):
                        ngrams.append(ngram)
                        emotions.append(e)
            
            l_ngrams.append(ngrams)
            l_emotions.append(emotions)
    
    return (
        [x for x in l_ngrams if x], 
        [x for x in l_emotions if x]
    )

In [5]:
create_ngrams("I've never been this sad in my life")

([['this sad in']], [[['sadness']]])

In [7]:
df_train[['ngrams', 'emotions']] = pd.DataFrame(
    df_train['us_text'].parallel_apply(create_ngrams).tolist(), index=df_train.index
)
df_test[['ngrams', 'emotions']] = pd.DataFrame(
    df_test['us_text'].parallel_apply(create_ngrams).tolist(), index=df_test.index
)
df_val[['ngrams', 'emotions']] = pd.DataFrame(
    df_val['us_text'].parallel_apply(create_ngrams).tolist(), index=df_val.index
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5427), Label(value='0 / 5427'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=679), Label(value='0 / 679'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=679), Label(value='0 / 679'))), HB…

In [10]:
with open('data/go_emotions/emotions.txt', "r") as file:
    lines = file.readlines()

# Remove any trailing newline characters
lines = [line.strip() for line in lines]

d_go_emotions = {i:e for i,e in enumerate(lines)}

In [17]:
def get_emo(labels):
    return [d_go_emotions[label] for label in labels]

df_train['l_emotions'] = df_train['labels'].apply(lambda x: [d_go_emotions[label] for label in x])
df_train['emotions'] = df_train['l_emotions'].apply(str)

In [18]:
df_train

Unnamed: 0,text,labels,us_text,ngrams,emotions,l_emotions
0,My favourite food is anything I didn't have to...,[27],My favorite food is anything I didn't have to ...,"[[My favorite food], [favorite food is]]",['neutral'],[neutral]
1,"Now if he does off himself, everyone will thin...",[27],"Now if he does off himself, everyone will thin...",[[a laugh screwing]],['neutral'],[neutral]
2,WHY THE FUCK IS BAYLESS ISOING,[2],WHY THE FUCK IS BAYLESS ISOING,[],['anger'],[anger]
3,To make her feel threatened,[14],To make her feel threatened,[],['fear'],[fear]
4,Dirty Southern Wankers,[3],Dirty Southern Wankers,[],['annoyance'],[annoyance]
...,...,...,...,...,...,...
43405,Added you mate well I’ve just got the bow and ...,[18],Added you mate well I’ve just got the bow and ...,"[[so happily join], [the hunting aspect, you h...",['love'],[love]
43406,Always thought that was funny but is it a refe...,[6],Always thought that was funny but is it a refe...,[[Always thought that]],['confusion'],[confusion]
43407,What are you talking about? Anything bad that ...,[3],What are you talking about? Anything bad that ...,"[[Anything bad that], [[NAME] fault -], [only ...",['annoyance'],[annoyance]
43408,"More like a baptism, with sexy results!",[13],"More like a baptism, with sexy results!",[],['excitement'],[excitement]
