In [1]:
import pandas as pd
import spacy
import re
from collections import Counter

In [2]:
#To use parquet to load the HuggingFace data into a pandas dataframe, we need to install 3 dependencies: 
# pyarrow, fastparquet and huggingface_hub (no need to import them, just install them, they are dependency packages to pandas)

In [3]:
# Load the terms of the keywords from the txt file
filename = 'SocialMediaTerms.txt'
with open(filename, 'r') as file:
        text = file.read()
terms = text.splitlines()
print(terms)

['like', 'comment', 'post', 'follow', 'tweet', 'agradar', 'comentari', 'comentar', 'publicació', 'publicar', 'seguir', 'tuitar', 'tuitejar', 'tuit', 'piular', 'piulada', 'piulet', 'gustar', 'tuitear']


In [4]:
# Split list of terms into English and Catalan/Spanish (some tokens are equal in both languages)
eng_terms = terms[:5]
cat_sp_terms = terms[5:]
print(f"ENG terms: {', '.join(eng_terms)}")
print(f"CAT/SPA terms: {', '.join(cat_sp_terms)}")

ENG terms: like, comment, post, follow, tweet
CAT/SPA terms: agradar, comentari, comentar, publicació, publicar, seguir, tuitar, tuitejar, tuit, piular, piulada, piulet, gustar, tuitear


In [5]:
# Load HuggingFace dataset into pandas dataframe
df_corpus = pd.read_parquet("hf://datasets/projecte-aina/CaSERa-catalan-stance-emotions-raco/data/train-00000-of-00001.parquet")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Visualize data structure
df_corpus.head()

Unnamed: 0,id_conversation,id_reply,parent_text,reply_text,dynamic_stance,parent_emotion,reply_emotion
0,9310554,9310554_2_1,S'ha aturat l'activitat parlamentària. Hi ha d...,Theresa May diu que l'atacant era de nacionali...,2,"[no emotion, fear]","[surprise, anger, distrust]"
1,626929,626929_1_1,Hola a tots. El Rally Lisboa-Dakar està a punt...,"Males notícies en el Dakar, ja que ha mort un ...",4,"[anticipation, sadness, joy]","[sadness, disgust]"
2,1151534,1151534_2_1,Us heu trobat algu cop algun castellanoparlant...,Si a TV3 es sentíssin totes les varietats de l...,2,"[anticipation, anger, joy]","[sadness, anger]"
3,310430,310430_1_1,Com és pot apreciar i segons dades de la Duana...,Com voleu que la gent no begui cava??? Sí és u...,2,"[no emotion, anger]","[disgust, distrust, joy]"
4,2995973,2995973_1_1,Ara mateix estic pensant en allò que agrada ta...,"En Carretero ho diu ben clar, bé, de fet ja va...",2,"[anger, fear, distrust]","[anger, joy]"


In [7]:
# Merge text from 2 columns in the DF: parent_text and reply_text into a single pandas dataframe
posts_df = pd.DataFrame(df_corpus['parent_text'] + df_corpus['reply_text'], columns=['Text'])

In [8]:
# See new dataframe with only the text of the forum's posts
print(posts_df.shape)
posts_df.head()

(13999, 1)


Unnamed: 0,Text
0,S'ha aturat l'activitat parlamentària. Hi ha d...
1,Hola a tots. El Rally Lisboa-Dakar està a punt...
2,Us heu trobat algu cop algun castellanoparlant...
3,Com és pot apreciar i segons dades de la Duana...
4,Ara mateix estic pensant en allò que agrada ta...


In [9]:
# Load the Catalan model in spacy - to tokenise and lemmatise
nlp = spacy.load("ca_core_news_md")

In [10]:
# POS tags we want to keep in the texts - Noun, Verb, Proper Noun and 'Other' (in case the tagger does not know what to do with the English words)
pos_keep = ['NOUN', 'VERB', 'PROPN', 'X']

In [11]:
# Create function to tokenize/lemmatise a string of text and
# return the text with the lemmas (create a string from joining all lemmas by space)
# this is to make it easier to use the regex package to find matches
def tokenise_lemmatise_text(sentence: str) -> str:
    ''' Given a string of text, the function tokenises it and lemmatises
    it with a Catalan language pipeline
    sentence -> the text we want to process
    return -> a string with each element lemmatised. Lemmas separated by space
    '''
    # Apply the spacy pipeline to the lowercase text
    doc = nlp(sentence.lower())
    # Create a list with all the lemmas with the POS we want
    lemmas = [w.lemma_ for w in doc if w.pos_ in pos_keep]
    # Convert into a string for easier manipulation
    text = " ".join(lemmas)
    return text

In [12]:
# Function to find our keywords in the lemmatised texts
def search_keywords(text: str, wordlist: list) -> str:
    ''' Given a string of text, we try to find if it contains any of
    the keywords defined in the terms list
    text -> the text we want to search through
    wordlist -> the list of words we want to search for
    return -> a string with all the terms found in the text, separated by comma. If none is found, returns "NA"
    '''
    found_terms = []
    # Iterate over all the keyword terms from the datafile
    for word in wordlist:
        # See if a match is found in the text -> if so, add to list
        if re.search(word, text, flags=re.IGNORECASE) != None:
            found_terms.append(word)
            
    # Convert to string for easier manipulation and to unify return types
    return ", ".join(found_terms) if len(found_terms) > 0 else "NA"


In [None]:
# Function to split strings (rows) with multiple terms -> set to a new list
def split_strings_list(wordlist: list) -> list:
    ''' Given a list of strings, it splits the contents of each string into a list using the
    separator ", " defined previously.
    wordlist -> list of strings with list-like format, separated by comma and space
    return -> returns a list of all the individual words
    '''
    split_strings = []
    for item in wordlist:
        temp = item.split(', ') # Split the string by the previously defined separator: ", "
        split_strings.extend(temp) # Add result on the same level (no list of lists)
    return split_strings

In [14]:
# Apply the tokeniser-lemmatiser function to all rows of the dataframe
posts_df['Lemmatised'] = posts_df['Text'].apply(lambda row: tokenise_lemmatise_text(row))

In [15]:
# Apply the search function to all rows of the dataframe to see which ones contain any of our keywords
# First we apply with the English terms
posts_df['English_keywords'] = posts_df['Lemmatised'].apply(lambda row: search_keywords(row, eng_terms))
# Then we apply with the Catalan/Spanish terms
posts_df['Cat_Sp_keywords'] = posts_df['Lemmatised'].apply(lambda row: search_keywords(row, cat_sp_terms))

In [16]:
posts_df.head()

Unnamed: 0,Text,Lemmatised,English_keywords,Cat_Sp_keywords
0,S'ha aturat l'activitat parlamentària. Hi ha d...,aturar activitat haver video oficial commotion...,,
1,Hola a tots. El Rally Lisboa-Dakar està a punt...,holar rally lisboa-dakar punt acabar pilot mar...,,
2,Us heu trobat algu cop algun castellanoparlant...,trobar cop castellanoparlant dir entendre parl...,,seguir
3,Com és pot apreciar i segons dades de la Duana...,apreciar dada duana cava perjudicar declaració...,,agradar
4,Ara mateix estic pensant en allò que agrada ta...,pensar agradar pp empadronar persona casa fer ...,,agradar


In [17]:
# Filter rows of dataframe that contain English terms
posts_df[posts_df['English_keywords'] != "NA"].shape

(1333, 4)

In [18]:
posts_df[posts_df['English_keywords'] != "NA"].head()

Unnamed: 0,Text,Lemmatised,English_keywords,Cat_Sp_keywords
9,Davant la possibilitat realista que algun dels...,possibilitat pig acrònim portugal italy greece...,post,
18,Text rebut per correu electrònic (perdoneu la ...,text correu perdonar llargada oració mare déu ...,post,
23,Desrprés de tot l'escàndol que ha portat aques...,escàndol portar vídeo baixa agradar saber opin...,post,agradar
32,"Entrevista a Arnaldo Otegi al diari Gara: ""L'E...",entrevista arnaldo otegi diari gara estat assu...,post,publicar
38,Aldarulls després de la derrota dels Vancouver...,aldarull derrota vancouver canucks boston part...,post,


In [19]:
# Extract the English keywords found and see which ones appear and with which frequency
eng_results = posts_df[posts_df['English_keywords'] != "NA"]['English_keywords'].tolist() # Filter column (no "NA") and cast to list
eng_found_terms = split_strings_list(eng_results) # Split the rows that contain more than one keyword
Counter(eng_found_terms)

Counter({'post': 1300, 'like': 25, 'comment': 9, 'follow': 7, 'tweet': 4})

In [20]:
# Filter rows of dataframe that contain Catalan or Spanish terms
posts_df[posts_df['Cat_Sp_keywords'] != "NA"].shape

(3774, 4)

In [21]:
posts_df[posts_df['English_keywords'] != "NA"].head()

Unnamed: 0,Text,Lemmatised,English_keywords,Cat_Sp_keywords
9,Davant la possibilitat realista que algun dels...,possibilitat pig acrònim portugal italy greece...,post,
18,Text rebut per correu electrònic (perdoneu la ...,text correu perdonar llargada oració mare déu ...,post,
23,Desrprés de tot l'escàndol que ha portat aques...,escàndol portar vídeo baixa agradar saber opin...,post,agradar
32,"Entrevista a Arnaldo Otegi al diari Gara: ""L'E...",entrevista arnaldo otegi diari gara estat assu...,post,publicar
38,Aldarulls després de la derrota dels Vancouver...,aldarull derrota vancouver canucks boston part...,post,


In [22]:
# Extract the Catalan/Spanish keywords found and see which ones appear and with which frequency
cat_sp_results = posts_df[posts_df['Cat_Sp_keywords'] != "NA"]['Cat_Sp_keywords'].tolist() # Filter column (no "NA") and cast to list
cat_sp_found_terms = split_strings_list(cat_sp_results) # Split the rows that contain more than one keyword
Counter(cat_sp_found_terms)

Counter({'seguir': 1612,
         'agradar': 1539,
         'comentar': 707,
         'comentari': 376,
         'publicar': 239,
         'publicació': 66,
         'tuit': 42,
         'piulada': 38,
         'gustar': 9,
         'tuitar': 6,
         'tuitejar': 4,
         'piular': 4,
         'piulet': 2})