<a href="https://colab.research.google.com/github/laiamr/tfm/blob/main/Scripts/TFM_Pipeline_1_DataCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas spacy spacy-langdetect

In [None]:
import pandas as pd
import spacy
from spacy.language import Language  # For custom pipeline components
from spacy_langdetect import LanguageDetector  # For language detection
import re
from collections import Counter

**CONSTANTS and SPACY SETUP**

In [None]:
# Prefix prepended to every path the notebook reads or writes.
# The file names below are complete URLs, so the prefix has to stay empty for
# `folderName + <file>` to resolve when reading.
folderName = ''

# File names
dataFile = 'https://raw.githubusercontent.com/laiamr/tfm/refs/heads/main/Data/CAT_Tweets.csv'
lexiconFile = 'https://raw.githubusercontent.com/laiamr/tfm/refs/heads/main/Data/WordList.csv'
annotationSampleFile = 'https://raw.githubusercontent.com/laiamr/tfm/refs/heads/main/Data/annotation_sample.csv'
tempAnnotatedSample = 'https://raw.githubusercontent.com/laiamr/tfm/refs/heads/main/Data/manually_annotated_sample.csv'
definitiveAnnotatedSample = 'https://raw.githubusercontent.com/laiamr/tfm/refs/heads/main/Data/annotated_sample.csv'

In [None]:
# LANGUAGE DETECTION SETUP
# Register a factory so the spacy_langdetect component can be added to a pipeline by name
@Language.factory("language_detector")
def create_language_detector(nlp, name):
    """Factory behind the "language_detector" pipe: returns a new LanguageDetector."""
    return LanguageDetector()

# Catalan spaCy model, with the language detector appended as the last pipeline step
nlp = spacy.load("ca_core_news_md")
nlp.add_pipe("language_detector", last=True)

# Function to check if the text is in catalan
def is_catalan(text):
    """Return 1 if `text` is detected as Catalan with a score above 0.9, else 0.

    The text is run through the module-level `nlp` pipeline and the verdict is
    read from `doc._.language`, a mapping with 'language' and 'score' entries.

    NOTE(review): spacy_langdetect builds on langdetect, whose results can
    reportedly vary between runs unless it is seeded - confirm if this flag
    has to be reproducible.
    """
    detected = nlp(text)._.language
    if detected['language'] != 'ca':
        return 0
    # int() keeps the result a plain 0/1 instead of a bool
    return int(detected['score'] > 0.9)

**LOAD THE TWO DATA FILES**

In [None]:
# Read the lexicon: the keywords to look for and the type of borrowing of each one
df_lex = pd.read_csv(
    folderName + lexiconFile,
    sep=';',
)
print(df_lex.shape)
df_lex.head()

(117, 7)


Unnamed: 0,original_lemma,loanword_lemma,loanword_form,loanword_pos,loanword_features,loanword_type,loanword_category
0,comment,comment,comment,"noun, verb",lemma,direct borrowing,foreign lexeme
1,tag,tag,tag,"noun, verb",lemma,direct borrowing,foreign lexeme
2,tag,tag,tags,noun,plural,direct borrowing,foreign lexeme
3,story,story,story,noun,lemma,direct borrowing,foreign lexeme
4,like,like,like,"noun, verb",lemma,direct borrowing,foreign lexeme


In [None]:
# Read the tweets; the first CSV column is used as the index
df = pd.read_csv(
    folderName + dataFile,
    sep=';',
    index_col=0,
)
print(df.shape)
df.head()

(15747, 4)


Unnamed: 0,id,searchQuery,text,timestamp
0,1619677524967190528,repiulet,"Un ruzi*, que he barrat tot d'una, vota pel me...",2023-01-29 12:43:00+00:00
1,1620168513376894976,retuitar,podeu demanar la dimissi√≥ de Sigfrid Gras sens...,2023-01-30 21:14:00+00:00
2,1619742793383170048,retuitar,"Perqu√®, retuitar? Perque fa falta",2023-01-29 17:02:00+00:00
3,1619666435776864256,retuitar,"En contra de retuitar genocides, per molt suc√≥...",2023-01-29 11:59:00+00:00
4,1619642801406509056,retuitar,Retuit si tu tamb√© creus que tampoc s'ha de re...,2023-01-29 10:25:00+00:00


**PRE-PROCESSING: DATA CLEANING**

In [None]:
# Discard the tweets whose text is missing (NA) and renumber the index from 0
df = df.dropna(subset=['text'], ignore_index=True)
print(df.shape)
df.head()

In [None]:
# Convert to lowercase both searchQuery and text
# The .str accessor passes missing values through as NaN. Only `text` has had its
# NA rows dropped, so a missing searchQuery must not break this cell
# (apply(str.lower) raises a TypeError on NaN).
df['searchQuery'] = df['searchQuery'].str.lower()
df['text'] = df['text'].str.lower()

In [None]:
# Clean text: remove newline chars, links, hashtags
def cleanText(t: str) -> str:
    """Return `t` without newlines, links and hashtags, with whitespace normalised.

    Steps, in order:
      1. newlines become spaces;
      2. links are removed, both full URLs (``http://``, ``https://``, ``www.``)
         and the scheme-less ``host/path`` form, whether they follow whitespace
         or open the text;
      3. hashtags are removed, including one that closes the text;
      4. runs of whitespace collapse to one space and both ends are trimmed.
    """
    t = re.sub(r'\n', ' ', t) # Remove newline chars
    # A link either opens the text (^) or follows whitespace. The first alternative
    # takes a whole URL with scheme / www.; the second is the scheme-less pattern.
    t = re.sub(r'(?:^|\s)(?:(?:https?:\/\/|www\.)\S+|\w*\.?(?:\w*\/)+\w*)', '', t) # Remove links
    # (?:\s|$) so that a hashtag at the very end of the text is removed as well
    t = re.sub(r'#[\w\\]*(?:\s|$)', '', t) # Remove hashtags
    t = re.sub(r'\s{2,}', ' ', t) # Remove duplicated spaces
    # Removing a leading link or a trailing hashtag can leave a stray edge space
    return t.strip()

In [None]:
# Store the cleaned version of every tweet in a new cleanText column
df['cleanText'] = df['text'].map(cleanText)

In [None]:
# Check the shape and preview the first rows now that cleanText has been added
print(df.shape)
df.head()

(15584, 5)


Unnamed: 0,id,searchQuery,text,timestamp,cleanText
0,1619677524967190528,repiulet,"un ruzi*, que he barrat tot d'una, vota pel me...",2023-01-29 12:43:00+00:00,"un ruzi*, que he barrat tot d'una, vota pel me..."
1,1620168513376894976,retuitar,podeu demanar la dimissi√≥ de sigfrid gras sens...,2023-01-30 21:14:00+00:00,podeu demanar la dimissi√≥ de sigfrid gras sens...
2,1619742793383170048,retuitar,"perqu√®, retuitar? perque fa falta",2023-01-29 17:02:00+00:00,"perqu√®, retuitar? perque fa falta"
3,1619666435776864256,retuitar,"en contra de retuitar genocides, per molt suc√≥...",2023-01-29 11:59:00+00:00,"en contra de retuitar genocides, per molt suc√≥..."
4,1619642801406509056,retuitar,retuit si tu tamb√© creus que tampoc s'ha de re...,2023-01-29 10:25:00+00:00,retuit si tu tamb√© creus que tampoc s'ha de re...


**PRE-PROCESSING: FILTERING**

In [None]:
# Language detection
# New column flagging each tweet: 1 when it is detected as Catalan, 0 otherwise
df['is_catalan'] = df['text'].map(is_catalan)

In [None]:
# Check the shape and preview the first rows now that is_catalan has been added
print(df.shape)
df.head()

(15584, 6)


Unnamed: 0,id,searchQuery,text,timestamp,cleanText,is_catalan
0,1619677524967190528,repiulet,"un ruzi*, que he barrat tot d'una, vota pel me...",2023-01-29 12:43:00+00:00,"un ruzi*, que he barrat tot d'una, vota pel me...",1
1,1620168513376894976,retuitar,podeu demanar la dimissi√≥ de sigfrid gras sens...,2023-01-30 21:14:00+00:00,podeu demanar la dimissi√≥ de sigfrid gras sens...,1
2,1619742793383170048,retuitar,"perqu√®, retuitar? perque fa falta",2023-01-29 17:02:00+00:00,"perqu√®, retuitar? perque fa falta",1
3,1619666435776864256,retuitar,"en contra de retuitar genocides, per molt suc√≥...",2023-01-29 11:59:00+00:00,"en contra de retuitar genocides, per molt suc√≥...",1
4,1619642801406509056,retuitar,retuit si tu tamb√© creus que tampoc s'ha de re...,2023-01-29 10:25:00+00:00,retuit si tu tamb√© creus que tampoc s'ha de re...,1


In [None]:
# Shape of the subset flagged as not Catalan
df.loc[df['is_catalan'].eq(0)].shape

(5896, 6)

In [None]:
# Shape of the subset flagged as Catalan
df.loc[df['is_catalan'].eq(1)].shape

(9688, 6)

In [None]:
# Filter - keep only the rows that are in Catalan
# Filtering and dropping the helper column in a single chain returns an independent
# frame. Dropping in place on the filtered slice is what raised pandas'
# SettingWithCopyWarning.
df_cat = df[df['is_catalan'] == 1].drop(columns='is_catalan')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat.drop('is_catalan', axis=1, inplace=True)


In [None]:
# Shape and first rows of the Catalan-only subset
print(df_cat.shape)
df_cat.head()

(9688, 5)


Unnamed: 0,id,searchQuery,text,timestamp,cleanText
0,1619677524967190528,repiulet,"un ruzi*, que he barrat tot d'una, vota pel me...",2023-01-29 12:43:00+00:00,"un ruzi*, que he barrat tot d'una, vota pel me..."
1,1620168513376894976,retuitar,podeu demanar la dimissi√≥ de sigfrid gras sens...,2023-01-30 21:14:00+00:00,podeu demanar la dimissi√≥ de sigfrid gras sens...
2,1619742793383170048,retuitar,"perqu√®, retuitar? perque fa falta",2023-01-29 17:02:00+00:00,"perqu√®, retuitar? perque fa falta"
3,1619666435776864256,retuitar,"en contra de retuitar genocides, per molt suc√≥...",2023-01-29 11:59:00+00:00,"en contra de retuitar genocides, per molt suc√≥..."
4,1619642801406509056,retuitar,retuit si tu tamb√© creus que tampoc s'ha de re...,2023-01-29 10:25:00+00:00,retuit si tu tamb√© creus que tampoc s'ha de re...


In [None]:
# From the lexicon file, we obtain all the keywords we want to search for in the texts
# Sorted so the keywords are always visited in the same order: a bare set of strings
# has no stable order across kernel sessions, which would reorder the keyword-match
# rows and make the later fixed-seed sample pick different rows on each run.
# Missing forms are dropped so they can never be searched for as a keyword.
kw_list = sorted(set(df_lex['loanword_form'].dropna()))
len(kw_list)

117

In [None]:
# Initialize list to store rows that contain the keywords (if a row contains more than one, it will be repeated for each word)
list_df_kw = []

# Build one whole-word pattern per keyword up front instead of once per tweet.
# re.escape makes every keyword match literally, so a form containing regex
# metacharacters can neither change nor break the pattern.
kw_patterns = [(word, re.compile(rf"\b{re.escape(word)}\b")) for word in kw_list]

# Iterate over each row of the dataframe
for _, row in df_cat.iterrows():
    row_values = row.to_list()
    clean_text = row['cleanText']
    # One output row per keyword found in the text; the keyword goes in an extra last column
    for word, pattern in kw_patterns:
        if pattern.search(clean_text):
            list_df_kw.append(row_values + [word])

In [None]:
# Turn the collected keyword matches into a DataFrame:
# the original tweet columns plus the keyword that was found
df_filt = pd.DataFrame(
    data=list_df_kw,
    columns=['id', 'searchQuery', 'text', 'timestamp', 'cleanText', 'foundWord'],
)
df_filt.head(n=10)

Unnamed: 0,id,searchQuery,text,timestamp,cleanText,foundWord
0,1619677524967190528,repiulet,"un ruzi*, que he barrat tot d'una, vota pel me...",2023-01-29 12:43:00+00:00,"un ruzi*, que he barrat tot d'una, vota pel me...",repiulet
1,1620168513376894976,retuitar,podeu demanar la dimissi√≥ de sigfrid gras sens...,2023-01-30 21:14:00+00:00,podeu demanar la dimissi√≥ de sigfrid gras sens...,retuitar
2,1619742793383170048,retuitar,"perqu√®, retuitar? perque fa falta",2023-01-29 17:02:00+00:00,"perqu√®, retuitar? perque fa falta",retuitar
3,1619666435776864256,retuitar,"en contra de retuitar genocides, per molt suc√≥...",2023-01-29 11:59:00+00:00,"en contra de retuitar genocides, per molt suc√≥...",retuitar
4,1619642801406509056,retuitar,retuit si tu tamb√© creus que tampoc s'ha de re...,2023-01-29 10:25:00+00:00,retuit si tu tamb√© creus que tampoc s'ha de re...,retuit
5,1619642801406509056,retuitar,retuit si tu tamb√© creus que tampoc s'ha de re...,2023-01-29 10:25:00+00:00,retuit si tu tamb√© creus que tampoc s'ha de re...,retuitar
6,1619493816993718272,retuitar,"@kanen49 si us plau, deixa de retuitar aquests...",2023-01-29 00:33:00+00:00,"@kanen49 si us plau, deixa de retuitar aquests...",retuitar
7,1620132857305001984,share,aneu perdent llen√ßols. al final tindreu conten...,2023-01-30 18:52:00+00:00,aneu perdent llen√ßols. al final tindreu conten...,perfil
8,1620132857305001984,share,aneu perdent llen√ßols. al final tindreu conten...,2023-01-30 18:52:00+00:00,aneu perdent llen√ßols. al final tindreu conten...,share
9,1620206220014665728,post,volv√≠ a borrar mis post en ig,2023-01-30 23:43:00+00:00,volv√≠ a borrar mis post en ig,post


In [None]:
# Number of (tweet, keyword) rows and columns in the filtered frame
df_filt.shape

(7041, 6)

In [None]:
# Write the filtered dataset to a ;-separated csv, without the index column
df_filt.to_csv(
    folderName + "cleanDataset.csv",
    index=False,
    sep=";",
)

In [None]:
# Draw 300 random rows to annotate manually (to use as Gold labels);
# random_state=42 makes the draw repeatable for a given df_filt
df_sample = df_filt.sample(n=300, random_state=42)
df_sample.head()

Unnamed: 0,id,searchQuery,text,timestamp,cleanText,foundWord
1741,1620047577906298880,m'agrada,no m'agrada de fer servir el mot 'correcte'. d...,2023-01-30 13:13:00+00:00,no m'agrada de fer servir el mot 'correcte'. d...,m'agrada
457,1620099264805486592,respondre,si la majoria de catalunya decideix que vol ro...,2023-01-30 16:38:00+00:00,si la majoria de catalunya decideix que vol ro...,respondre
177,1620116848825217024,retuit,et segueixo i no puc fer retuit a les teves pi...,2023-01-30 17:48:00+00:00,et segueixo i no puc fer retuit a les teves pi...,retuit
4113,1652264071687806977,tuits,s√≠.\nsolen ser tuits patrocinats. \nde pagament.,2023-04-29T10:50:00.000Z,s√≠. solen ser tuits patrocinats. de pagament.,tuits
3612,1652412536573108225,amic,gr√†cies amic!! ‚úäüèª,2023-04-29T20:40:00.000Z,gr√†cies amic!! ‚úäüèª,amic


In [None]:
# Save to csv
# annotationSampleFile is configured as a download URL, which cannot be written to.
# When the name is a URL, save under its bare file name; otherwise use it unchanged.
sample_out = annotationSampleFile.rsplit('/', 1)[-1] if '://' in annotationSampleFile else annotationSampleFile
df_sample.to_csv(folderName + sample_out, sep=";", index=False)

In [None]:
# Read the manually annotated sample, keeping only the merge keys and the label
# (the remaining columns are left out to avoid encoding issues)
df_annot = pd.read_csv(folderName + tempAnnotatedSample, sep=";").loc[:, ['id', 'foundWord', 'socialMediaSense']]

# Attach the labels to the sampled rows through (id, foundWord), then store
# socialMediaSense as nullable Int64 instead of float64
df_annot_sample = (
    df_sample
    .merge(df_annot, how='inner', on=['id', 'foundWord'])
    .astype({'socialMediaSense': 'Int64'})
)
df_annot_sample.head()

Unnamed: 0,id,searchQuery,text,timestamp,cleanText,foundWord,socialMediaSense
0,1620047577906298880,m'agrada,no m'agrada de fer servir el mot 'correcte'. d...,2023-01-30 13:13:00+00:00,no m'agrada de fer servir el mot 'correcte'. d...,m'agrada,0
1,1620099264805486592,respondre,si la majoria de catalunya decideix que vol ro...,2023-01-30 16:38:00+00:00,si la majoria de catalunya decideix que vol ro...,respondre,0
2,1620116848825217024,retuit,et segueixo i no puc fer retuit a les teves pi...,2023-01-30 17:48:00+00:00,et segueixo i no puc fer retuit a les teves pi...,retuit,1
3,1652264071687806977,tuits,s√≠.\nsolen ser tuits patrocinats. \nde pagament.,2023-04-29T10:50:00.000Z,s√≠. solen ser tuits patrocinats. de pagament.,tuits,1
4,1652412536573108225,amic,gr√†cies amic!! ‚úäüèª,2023-04-29T20:40:00.000Z,gr√†cies amic!! ‚úäüèª,amic,0


In [None]:
# Label distribution of the annotated sample
Counter(df_annot_sample['socialMediaSense'])

Counter({np.int64(0): 195, np.int64(1): 105})

In [None]:
# Share of each label (0, then 1), computed from the data so the figures cannot go
# stale when the sample or its annotations change
sense_share = df_annot_sample['socialMediaSense'].value_counts(normalize=True)
print(sense_share.get(0, 0.0), sense_share.get(1, 0.0))

0.65 0.35


In [None]:
# Save the final annotated sample (with valid encoding) to a csv to be processed in the WSD part
# definitiveAnnotatedSample is configured as a download URL, which cannot be written to.
# When the name is a URL, save under its bare file name; otherwise use it unchanged.
annotated_out = definitiveAnnotatedSample.rsplit('/', 1)[-1] if '://' in definitiveAnnotatedSample else definitiveAnnotatedSample
df_annot_sample.to_csv(folderName + annotated_out, sep=";", index=False)