### Loading libraries

In [None]:
import json
from importlib import reload

import emoji
import numpy as np
import pandas as pd

import src.text_processing
reload(src.text_processing)

from src.text_processing import TextCleaner, EmojiConversion, SentimentAnalysis

### Loading data

In [None]:
# Read the processed comments CSV (UTF-8); later cells work on this frame.
cleaned_csv_path = '../data/processed/data_cleaned.csv'
df = pd.read_csv(cleaned_csv_path, encoding='utf-8')

### Clean data

In [157]:
# Wrap the comment column in the project's TextCleaner; the cleaning steps
# themselves are run in the next cell.
comment_column = df['comment']
cleaner = TextCleaner(comment_column)

In [158]:
# Cleaning stages, listed in the exact order they are applied.
cleaning_steps = (
    cleaner.to_lowercase,
    cleaner.remove_unwanted_chars,
    cleaner.remove_url,
    cleaner.remove_punctuation,
    cleaner.remove_stop_words,
    cleaner.to_strip,
)

# Run every stage once; each one logs its own start and success message.
for apply_step in cleaning_steps:
    apply_step()

2025-07-19 03:06:57,489 - src.text_cleaning - INFO - Converting text to lowercase
2025-07-19 03:06:57,564 - src.text_cleaning - INFO - Text converted to lowercase successfully
2025-07-19 03:06:57,565 - src.text_cleaning - INFO - Removing unwanted characters
2025-07-19 03:06:58,307 - src.text_cleaning - INFO - Unwanted characters removed successfully
2025-07-19 03:06:58,310 - src.text_cleaning - INFO - Removing URLs from text
2025-07-19 03:06:58,446 - src.text_cleaning - INFO - URLs removed successfully
2025-07-19 03:06:58,448 - src.text_cleaning - INFO - Removing punctuation from text
2025-07-19 03:06:58,686 - src.text_cleaning - INFO - Punctuation removed successfully
2025-07-19 03:06:58,688 - src.text_cleaning - INFO - Removing stop words from text (language: spanish)
2025-07-19 03:06:59,321 - src.text_cleaning - INFO - Stop words removed successfully
2025-07-19 03:06:59,323 - src.text_cleaning - INFO - Stripping whitespace from text
2025-07-19 03:06:59,340 - src.text_cleaning - INFO

In [159]:
# Replace the comment column with the cleaner's output.
df = df.assign(comment=cleaner.get_cleaned_text())

2025-07-19 03:06:59,359 - src.text_cleaning - INFO - Returning cleaned text


### Emojis Interpretation

In [160]:
# Skin-tone modifier code points (U+1F3FB - U+1F3FF). When they are reported
# on their own they are not counted as emojis.
SKIN_TONE_MODIFIERS = (
    '\U0001F3FB',
    '\U0001F3FC',
    '\U0001F3FD',
    '\U0001F3FE',
    '\U0001F3FF',
)


def _extract_emojis(text):
    """Return the emojis found in ``text``, in order, skipping bare skin-tone modifiers."""
    found = []
    for match in emoji.emoji_list(text):
        symbol = match['emoji']
        if symbol not in SKIN_TONE_MODIFIERS:
            found.append(symbol)
    return found


# One list of emojis per comment.
list_emojis = df['comment'].apply(_extract_emojis)

In [161]:
# Flatten the per-comment emoji lists into a single 1-D array and report
# how many emoji occurrences there are overall.
total_emojis = np.hstack(list_emojis)
emoji_count = len(total_emojis)
print(f'Total emojis: {emoji_count}')

Total emojis: 7881


In [162]:
# Collapse the occurrences into the set of distinct emojis and report its size.
unique_emojis = set(total_emojis)
unique_count = len(unique_emojis)
print(f'Unique emojis: {unique_count}')

Unique emojis: 374


In [163]:
# Map every distinct emoji to the English name recorded in emoji.EMOJI_DATA.
emojis_dict = {}
for symbol in unique_emojis:
    emojis_dict[symbol] = emoji.EMOJI_DATA[symbol]['en']

In [164]:
# Write the emoji -> English-name mapping to disk as readable UTF-8 JSON.
# sort_keys=True: emojis_dict is filled by iterating a set, and the iteration
# order of a set of strings is not stable between interpreter sessions (string
# hashing is randomized by default). Sorting keeps the file identical across
# re-runs.
with open('../data/emojis.json', 'w', encoding='utf-8') as f:
    json.dump(emojis_dict, f, ensure_ascii=False, indent=4, sort_keys=True)

> The interpretation of each emoji in this context (the death of an ex-president) was provided, in Spanish, by GitHub Copilot. It is loaded below from `../data/emojis_interpretation.json`.

### Emojis Conversion

In [165]:
# Rebind emojis_dict: from here on it holds the content of
# emojis_interpretation.json instead of the EMOJI_DATA names built earlier.
interpretation_path = '../data/emojis_interpretation.json'
with open(interpretation_path, 'r', encoding='utf-8') as f:
    emojis_dict = json.loads(f.read())

In [None]:
# Run the project's EmojiConversion over the comments using the loaded
# mapping, then store the converted text back in the comment column.
emojconv = EmojiConversion(emojis_dict, df['comment'])
emojconv.replace_emoji()
converted_comments = emojconv.get_text()
df = df.assign(comment=converted_comments)

2025-07-19 03:07:06,265 - src.text_cleaning - INFO - Replacing emojis in text
2025-07-19 03:07:11,800 - src.text_cleaning - INFO - Emojis replaced successfully


In [194]:
# Keep only comments that still contain at least one whitespace-separated
# token, and renumber the rows.
# `> 0` rather than `!= 0`: a missing comment has a token count of NaN, and
# NaN != 0 evaluates to True, so the old test kept such rows. NaN > 0 is False,
# so they are now dropped with the empty strings.
token_counts = df['comment'].str.split().str.len()
df = df[token_counts > 0].reset_index(drop=True)

### Sentiment Analysis

In [212]:
# Load the lexicon indexed by its 'Spanish Word' column and discard the four
# columns listed below; the remaining columns are what SentimentAnalysis gets.
lexicon = (
    pd.read_csv('../data/espaniol_NRC.csv', index_col='Spanish Word')
    .drop(columns=['anticipacion', 'positivo', 'confianza', 'negativo'])
)

In [None]:
# Score every comment against the lexicon and store the result in a new
# 'feelings' column.
# NOTE(review): the captured log for this cell shows "Getting feelings from
# text" twice (about 26 s each). Check whether get_feelings() already runs the
# computation, which would make the explicit process_feelings() call redundant.
sentiment_analyzer = SentimentAnalysis(lexicon, df['comment'])
sentiment_analyzer.process_feelings()
feelings = sentiment_analyzer.get_feelings()
df['feelings'] = feelings

2025-07-19 03:35:08,385 - src.text_processing - INFO - Getting feelings from text
2025-07-19 03:35:34,518 - src.text_processing - INFO - Feelings calculated successfully
2025-07-19 03:35:34,520 - src.text_processing - INFO - Returning feelings from text
2025-07-19 03:35:34,521 - src.text_processing - INFO - Getting feelings from text
2025-07-19 03:36:01,464 - src.text_processing - INFO - Feelings calculated successfully


### Export the dataset

In [219]:
# Persist the annotated dataset as UTF-8 CSV without the row index.
output_csv_path = '../data/processed/data_with_feelings.csv'
df.to_csv(output_csv_path, index=False, encoding='utf-8')

In [217]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,post,comment,datetime,context,feelings
0,Velorio del ex presidente Alan García en la Ca...,análisis análisis,19/04/2019 9:16,Lectura de Carta de AG,neutral
1,"Exequias del ex presidente Alan García en ""La ...",súplica súplica súplica súplica súplica súpli...,18/04/2019 20:57,Velatorio Casa del Pueblo,neutral
2,🚨 Estamos en los exteriores del Hospital de Em...,fiesta,17/04/2019 7:37,Traslado a la Clinica,alegria
3,Velorio del ex presidente Alan García en la Ca...,‍♀️‍♀️‍♀️ indiferencia indiferencia indiferencia,19/04/2019 9:16,Lectura de Carta de AG,ira
4,Velorio del ex presidente Alan García en la Ca...,shock shock shock shock,19/04/2019 9:16,Lectura de Carta de AG,sorpresa
