### Importing libraries

In [157]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import string
import nltk
from nltk.corpus import stopwords
import emoji
import re

# Fetch the NLTK stop-word corpus and keep the Spanish list as a set;
# remove_stopwords and clean_text below test word membership against it.
nltk.download('stopwords')
stop_words = set(stopwords.words('spanish'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading datasets

In [158]:
# df_1: comment-level export (post message, post id, comment text, timestamps).
# df_2: lookup from post id ('POST NRO') to an event label ('SUCESO').
# Both descriptions are taken from the previews shown in the cells below.
df_1 = pd.read_csv('data/db_AGP_2019_sucesos.csv')
df_2 = pd.read_csv('data/postID_suceso.csv')

### Exploring dataset

In [159]:
# Preview the first rows of the comment-level table.
df_1.head()

Unnamed: 0,created_time.1,created_time.2,message,id,object_link.connections.comments.message,FECHA Y HORA
0,"viernes, 19 de abril de 2019",30/12/1899 09:16:15,Velorio del ex presidente Alan García en la Ca...,71263708835_377868469727477,🧐🧐,19/04/2019 9:16
1,"jueves, 18 de abril de 2019",30/12/1899 20:57:16,"Exequias del ex presidente Alan García en ""La ...",71263708835_276236269920595,🥺🥺🥺🥺🥺🥺🥺🥺,18/04/2019 20:57
2,"miércoles, 17 de abril de 2019",30/12/1899 07:37:30,🚨 Estamos en los exteriores del Hospital de Em...,71263708835_310239669653341,🥳,17/04/2019 7:37
3,"viernes, 19 de abril de 2019",30/12/1899 09:16:15,Velorio del ex presidente Alan García en la Ca...,71263708835_377868469727477,🤷‍♀️🤷‍♀️🤷‍♀️,19/04/2019 9:16
4,"viernes, 19 de abril de 2019",30/12/1899 09:16:15,Velorio del ex presidente Alan García en la Ca...,71263708835_377868469727477,🤯🤯🤯🤯,19/04/2019 9:16


In [160]:
# Column dtypes and non-null counts for the comment-level table.
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55006 entries, 0 to 55005
Data columns (total 6 columns):
 #   Column                                    Non-Null Count  Dtype 
---  ------                                    --------------  ----- 
 0   created_time.1                            55006 non-null  object
 1   created_time.2                            55006 non-null  object
 2   message                                   55006 non-null  object
 3   id                                        55006 non-null  object
 4   object_link.connections.comments.message  55006 non-null  object
 5   FECHA Y HORA                              55006 non-null  object
dtypes: object(6)
memory usage: 2.5+ MB


In [161]:
# Preview the post-id -> event lookup table.
df_2.head()

Unnamed: 0,POST NRO,SUCESO
0,71263708835_310239669653341,Traslado a la Clinica
1,71263708835_10157689377373836,Confirmación de Intento de Suicidio
2,71263708835_2430897753596693,Confirmación de Intento de Suicidio
3,71263708835_10157689601593836,Confirmación de Intento de Suicidio
4,71263708835_426911824739777,Confirmación de Intento de Suicidio


In [162]:
# Column dtypes and non-null counts for the lookup table.
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   POST NRO  87 non-null     object
 1   SUCESO    87 non-null     object
dtypes: object(2)
memory usage: 1.5+ KB


In [163]:
# Number of distinct post ids among the comments; compare with the row count
# of df_2 reported by df_2.info() to see whether every post can get a label.
df_1['id'].nunique()

87

### Cleaning data

In [164]:
# Drop the two raw timestamp columns ('FECHA Y HORA' is kept), attach each
# comment's event label from df_2 via a left join on the post id, then remove
# the join keys, which are redundant once the label is attached.
#
# Written as one non-mutating chain: df_1 is left untouched, so re-running
# this cell does not fail with a KeyError on columns that were already dropped.
df = (
    df_1
    .drop(columns=['created_time.2', 'created_time.1'])
    .merge(df_2, how='left', left_on='id', right_on='POST NRO')
    .drop(columns=['id', 'POST NRO'])
)

In [165]:
# Rename by explicit mapping instead of assigning a positional list: a
# positional assignment silently mislabels columns if their order ever differs
# (for example when this cell is run a second time). errors='raise' makes a
# missing source column fail loudly instead.
df = df.rename(
    columns={
        'message': 'post',
        'object_link.connections.comments.message': 'comment',
        'FECHA Y HORA': 'datetime',
        'SUCESO': 'context',
    },
    errors='raise',
)

# Order the columns; .copy() gives an independent frame so the column
# assignments in later cells do not act on a slice of the previous one.
df = df[['datetime', 'context', 'post', 'comment']].copy()

In [166]:
# Parse 'datetime' from day/month/year hour:minute strings into datetime64;
# the explicit format avoids day/month ambiguity during parsing.
df['datetime'] = pd.to_datetime(df['datetime'],format='%d/%m/%Y %H:%M')

In [167]:
# Count missing values per column; a non-zero 'context' count would mean some
# post ids found no match in the left join above.
df.isnull().sum()

datetime    0
context     0
post        0
comment     0
dtype: int64

In [168]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

np.int64(1444)

In [169]:
# Remove exact duplicate rows in place, keeping the first occurrence
# (the pandas default).
df.drop_duplicates(inplace=True)

In [170]:
# Renumber the rows 0..n-1 after the drop above, discarding the old labels,
# so that values built positionally in later cells line up with the index.
df.reset_index(drop=True, inplace=True)

### Remove punctuation, stopwords and accents from text

In [171]:
# Strip ASCII punctuation from every comment, then lower-case it.
# string.punctuation contains regex metacharacters. Interpolated raw into a
# character class, its "\]" pair is read as an escaped bracket (so the
# backslash itself is never removed) and the bare "[" triggers a
# "possible nested set" FutureWarning. re.escape makes every character literal.
punct = string.punctuation
df['comment'] = df['comment'].str.replace(f'[{re.escape(punct)}]', '', regex=True).str.lower()

In [172]:
def remove_stopwords(text):
    """Return ``text`` with every stop word removed.

    The text is split on whitespace; a token is discarded when its lower-cased
    form is in the module-level ``stop_words`` set. Surviving tokens are
    re-joined with single spaces, so runs of whitespace are collapsed.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Drop Spanish stop words from every comment (done before accents are
# stripped, while the words still match the stop-word list's spelling).
df['comment'] = df['comment'].apply(remove_stopwords)

In [173]:
import unicodedata

def remove_accents(text):
    """Strip diacritics from ``text`` while leaving symbol characters intact.

    Each character is compatibility-decomposed (NFKD) and the resulting
    combining marks are dropped, so ``"canción"`` becomes ``"cancion"`` and
    ``"niño"`` becomes ``"nino"``.

    Characters whose Unicode general category is ``Po`` (other punctuation)
    or ``So`` (other symbol) are copied through untouched. Normalising the
    whole string would rewrite symbols such as "‼" (U+203C) and "⁉" (U+2049)
    into the ASCII sequences "!!" and "!?"; both are keys of
    ``emoji_to_spanish``, and this function runs before the emoji translation,
    so those entries could never match.

    Args:
        text: the string to clean.

    Returns:
        The string without combining marks.
    """
    kept = []
    for char in text:
        if unicodedata.category(char) in ('Po', 'So'):
            # Symbols/punctuation carry no accent; decomposing them only
            # destroys the original code point.
            kept.append(char)
            continue
        kept.extend(
            piece
            for piece in unicodedata.normalize('NFKD', char)
            if not unicodedata.combining(piece)
        )
    return ''.join(kept)

# Strip accents from every comment.
df['comment'] = df['comment'].apply(remove_accents)

### Create dictionary to replace the emojis

In [174]:
# Maps a single emoji code point to the Spanish word(s) that replace it in the
# text. Every value ends with one space, which acts as the separator when
# several translations are concatenated; without it adjacent emojis fuse into
# one token (e.g. "iraira"). Values are written without accents.
emoji_to_spanish = {
    "‼": "sorpresa ",
    "⁉": "duda ",
    "☝": "alegria ",
    "☠": "peligro ",
    "☹": "tristeza ",
    "☺": "feliz ",
    "♥": "amor ",
    "⚖": "justicia ",
    "⚠": "advertencia ",
    "⛪": "iglesia ",
    "✅": "verificado ",
    "✈": "avión ",
    "✊": "ira ",
    "✋": "mano ",
    "✌": "felicidad ",
    "✍": "escribir ",
    "✔": "correcto ",
    "✨": "brillo ",
    "❌": "incorrecto ",
    "❎": "cancelado ",
    "❗": "exclamacion ",
    "⭐": "estrella ",
    "🐀": "ira ",
    "🐁": "ira ",
    "🐑": "oveja ",
    "🐖": "ira ",
    "🐭": "ira ",
    "🐷": "ira ",
    "👊": "puño ",
    "👋": "saludo ",
    "👌": "ok ",
    "👍": "aprobar ",
    "👎": "desaprobar ",
    "👏": "felicidad ",
    "👼": "angel ",
    "👿": "diablo ",
    "💀": "calavera ",
    "💓": "corazon ",
    "💔": "corazon roto ",
    "💕": "corazones ",
    "💖": "corazon ",
    "💗": "corazon ",
    "💘": "corazon ",
    "💚": "corazon ",
    "💜": "corazon ",
    "💣": "bomba ",
    "💤": "dormir ",
    "💥": "explosion ",
    "💩": "ira ",
    "💪": "fuerza ",
    "💫": "mareo ",
    "💭": "pensamiento ",
    "💯": "perfecto ",
    "💰": "dinero ",
    "💲": "dolar ",
    "🔐": "cerrado ",
    "🔥": "fuego ",
    "🖕": "ira ",
    "😀": "sonrisa ",
    "😁": "carcajada ",
    "😂": "risa ",
    "😃": "alegria ",
    "😄": "risa ",
    "😅": "alivio ",
    "😆": "risa ",
    "😇": "angel ",
    "😈": "ira ",
    "😉": "guiño ",
    "😊": "contento ",
    "😋": "sabroso ",
    "😌": "tranquilo ",
    "😍": "enamorado ",
    "😎": "cool ",
    "😏": "satisfecho ",
    "😐": "neutral ",
    "😑": "indiferente ",
    "😒": "decepcionado ",
    "😓": "sudor ",
    "😔": "pensativo ",
    "😕": "confuso ",
    "😖": "preocupado ",
    "😗": "beso ",
    "😘": "beso ",
    "😙": "beso ",
    "😚": "beso ",
    "😛": "alegria ",
    "😜": "alegria ",
    "😝": "alegria ",
    "😞": "tristeza ",
    "😟": "angustiado ",
    "😠": "enojado ",
    "😡": "furioso ",
    "😢": "llorar ",
    "😣": "perseverante ",
    "😤": "determinacion ",
    "😥": "aliviado ",
    "😦": "preocupacion ",
    "😧": "shock ",
    "😨": "temeroso ",
    "😩": "agotado ",
    "😪": "somnoliento ",
    "😫": "agotamiento ",
    "😬": "incomodo ",
    "😭": "llorar ",
    "😮": "sorpresa ",
    "😯": "silencio ",
    "😰": "ansiedad ",
    "😱": "grito ",
    "😲": "impactado ",
    "😳": "avergonzado ",
    "😴": "dormido ",
    "😵": "mareado ",
    "😶": "sin palabras ",
    "😷": "enfermo ",
    "😸": "gato feliz ",
    "😹": "gato risa ",
    "😼": "gato malicioso ",
    "😿": "gato triste ",
    "🙁": "decepcionado ",
    "🙂": "feliz ",
    "🙃": "descontento ",
    "🙄": "descontento ",
    "🙅": "prohibido ",
    "🙆": "ok ",
    "🙇": "reverencia ",
    "🙈": "no ver ",
    "🙉": "no escuchar ",
    "🙊": "no hablar ",
    "🙋": "mano alzada ",
    "🙌": "victoria ",
    "🙏": "felicidad ",
    "🚫": "prohibido ",
    "🤐": "silencio ",
    "🤑": "dinero ",
    "🤒": "enfermo ",
    "🤓": "nerd ",
    "🤔": "pensativo ",
    "🤕": "herido ",
    "🤗": "abrazo ",
    "🤘": "rock ",
    "🤞": "suerte ",
    "🤟": "amor ",
    "🤡": "payaso ",
    "🤢": "asco ",
    "🤣": "carcajada ",
    "🤤": "baba ",
    "🤥": "mentira ",
    "🤦": "frustracion ",
    "🤧": "estornudo ",
    "🤨": "sospecha ",
    "🤩": "estrellas ",
    "🤪": "loco ",
    "🤫": "secreto ",
    "🤬": "molesto ",
    "🤭": "sorpresa ",
    "🤮": "vomito ",
    "🤯": "impacto ",
    "🤶": "navidad ",
    "🤷": "duda ",
    "🤹": "malabarista ",
    "🥀": "flor muerta ",
    "🥂": "brindis ",
    "🥃": "trago ",
    "🥳": "celebracion ",
    "🥴": "mareado ",
    "🥵": "calor ",
    "🥶": "frio ",
    "🥺": "tristeza ",
    "🦊": "ira ",
    "🦸": "heroe ",
    "🧐": "curioso ",
    "🧠": "cerebro ",
}

### Functions to handle text and emojis

In [175]:
def separate_text_from_emojis(text):
    """Split ``text`` into its emoji characters and its remaining characters.

    Membership in ``emoji.EMOJI_DATA`` is tested one character at a time.

    Returns:
        A 2-tuple ``(emojis, plain)``: the emoji characters joined in their
        original order, then the non-emoji characters joined in their original
        order. Note the emoji string comes first.

    NOTE(review): because the test is per character, a multi-code-point emoji
    is not handled as a unit; any joiner or selector characters that are not
    themselves keys of EMOJI_DATA end up in the plain-text part. Confirm
    whether that is intended.
    """
    emoji_chars = [ch for ch in text if ch in emoji.EMOJI_DATA]
    plain_chars = [ch for ch in text if ch not in emoji.EMOJI_DATA]
    return "".join(emoji_chars), "".join(plain_chars)

In [176]:
# Run the splitter over every comment; each result is an (emojis, text) pair.
new_comment = df['comment'].apply(separate_text_from_emojis)

# Unpack the pairs into two parallel lists, in row order.
emojis_in_comment = [pair[0] for pair in new_comment]
text_in_comment = [pair[1] for pair in new_comment]

In [177]:
def translate_emojis_to_spanish(text):
    """Replace every emoji character in ``text`` with its Spanish translation.

    Characters found in ``emoji.EMOJI_DATA`` are looked up in
    ``emoji_to_spanish``; emojis without an entry become ``"desconocido"``.
    All other characters are copied through unchanged.

    Each translation is emitted followed by exactly one space, regardless of
    how the dictionary value is padded. Relying on the dictionary for the
    separator made consecutive emojis fuse into one token whenever an entry
    lacked its trailing space.

    Args:
        text: the string to translate; non-string input yields ``""``.

    Returns:
        The translated string.
    """
    if not isinstance(text, str):
        return ""

    pieces = []
    for char in text:
        if char in emoji.EMOJI_DATA:
            word = emoji_to_spanish.get(char, "desconocido")
            # Normalise the separator: one trailing space, no leading padding.
            pieces.append(word.strip() + " ")
        else:
            pieces.append(char)

    return "".join(pieces)

In [178]:
# Translate each comment's emoji string into Spanish words, in row order.
emojis_text = [
    translate_emojis_to_spanish(emoji_run) for emoji_run in emojis_in_comment
]

In [179]:
# Re-attach the translated emoji words after the plain text of each comment,
# separated by a single space.
text = [
    plain + ' ' + translated
    for plain, translated in zip(text_in_comment, emojis_text)
]

In [180]:
# Assign the list directly so values are placed by position. Wrapping it in
# pd.Series would align on index labels instead, silently producing NaN for
# any row whose label is not in 0..n-1 (and remove_accents then fails on NaN).
df['new_comment'] = text
# The emoji translations may contain accented words; strip accents again.
df['new_comment'] = df['new_comment'].apply(remove_accents)

In [181]:
# Built once when the cell runs instead of on every call.
_EMOJI_RESIDUE_RE = re.compile(
    "["                      # Unicode ranges that include emojis
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and related symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes
    "\U0001F800-\U0001F8FF"  # supplemental
    "\U0001F900-\U0001F9FF"  # symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # miscellaneous objects
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A (not covered before)
    "\U00002700-\U000027BF"  # other symbols
    # Additional symbols. This range spans every code point from U+24C2 to
    # U+1F251, so it also removes any non-emoji character in that interval
    # (including the U+FE0F variation selector).
    "\U000024C2-\U0001F251"
    "\u200d"                 # zero-width joiner left behind by emoji sequences
    "]",
    flags=re.UNICODE,
)


def drop_emojis(texto):
    """Remove emoji and emoji-sequence residue from ``texto``.

    Every character matched by ``_EMOJI_RESIDUE_RE`` is deleted. Besides the
    original ranges, the pattern now also covers U+1FA70–U+1FAFF and the
    zero-width joiner (U+200D); the latter is invisible but otherwise survives
    as a spurious token once the emoji around it have been removed.

    Args:
        texto: the string to clean.

    Returns:
        ``texto`` without the matched characters.
    """
    return _EMOJI_RESIDUE_RE.sub('', texto)

In [182]:
# Remove whatever drop_emojis' pattern still matches after the translation step.
df["new_comment"] = df["new_comment"].apply(drop_emojis)

In [183]:
# Trim the ends, collapse every run of whitespace to one space, and blank out
# comments that consist only of dots and/or whitespace.
df["new_comment"] = df["new_comment"].str.strip()
df["new_comment"] = df["new_comment"].str.replace(r'\s+', ' ', regex=True)
df["new_comment"] = df["new_comment"].replace(r'^[\.\s]*$', '', regex=True)

In [184]:
# Text clean-up pass, applied as one chain in this exact order:
#   1. trim, collapse whitespace, blank dot-only comments
#   2. remove digit runs, a mojibake fragment and two flattened Facebook URLs
#   3. corpus-specific substitutions ("habran" -> "abran ", drop "ahora")
df["new_comment"] = (
    df["new_comment"]
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
    .str.replace(r'^[\.\s]*$', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
    .str.replace('â€', '', regex=True)
    .str.replace('httpswwwfacebookcom202767666958515posts442078129694133appfbl', '', regex=True)
    .str.replace('httpsmfacebookcomstoryphpstoryfbidid', '', regex=True)
    .str.replace("habran","abran ")
    .str.replace("ahora","")
)

In [185]:
def clean_text(text):
    """Filter the whitespace-separated tokens of ``text``.

    A token is discarded when any of the following holds:
      * its lower-cased form is in the module-level ``stop_words`` set;
      * it is two characters long or shorter;
      * its lower-cased form starts with ``'y'``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered in stop_words:
            continue
        if len(token) <= 2:
            continue
        if lowered.startswith('y'):
            continue
        kept.append(token)
    return ' '.join(kept)

In [186]:
# Second token filter, run on the cleaned text (stop words, short tokens,
# tokens starting with 'y').
df['new_comment'] = df['new_comment'].apply(clean_text)

In [187]:
# Remove, in place, the rows whose cleaned comment ended up empty.
df.drop(df.loc[df["new_comment"] == ""].index,inplace=True)

In [188]:
# Renumber the rows 0..n-1 after removing the empty comments.
df.reset_index(drop=True, inplace=True)

### Sentiment Analysis

In [189]:
# Load the lexicon indexed by its 'Spanish Word' column, then discard four
# columns; the columns that remain are the labels feelings_in_text can return.
# NOTE(review): 'new_comment' has had its accents stripped. If the lexicon
# index keeps accented spellings, those entries can never match — confirm.
lexicon = pd.read_csv('data/espaniol_NRC.csv', index_col='Spanish Word')
lexicon.drop(columns=['anticipacion','positivo','confianza','negativo'], inplace=True)

def feelings_in_text(text, lexicon_df=None):
    """Return the lexicon column with the highest total score for ``text``.

    The text is split on whitespace. For every token present in the lexicon
    index, that word's value in each column is added to the column's running
    total. The column with the largest total is returned; ties go to the
    column that appears first in the lexicon.

    If the same word occurs on several lexicon rows, the element-wise maximum
    of those rows is used, so a repeated entry counts no more than a unique
    one. A plain ``.loc[word]`` lookup returns a DataFrame in that situation,
    which broke the per-column accumulation.

    Args:
        text: whitespace-separated words.
        lexicon_df: optional DataFrame indexed by word, one column per label.
            Defaults to the module-level ``lexicon``.

    Returns:
        The winning column name, or ``'neutral'`` when no token scores above
        zero (or the lexicon has no columns).
    """
    table = lexicon if lexicon_df is None else lexicon_df
    totals = {col: 0 for col in table.columns}

    for word in text.split():
        if word not in table.index:
            continue
        # List-indexing always yields a DataFrame, whether the word appears on
        # one row or on several.
        rows = table.loc[[word]]
        for col, value in rows.max(axis=0).items():
            totals[col] += value

    if not totals:
        return 'neutral'

    # max() returns the first maximal item, matching the original stable
    # descending sort.
    feeling, score = max(totals.items(), key=lambda item: item[1])
    return 'neutral' if score == 0 else feeling

In [190]:
# Label every cleaned comment with its top-scoring lexicon column, or 'neutral'.
df['sentiments'] = df['new_comment'].apply(feelings_in_text)

In [191]:
# Drop the intermediate 'comment' column; 'new_comment' holds the cleaned text.
df.drop(columns=['comment',],inplace=True)

### Save data

In [192]:
# Write the cleaned, labelled frame to the working directory, without the index.
df.to_csv('data_clean.csv',index=False)