# Implementación de un preprocesamiento de texto

## Importación de librerías
Importamos las librerías necesarias para realizar el preprocesamiento de texto.

In [70]:
import re

import pandas as pd
import spacy
from tqdm import tqdm
from unidecode import unidecode

# Register tqdm's pandas integration; `progress_apply` is called on a Series
# in the dataset preprocessing cell further down.
tqdm.pandas()

# Load the language model.
# If it is not installed, run the following command:
# python -m spacy download en_core_web_sm
# For Spanish: es_core_news_sm
nlp = spacy.load('en_core_web_sm')

## Normalización de texto
La normalización de texto es un paso importante en el preprocesamiento de texto. Consiste en realizar una serie de transformaciones para que el texto sea más fácil de procesar. Lo que hicimos en este caso fue:
1. **Tokenizar el texto**: separar el texto en palabras.
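2. **Limpiar el texto**: eliminar las URLs, menciones, hashtags, palabras vacías (*stop words*), signos de puntuación, espacios, dígitos y palabras de 2 caracteres o menos. En la práctica, de los hashtags solo se descarta el símbolo `#`: la palabra se conserva, como muestran los ejemplos de más abajo.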
3. **Lematizar el texto**: convertir las palabras a su forma base, en minúsculas y transliteradas a ASCII.

In [76]:
# Token-level patterns, compiled once instead of on every call to normalize().
_URL_PATTERN = re.compile(
    r"""(?:https?:\/\/)?(?:www\.)?(?:[a-zA-Z0-9-]+\.[a-zA-Z]{2,6})(?:[-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)""")
_MENTION_PATTERN = re.compile(r"@\S+")
_HASHTAG_PATTERN = re.compile(r"#\S+")
_RT_PATTERN = re.compile(r"RT")
_NON_LETTER_START_PATTERN = re.compile(r"[^a-zA-Z]")


def _is_noise(token_text: str) -> bool:
    """Return True when a token's text is a URL, mention, hashtag, the retweet
    marker, or starts with a character outside a-z / A-Z."""
    return bool(
        _URL_PATTERN.match(token_text)
        or _MENTION_PATTERN.match(token_text)
        or _HASHTAG_PATTERN.match(token_text)
        # The retweet marker has to be the whole token: a prefix match would
        # also throw away ordinary words that merely begin with "RT".
        or _RT_PATTERN.fullmatch(token_text)
        # Only the first character is inspected, so tokens that begin with a
        # digit, symbol or emoji are dropped while e.g. "gr8" is kept.
        or _NON_LETTER_START_PATTERN.match(token_text)
    )


def normalize(text) -> str:
    """Tokenize, clean and lemmatize a piece of text.

    The text is tokenized with the module-level spaCy pipeline ``nlp``. A token
    is discarded when it is a URL, a mention, a hashtag, the retweet marker
    "RT", starts with a non-letter character, is a stop word, punctuation,
    whitespace or a digit, or has 2 characters or fewer. The lemmas of the
    remaining tokens are stripped, lower-cased, passed through ``unidecode``
    and joined with single spaces.

    NOTE(review): the sample outputs in this notebook show hashtag words such
    as "opinion" surviving, presumably because the tokenizer emits "#" as a
    separate token -- confirm before relying on hashtag removal.

    Args:
        text: The raw text. Any value that is not a ``str`` (for example the
            NaN that pandas uses for empty cells) is treated as empty text.

    Returns:
        The normalized text, or an empty string when nothing survives the
        filters or the input is not a string.
    """
    # Missing values would otherwise crash inside the spaCy pipeline.
    if not isinstance(text, str):
        return ""

    # Tokenize the text
    doc = nlp(text)

    # Clean the text
    kept = [
        token for token in doc
        if not _is_noise(token.text)
        and not token.is_stop
        and not token.is_punct
        and not token.is_space
        and not token.is_digit
        and len(token.text) > 2
    ]

    # Lemmatize the text
    return " ".join(unidecode(token.lemma_.strip().lower()) for token in kept)

In [77]:
# Sample inputs: emoticons, a URL, a mention with a hashtag, a retweet and emoji.
sentences = [
    "I'm learning Python and I'm enjoying it. :) >.<",
    "I have a website at https://www.example.com with discounts.",
    "What do you think about the new product from @company? #opinions",
    "RT @user: Thanks for the retweet. Great article! 😃",
    "10 ways to improve your mental health. #health #wellness 🧘‍♂️",
]

# Print each sample followed by its normalized form.
for sentence in sentences:
    print("\nTexto original:", sentence)
    normalized = normalize(sentence)
    print("Texto normalizado:", normalized)


Texto original: I'm learning Python and I'm enjoying it. :) >.<
Texto normalizado: learn python enjoy

Texto original: I have a website at https://www.example.com with discounts.
Texto normalizado: website discount

Texto original: What do you think about the new product from @company? #opinions
Texto normalizado: think new product opinion

Texto original: RT @user: Thanks for the retweet. Great article! 😃
Texto normalizado: thank retweet great article

Texto original: 10 ways to improve your mental health. #health #wellness 🧘‍♂️
Texto normalizado: way improve mental health health wellness


## Preprocesamiento de un dataset

In [78]:
# Load the dataset and rename its columns
df = pd.read_csv('../data/cyberbullying.csv')
df.columns = ['text', 'label']

# Normalize every text (with a progress bar) into a new column, then keep
# only the rows whose preprocessed text is not empty
df = df.assign(text_preprocessed=df['text'].progress_apply(normalize))
df = df.loc[df['text_preprocessed'].ne('')]

# Save the preprocessed dataset
df.to_csv('../data/cyberbullying_preprocessed.csv', index=False)

# Display the first rows
print(df.head())

100%|██████████| 81417/81417 [08:20<00:00, 162.60it/s]


                                                text  label  \
0  In other words #katandandre, your food was cra...      0   
1  Why is #aussietv so white? #MKR #theblock #ImA...      0   
2  @XochitlSuckkks a classy whore? Or more red ve...      0   
3  @Jason_Gio meh. :P  thanks for the heads up, b...      0   
4  @RudhoeEnglish This is an ISIS account pretend...      0   

                                   text_preprocessed  
0             word katandandre food crapilicious mkr  
1  aussietv white mkr theblock imacelebrityau tod...  
2                    classy whore red velvet cupcake  
3        meh thank head concerned angry dude twitter  
4  isis account pretend kurdish account like isla...  


In [79]:
# Reload the preprocessed dataset written by the previous cell and preview
# its first rows (the bare last expression is what the notebook displays).
df = pd.read_csv('../data/cyberbullying_preprocessed.csv')
df.head()

Unnamed: 0,text,label,text_preprocessed
0,"In other words #katandandre, your food was cra...",0,word katandandre food crapilicious mkr
1,Why is #aussietv so white? #MKR #theblock #ImA...,0,aussietv white mkr theblock imacelebrityau tod...
2,@XochitlSuckkks a classy whore? Or more red ve...,0,classy whore red velvet cupcake
3,"@Jason_Gio meh. :P thanks for the heads up, b...",0,meh thank head concerned angry dude twitter
4,@RudhoeEnglish This is an ISIS account pretend...,0,isis account pretend kurdish account like isla...
