### **1. Importar las librerías empleadas**


In [3]:
import os       # To create the output folder before saving the processed data
import string   # Para la eliminación de signos de puntuación en el procesamiento

import nltk # Para importar el WordNetLemmatizer y la función word_tokenize
from datasets import load_dataset # Para cargar los datasets de Hugging Face
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

### **2. Cargar los datos**

In [4]:
# Load every Hugging Face dataset used in this notebook, one after another in a fixed order.
(
    paradetox,
    multilingual_toxicity,
    toxic_keywords,
    toxic_spans,
    paradetox_test_set,
) = [
    load_dataset(repo_id)
    for repo_id in (
        "textdetox/multilingual_paradetox",
        "textdetox/multilingual_toxicity_dataset",
        "textdetox/multilingual_toxic_lexicon",
        "textdetox/multilingual_toxic_spans",
        "textdetox/multilingual_paradetox_test",
    )
]

Generating en split: 100%|██████████| 400/400 [00:00<00:00, 39551.18 examples/s]
Generating ru split: 100%|██████████| 400/400 [00:00<00:00, 104759.39 examples/s]
Generating uk split: 100%|██████████| 400/400 [00:00<00:00, 83626.84 examples/s]
Generating de split: 100%|██████████| 400/400 [00:00<00:00, 140654.06 examples/s]
Generating es split: 100%|██████████| 400/400 [00:00<00:00, 122488.25 examples/s]
Generating am split: 100%|██████████| 400/400 [00:00<00:00, 177781.24 examples/s]
Generating zh split: 100%|██████████| 400/400 [00:00<00:00, 162664.49 examples/s]
Generating ar split: 100%|██████████| 400/400 [00:00<00:00, 114975.44 examples/s]
Generating hi split: 100%|██████████| 400/400 [00:00<00:00, 76273.94 examples/s]
Generating en split: 100%|██████████| 5000/5000 [00:00<00:00, 901884.49 examples/s]
Generating ru split: 100%|██████████| 5000/5000 [00:00<00:00, 573008.01 examples/s]
Generating uk split: 100%|██████████| 5000/5000 [00:00<00:00, 593169.85 examples/s]
Generating de

In [5]:
# Show what each dataset contains (its language splits and its columns).
for dataset_dict in (
    paradetox,
    multilingual_toxicity,
    toxic_keywords,
    toxic_spans,
    paradetox_test_set,
):
    print(dataset_dict)

DatasetDict({
    en: Dataset({
        features: ['toxic_sentence', 'neutral_sentence'],
        num_rows: 400
    })
    ru: Dataset({
        features: ['toxic_sentence', 'neutral_sentence'],
        num_rows: 400
    })
    uk: Dataset({
        features: ['toxic_sentence', 'neutral_sentence'],
        num_rows: 400
    })
    de: Dataset({
        features: ['toxic_sentence', 'neutral_sentence'],
        num_rows: 400
    })
    es: Dataset({
        features: ['toxic_sentence', 'neutral_sentence'],
        num_rows: 400
    })
    am: Dataset({
        features: ['toxic_sentence', 'neutral_sentence'],
        num_rows: 400
    })
    zh: Dataset({
        features: ['toxic_sentence', 'neutral_sentence'],
        num_rows: 400
    })
    ar: Dataset({
        features: ['toxic_sentence', 'neutral_sentence'],
        num_rows: 400
    })
    hi: Dataset({
        features: ['toxic_sentence', 'neutral_sentence'],
        num_rows: 400
    })
})
DatasetDict({
    en: Dataset({
      

### **3. Extraemos los datos solo en inglés ["en"]**

In [9]:
# Keep only the English ("en") split of every dataset.
(
    paradetox_en,
    multilingual_toxicity_en,
    toxic_keywords_en,
    toxic_spans_en,
    paradetox_test_set_en,
) = [
    dataset_dict["en"]
    for dataset_dict in (
        paradetox,
        multilingual_toxicity,
        toxic_keywords,
        toxic_spans,
        paradetox_test_set,
    )
]

Próximos pasos:
- Eliminar valores nulos
- Aplicar case folding
- Aplicar tokenization
- Eliminar stop words y signos de puntuación
- Aplicar lemmatization


### **4. Quitar valores nulos**

In [10]:
def _has_no_missing_values(example):
    """Return True when none of the example's field values is None."""
    return all(value is not None for value in example.values())


# Drop every example that has a None in any of its fields.
paradetox_en = paradetox_en.filter(_has_no_missing_values)
multilingual_toxicity_en = multilingual_toxicity_en.filter(_has_no_missing_values)
toxic_keywords_en = toxic_keywords_en.filter(_has_no_missing_values)
toxic_spans_en = toxic_spans_en.filter(_has_no_missing_values)
paradetox_test_set_en = paradetox_test_set_en.filter(_has_no_missing_values)

Filter: 100%|██████████| 400/400 [00:00<00:00, 19721.89 examples/s]
Filter: 100%|██████████| 5000/5000 [00:00<00:00, 263028.43 examples/s]
Filter: 100%|██████████| 3386/3386 [00:00<00:00, 355982.29 examples/s]
Filter: 100%|██████████| 991/991 [00:00<00:00, 158950.49 examples/s]
Filter: 100%|██████████| 600/600 [00:00<00:00, 145408.36 examples/s]


In [11]:
# Convert each English dataset to a pandas DataFrame to make the processing easier.
(
    english_paradetox_df,
    english_multilingual_toxicity_df,
    english_toxic_keywords_df,
    english_toxic_spans_df,
    english_paradetox_test_set_df,
) = [
    english_dataset.to_pandas()
    for english_dataset in (
        paradetox_en,
        multilingual_toxicity_en,
        toxic_keywords_en,
        toxic_spans_en,
        paradetox_test_set_en,
    )
]

In [12]:
# Print the column names, non-null counts and dtypes of every frame.
for frame in (
    english_paradetox_df,
    english_multilingual_toxicity_df,
    english_toxic_keywords_df,
    english_toxic_spans_df,
    english_paradetox_test_set_df,
):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   toxic_sentence    400 non-null    object
 1   neutral_sentence  400 non-null    object
dtypes: object(2)
memory usage: 6.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   toxic   5000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 78.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3386 entries, 0 to 3385
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3386 non-null   object
dtypes: object(1)
memory usage: 26.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Data columns (total 2 columns):
 #   Column 

### **5. Descargar recursos necesarios para tokenization, lemmatization y eliminación de stop words**

In [18]:

# Download the NLTK resources used by the following steps.
for nltk_resource in (
    'wordnet',                      # dictionary used for lemmatization
    'punkt',                        # model used for tokenization
    'averaged_perceptron_tagger',   # model used to identify the type of each word
    'stopwords',                    # stop-word lists (the English one is read below)
):
    nltk.download(nltk_resource)

# NLTK's WordNetLemmatizer is the lemmatizer used in this notebook.
lemmatizer = WordNetLemmatizer()

# English stop words as provided by NLTK's stopwords corpus.
stop_words_english = nltk.corpus.stopwords.words('english')

# Punctuation characters taken from the standard-library string module.
punctuation = set(string.punctuation)

[nltk_data] Downloading package wordnet to /home/elenaa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/elenaa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/elenaa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/elenaa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### **6. Aplicar case folding en los datos**

In [19]:
# Case folding: every text column gets a lower-cased companion column named
# "<column>_lower"; the original columns are left untouched.

# 1. English Paradetox: both sentence columns.
cols = ['toxic_sentence', 'neutral_sentence']
for col in cols:
    english_paradetox_df[col + '_lower'] = english_paradetox_df[col].str.lower()

# 2. English Multilingual Toxicity: a single text column, so it is lowered directly
# instead of inside a loop over unrelated column names.
english_multilingual_toxicity_df['text_lower'] = english_multilingual_toxicity_df['text'].str.lower()

# 3. English Toxic Keywords
english_toxic_keywords_df['text_lower'] = english_toxic_keywords_df['text'].str.lower()

# 4. English Toxic Spans: the sentence and its annotated negative connotations.
cols = ['Sentence', 'Negative Connotations']
for col in cols:
    english_toxic_spans_df[col + '_lower'] = english_toxic_spans_df[col].str.lower()

# 5. English Paradetox Test Set
english_paradetox_test_set_df['text_lower'] = english_paradetox_test_set_df['text'].str.lower()

### **7. Aplicar tokenization a las columnas de texto**

In [21]:
# Tokenizer data required by word_tokenize.
nltk.download('punkt_tab')

# For each frame, the lower-cased columns to tokenize. Every "<name>_lower"
# column produces a "<name>_tokens" column holding the word_tokenize output.
lowered_columns_by_frame = (
    (english_paradetox_df, ['toxic_sentence_lower', 'neutral_sentence_lower']),
    (english_multilingual_toxicity_df, ['text_lower']),
    (english_toxic_keywords_df, ['text_lower']),
    (english_toxic_spans_df, ['Sentence_lower', 'Negative Connotations_lower']),
    (english_paradetox_test_set_df, ['text_lower']),
)
for frame, cols in lowered_columns_by_frame:
    for col in cols:
        frame[col.replace('_lower','_tokens')] = frame[col].apply(word_tokenize)


[nltk_data] Downloading package punkt_tab to /home/elenaa/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


### **8. Seleccionar solo las columnas tokenizadas**

In [22]:
# Keep only the tokenized columns (plus the `toxic` label of the toxicity dataset).
# .copy() makes each result an independent frame, so the column assignments done
# in later cells do not trigger pandas' SettingWithCopyWarning.
english_paradetox_df = english_paradetox_df[['toxic_sentence_tokens', 'neutral_sentence_tokens']].copy()
english_multilingual_toxicity_df = english_multilingual_toxicity_df[['text_tokens', 'toxic']].copy()
english_toxic_keywords_df = english_toxic_keywords_df[['text_tokens']].copy()
english_toxic_spans_df = english_toxic_spans_df[['Sentence_tokens', 'Negative Connotations_tokens']].copy()
english_paradetox_test_set_df = english_paradetox_test_set_df[['text_tokens']].copy()

# Show the reduced frames.
print(english_paradetox_df)
print(english_multilingual_toxicity_df)
print(english_toxic_keywords_df)
print(english_toxic_spans_df)
print(english_paradetox_test_set_df)

                                 toxic_sentence_tokens  \
0    [then, all, of, a, sudden, i, see, her, ,, she...   
1    [my, page, should, be, protected, first, so, t...   
2                 [you, made, a, mistake, you, ass, .]   
3    [you, know, more, than, these, idiots, ,, stay...   
4    [piss, me, off, ,, fuckin, jerk, ,, get, on, m...   
..                                                 ...   
395  [at, least, one, party, gives, a, crap, about,...   
396                 [parliament, is, a, lame, duck, .]   
397  [side, -, note, ,, why, the, fuck, does, every...   
398  [you, 're, not, being, helpful, ,, either, ,, ...   
399  [prove, my, comment, wrong, and, post, a, vide...   

                               neutral_sentence_tokens  
0    [all, of, a, sudden, i, see, her, ,, she, is, ...  
1    [my, page, should, be, protected, first, so, t...  
2                           [you, made, a, mistake, .]  
3    [you, know, more, than, these, people, ,, stay...  
4                 

### **9. Eliminar stop words y signos de puntuación**

In [23]:
# Remove stop words (taken from nltk.corpus.stopwords) and punctuation tokens
# (taken from string.punctuation) from every token column.

# Stop words are looked up in a set so each membership test does not scan the whole list.
_stop_word_lookup = set(stop_words_english)


def remove_stop_words_and_punctuation(tokens):
    """Return the tokens that are neither English stop words nor punctuation.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with the remaining tokens, in their original order.

    Stop words are matched on the lower-cased token. A token counts as
    punctuation only when it is itself a member of ``punctuation``.
    """
    return [
        token
        for token in tokens
        if token.lower() not in _stop_word_lookup and token not in punctuation
    ]


# Token columns to clean, grouped by the frame that owns them.
token_columns_by_frame = (
    (english_paradetox_df, ['toxic_sentence_tokens', 'neutral_sentence_tokens']),
    (english_multilingual_toxicity_df, ['text_tokens']),
    (english_toxic_keywords_df, ['text_tokens']),
    (english_toxic_spans_df, ['Sentence_tokens', 'Negative Connotations_tokens']),
    (english_paradetox_test_set_df, ['text_tokens']),
)
for frame, token_columns in token_columns_by_frame:
    for token_column in token_columns:
        frame[token_column] = frame[token_column].apply(remove_stop_words_and_punctuation)

print(english_paradetox_df)

                                 toxic_sentence_tokens  \
0    [sudden, see, shes, got, big, phony, tits, eve...   
1    [page, protected, first, worthless, scum, like...   
2                                 [made, mistake, ass]   
3         [know, idiots, stay, course, truth, prevail]   
4                    [piss, fuckin, jerk, get, nerves]   
..                                                 ...   
395    [least, one, party, gives, crap, humans, birth]   
396                           [parliament, lame, duck]   
397  [side, note, fuck, everything, us, make, calle...   
398                  ['re, helpful, either, tiny-dick]   
399  [prove, comment, wrong, post, video, fucking, ...   

                               neutral_sentence_tokens  
0                                 [sudden, see, grown]  
1    [page, protected, first, unpleasant, people, l...  
2                                      [made, mistake]  
3         [know, people, stay, course, truth, prevail]  
4                 

### **10. Lemmatizar cada uno de los tokens**

In [24]:
# Lemmatize a list of tokens with the NLTK WordNetLemmatizer instance created earlier.
def lemmatize_tokens(tokens):
    """Return a list with the lemma of every token, in the same order.

    Each token is passed on its own to ``lemmatizer.lemmatize``; no
    part-of-speech argument is supplied.
    NOTE(review): the POS tagger downloaded earlier is not used here, so the
    lemmatizer's default part of speech applies - confirm this is intended.
    """
    return list(map(lemmatizer.lemmatize, tokens))

# Replace every token column with its lemmatized version, frame by frame.
columns_to_lemmatize = (
    # Paradetox
    (english_paradetox_df, ['toxic_sentence_tokens', 'neutral_sentence_tokens']),
    # Multilingual Toxicity
    (english_multilingual_toxicity_df, ['text_tokens']),
    # Toxic Keywords
    (english_toxic_keywords_df, ['text_tokens']),
    # Toxic Spans
    (english_toxic_spans_df, ['Sentence_tokens', 'Negative Connotations_tokens']),
    # Paradetox Test Set
    (english_paradetox_test_set_df, ['text_tokens']),
)
for frame, token_columns in columns_to_lemmatize:
    for token_column in token_columns:
        frame[token_column] = frame[token_column].apply(lemmatize_tokens)

### **11. Guardar los datos procesados**

In [25]:
# Save the processed frames in the datos/ folder. The folder is created first
# because to_csv does not create missing parent directories and would fail
# when the notebook runs somewhere datos/ does not exist yet.
# NOTE(review): the token columns hold Python lists, which CSV stores as text;
# downstream readers presumably need to parse them back - confirm.
os.makedirs("datos", exist_ok=True)
english_paradetox_df.to_csv("datos/english_paradetox_preprocessed.csv", index=False)
english_multilingual_toxicity_df.to_csv("datos/english_multilingual_toxicity_preprocessed.csv", index=False)
english_toxic_keywords_df.to_csv("datos/english_toxic_keywords_preprocessed.csv", index=False)
english_toxic_spans_df.to_csv("datos/english_toxic_spans_preprocessed.csv", index=False)
english_paradetox_test_set_df.to_csv("datos/english_paradetox_test_set_preprocessed.csv", index=False)