# Cargamos las librerías básicas.
Esto será común a todos los cuadernos de este proyecto.
- Se cargan las librerías básicas
    - os
    - subprocess
    - re
    - numpy as np
    - pandas as pd
    - csv
    - string # Para usar punctuation y eliminar los signos de puntuación
    - seaborn as sns
    - matplotlib as plt
    - ast # Para poder evaluar listas
    - spacy
    - tensorflow
    - displacy
    - train_test_split
- Se importan los modelos
    - SVC
    - RandomForestClassifier
- Se cargan las Stop Words
- Se carga el diccionario de NLP en_core_web_lg

In [1]:
# Imports (this cell is repeated in every notebook of this series)
# NOTE(review): `!` hands the command to a separate shell process, so this
# `source` presumably does not change the environment of the running kernel --
# confirm that the kernel itself is started from odio_env.
!source odio_env/bin/activate
# Run import.py inside the notebook namespace; names used by later cells but
# not defined in this notebook (e.g. pd, pln) presumably come from it.
%run "import.py"

2023-11-10 23:49:02.184464: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<spacy.lang.en.English object at 0x156787f10>


In [2]:
# Read the dataset in its current state
df = pd.read_csv(filepath_or_buffer="data/01_clean_text.csv")
df

Unnamed: 0,Text sin stop words,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism
0,"people step case , people situation . lump ...",0,0,0,0,0,0,0,0,0,0,0,0
1,Law enforcement trained shoot apprehend . tr...,1,1,0,0,0,0,0,0,0,0,0,0
2,\n nt reckon ' black lives matter ' banners he...,1,1,0,0,1,0,0,0,0,0,0,0
3,large number people like police officers . cal...,0,0,0,0,0,0,0,0,0,0,0,0
4,"Arab dude absolutely right , shot 6 extra time...",0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,remember sent national defence,0,0,0,0,0,0,0,0,0,0,0,0
996,Stats don`t represent problem . Race baiting a...,1,0,0,0,0,1,1,0,0,0,0,0
997,quote mother ... Wow hit hard . accurate .,0,0,0,0,0,0,0,0,0,0,0,0
998,video racist,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Tokenize one comment, discarding empty and quote-only tokens
def tokenize_text(comment):
    """Split *comment* into a list of whitespace-stripped token strings.

    The text is processed with ``pln`` (not defined in this notebook;
    presumably the spaCy pipeline created by ``import.py``). Each token's
    text is stripped, and tokens that end up empty, or that consist only of
    a non-breaking space or a double quote, are dropped.

    Args:
        comment: Text to tokenize. Anything that is not a ``str`` (for
            example ``None`` or the NaN pandas uses for a missing value)
            is treated as a comment without tokens.

    Returns:
        list: The kept token texts, in document order; ``[]`` for
        non-string input.
    """
    # A missing value is not text: return no tokens instead of handing a
    # non-string object to the pipeline.
    if not isinstance(comment, str):
        return []

    discarded = {'', '\xa0', '"'}
    # Strip each token once instead of once per comparison.
    stripped_texts = (token.text.strip() for token in pln(comment))
    return [text for text in stripped_texts if text not in discarded]

# Replace the stop-word-free text column with its tokenized version, placed
# as the first column of the DataFrame
tokenized = (
    df.pop("Text sin stop words")
    .apply(tokenize_text)
    .rename("Tokenized_Comments")
)
df.insert(0, "Tokenized_Comments", tokenized)
df

Unnamed: 0,Tokenized_Comments,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism
0,"[people, step, case, ,, people, situation, ., ...",0,0,0,0,0,0,0,0,0,0,0,0
1,"[Law, enforcement, trained, shoot, apprehend, ...",1,1,0,0,0,0,0,0,0,0,0,0
2,"[nt, reckon, ', black, lives, matter, ', banne...",1,1,0,0,1,0,0,0,0,0,0,0
3,"[large, number, people, like, police, officers...",0,0,0,0,0,0,0,0,0,0,0,0
4,"[Arab, dude, absolutely, right, ,, shot, 6, ex...",0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"[remember, sent, national, defence]",0,0,0,0,0,0,0,0,0,0,0,0
996,"[Stats, don`t, represent, problem, ., Race, ba...",1,0,0,0,0,1,1,0,0,0,0,0
997,"[quote, mother, ..., Wow, hit, hard, ., accura...",0,0,0,0,0,0,0,0,0,0,0,0
998,"[video, racist]",0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Add the summary column IsHate, which reflects whether the comment is
# flagged as hateful for any of the possible reasons
columnas_booleanas = [
    'IsToxic', 'IsAbusive', 'IsThreat', 'IsProvocative',
    'IsObscene', 'IsHatespeech', 'IsRacist', 'IsNationalist',
    'IsSexist', 'IsHomophobic', 'IsReligiousHate', 'IsRadicalism',
]

# Row-wise any() tells whether at least one label column is set; the boolean
# result is then stored as an integer
df['IsHate'] = (
    df[columnas_booleanas]
    .any(axis='columns')
    .astype(int)
)
df

Unnamed: 0,Tokenized_Comments,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism,IsHate
0,"[people, step, case, ,, people, situation, ., ...",0,0,0,0,0,0,0,0,0,0,0,0,0
1,"[Law, enforcement, trained, shoot, apprehend, ...",1,1,0,0,0,0,0,0,0,0,0,0,1
2,"[nt, reckon, ', black, lives, matter, ', banne...",1,1,0,0,1,0,0,0,0,0,0,0,1
3,"[large, number, people, like, police, officers...",0,0,0,0,0,0,0,0,0,0,0,0,0
4,"[Arab, dude, absolutely, right, ,, shot, 6, ex...",0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"[remember, sent, national, defence]",0,0,0,0,0,0,0,0,0,0,0,0,0
996,"[Stats, don`t, represent, problem, ., Race, ba...",1,0,0,0,0,1,1,0,0,0,0,0,1
997,"[quote, mother, ..., Wow, hit, hard, ., accura...",0,0,0,0,0,0,0,0,0,0,0,0,0
998,"[video, racist]",0,0,0,0,0,0,0,0,0,0,0,0,0


#### Estado actual
En este punto se ha hecho lo siguiente:
- Se han tokenizado los comentarios.
- Se han eliminado las stop_words.
- Se han eliminado columnas duplicadas o con contenido irrelevante.
- Se han convertido los booleanos a numéricos.
- Se ha añadido una columna booleana al final que resume las demás booleanas: vale 0 si todas son 0 y 1 si alguna es 1.

In [8]:
# Lemmatize the comments
# Helper that maps a list of tokens to the list of their lemmas
def lemmatize_tokens(token_list):
    """Return the lemmas for the tokens in *token_list*.

    The tokens are joined with single spaces and the resulting string is
    processed again with ``pln``; the returned list holds the ``lemma_`` of
    every token of that processed text, in order.
    """
    joined_text = ' '.join(token_list)
    return [tok.lemma_ for tok in pln(joined_text)]

# Lemmatize the 'Tokenized_Comments' column and store the result as the
# second column of the DataFrame, named 'Lematized_comments'
lema = (
    df['Tokenized_Comments']
    .apply(lemmatize_tokens)
    .rename('Lematized_Comments')
)
df.insert(1, "Lematized_comments", lema)
df

Unnamed: 0,Tokenized_Comments,Lematized_comments,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism,IsHate
0,"[people, step, case, ,, people, situation, ., ...","[people, step, case, ,, people, situation, ., ...",0,0,0,0,0,0,0,0,0,0,0,0,0
1,"[Law, enforcement, trained, shoot, apprehend, ...","[law, enforcement, train, shoot, apprehend, .,...",1,1,0,0,0,0,0,0,0,0,0,0,1
2,"[nt, reckon, ', black, lives, matter, ', banne...","[not, reckon, ', black, life, matter, ', banne...",1,1,0,0,1,0,0,0,0,0,0,0,1
3,"[large, number, people, like, police, officers...","[large, number, people, like, police, officer,...",0,0,0,0,0,0,0,0,0,0,0,0,0
4,"[Arab, dude, absolutely, right, ,, shot, 6, ex...","[arab, dude, absolutely, right, ,, shoot, 6, e...",0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"[remember, sent, national, defence]","[remember, send, national, defence]",0,0,0,0,0,0,0,0,0,0,0,0,0
996,"[Stats, don`t, represent, problem, ., Race, ba...","[stat, don`t, represent, problem, ., race, bai...",1,0,0,0,0,1,1,0,0,0,0,0,1
997,"[quote, mother, ..., Wow, hit, hard, ., accura...","[quote, mother, ..., wow, hit, hard, ., accura...",0,0,0,0,0,0,0,0,0,0,0,0,0
998,"[video, racist]","[video, racist]",0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Save the DataFrame as JSON using the 'split' orientation
df.to_json(path_or_buf='data/02_Tokenizados_y_lematizados.json', orient='split')