# Cargamos las librerías básicas.
Esto será común a todos los cuadernos de este proyecto.
- Se cargan las librerías básicas
    - os
    - subprocess
    - re
    - numpy as np
    - pandas as pd
    - csv
    - string # Para usar punctuation y eliminar los signos de puntuación
    - seaborn as sns
    - matplotlib as plt
    - ast # Para poder evaluar listas
    - spacy
    - tensorflow
    - displacy
    - train_test_split
- Se importan los modelos
    - SVC
    - RandomForestClassifier
- Se cargan las Stop Words
- Se carga el diccionario de NLP en_core_web_lg

In [5]:
# NOTE(review): IPython runs each `!` command in its own subshell, so this
# activation presumably does not change the environment of the running
# kernel -- confirm the kernel itself is started from odio_env.
!source odio_env/bin/activate

In [8]:
# Shared imports (run at the top of every notebook in this series).
# Later cells use pd, CountVectorizer and train_test_split without importing
# them, so they are presumably provided by import.py.
%run "import.py"

<spacy.lang.en.English object at 0x15feead10>


In [23]:
# Read the dataset in its current state (the JSON file saved in 'split'
# orientation) and display it.
df = pd.read_json(
    path_or_buf='data/03_Despues_del_EDA.json',
    orient='split',
)
df

Unnamed: 0,Comments_Length,Lematized_comments,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism,IsHate,Sentiment
0,122,"[people, step, case, people, situation, lump, ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,16,"[law, enforcement, train, shoot, apprehend, tr...",1,1,0,0,0,0,0,0,0,0,0,0,1,1
2,44,"[not, reckon, black, life, matter, banner, hol...",1,1,0,0,1,0,0,0,0,0,0,0,1,0
3,52,"[large, number, people, like, police, officer,...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,26,"[arab, dude, absolutely, right, shoot, 6, extr...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,4,"[remember, send, national, defence]",0,0,0,0,0,0,0,0,0,0,0,0,0,1
996,29,"[stat, don`t, represent, problem, race, bait, ...",1,0,0,0,0,1,1,0,0,0,0,0,1,0
997,9,"[quote, mother, ..., wow, hit, hard, accurate]",0,0,0,0,0,0,0,0,0,0,0,0,0,0
998,2,"[video, racist]",0,0,0,0,0,0,0,0,0,0,0,0,0,1


# A partir de aquí trabajaremos sólo con la columna de datos lematizados

# El siguiente paso es añadirle una Bag of Words

In [24]:
# Helper that turns a list of lemmas into a single text string.
def join_lemmas(lemmas):
    """Return the items of *lemmas* joined by single spaces."""
    separator = ' '
    return separator.join(lemmas)
# Build a new column holding each comment's lemmas as one concatenated text.
df['Lematized_Text'] = df['Lematized_comments'].apply(join_lemmas)
# Create a CountVectorizer with its default settings.
vectorizer = CountVectorizer()
# Fit on the concatenated lemmas and transform them into the BoW matrix.
X_bow = vectorizer.fit_transform(df['Lematized_Text'])
# Wrap the BoW counts in a DataFrame, one column per vocabulary term.
# Reusing df.index keeps the rows aligned with df: pd.concat(axis=1) matches
# rows by index label, so a default 0..n-1 index on bow_df would misalign
# (and introduce NaN rows) whenever df's index is not exactly 0..n-1.
bow_df = pd.DataFrame(
    X_bow.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df.index,
)
# Append the BoW columns to the original DataFrame and display the result.
df = pd.concat([df, bow_df], axis=1)
df

Unnamed: 0,Comments_Length,Lematized_comments,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,...,youse,youth,youtu,youtube,ypu,yr,yup,zimmerman,zionist,zone
0,122,"[people, step, case, people, situation, lump, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,16,"[law, enforcement, train, shoot, apprehend, tr...",1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,44,"[not, reckon, black, life, matter, banner, hol...",1,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,52,"[large, number, people, like, police, officer,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,26,"[arab, dude, absolutely, right, shoot, 6, extr...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,4,"[remember, send, national, defence]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,29,"[stat, don`t, represent, problem, race, bait, ...",1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
997,9,"[quote, mother, ..., wow, hit, hard, accurate]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,2,"[video, racist]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Eliminamos la columna de lematizados y guardamos el BagOfWords en CSV

In [29]:
# Drop the lemma columns that are no longer needed: the original list column
# ('Lematized_comments') and the joined text column ('Lematized_Text').
# errors='ignore' keeps this cell safe to re-run when the columns are already
# gone, without a bare `except` that would also hide unrelated errors.
df.drop(
    columns=['Lematized_comments', 'Lematized_Text'],
    errors='ignore',
    inplace=True,
)
df

Unnamed: 0,Comments_Length,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,...,youse,youth,youtu,youtube,ypu,yr,yup,zimmerman,zionist,zone
0,122,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,16,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,44,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,52,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,26,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,29,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
997,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Print the column index of df to check which columns remain.
print(df.columns)

Index(['Comments_Length', 'IsToxic', 'IsAbusive', 'IsThreat', 'IsProvocative',
       'IsObscene', 'IsHatespeech', 'IsRacist', 'IsNationalist', 'IsSexist',
       ...
       'youse', 'youth', 'youtu', 'youtube', 'ypu', 'yr', 'yup', 'zimmerman',
       'zionist', 'zone'],
      dtype='object', length=3583)


# Separación en conjuntos de entrenamiento y test
Dado el tipo de datos del que partimos no vamos a hacer escalado.

In [28]:
# Label columns to predict; every other column of df is used as a feature.
label_columns = [
    'IsToxic',
    'IsAbusive',
    'IsThreat',
    'IsProvocative',
    'IsObscene',
    'IsHatespeech',
    'IsRacist',
    'IsNationalist',
    'IsSexist',
    'IsHomophobic',
    'IsReligiousHate',
    'IsRadicalism',
    'IsHate',
]
# Features (X): df without the label columns.
X = df.drop(columns=label_columns)
# Labels (y): only the label columns.
y = df[label_columns]
# Split into training and test sets (20% held out, fixed random seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [32]:
# Display the training feature set.
X_train

Unnamed: 0,Comments_Length,Sentiment,00,000,03,05,08,09,10,100,...,youse,youth,youtu,youtube,ypu,yr,yup,zimmerman,zionist,zone
29,25,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
535,59,2,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
695,14,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
557,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
836,38,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,15,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
270,16,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,17,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
435,6,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Display the test feature set.
X_test

Unnamed: 0,Comments_Length,Sentiment,00,000,03,05,08,09,10,100,...,youse,youth,youtu,youtube,ypu,yr,yup,zimmerman,zionist,zone
521,48,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
737,8,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
740,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
660,24,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
411,13,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
332,18,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
208,19,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
613,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Display the training labels.
y_train

Unnamed: 0,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism,IsHate
29,0,0,0,0,0,0,0,0,0,0,0,0,0
535,1,1,0,1,1,0,0,0,0,0,0,0,1
695,1,1,0,0,0,0,0,0,0,0,0,0,1
557,1,1,0,0,0,0,0,0,0,0,0,0,1
836,1,1,0,1,0,1,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0,0,0,0,0,0,0,0,0,0,0,0,0
270,1,1,0,0,0,0,0,0,0,0,0,0,1
860,1,0,0,0,0,1,1,0,0,0,0,0,1
435,1,1,0,1,0,0,0,0,0,0,0,0,1


In [35]:
# Display the test labels.
y_test

Unnamed: 0,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism,IsHate
521,0,0,0,0,0,0,0,0,0,0,0,0,0
737,0,0,0,0,0,0,0,0,0,0,0,0,0
740,0,0,0,0,0,0,0,0,0,0,0,0,0
660,1,1,0,1,0,0,0,0,0,0,0,0,1
411,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,0,0,0,0,0,0,0,0,0,0,0,0,0
332,1,1,0,0,0,0,0,0,0,0,0,0,1
208,0,0,0,0,0,0,0,0,0,0,0,0,0
613,1,1,0,0,0,0,0,0,0,0,0,0,1


In [36]:
# Save the training and test sets as CSV files.
import os

ruta = "train_test/"
# Create the output directory if it is missing; to_csv does not create
# directories and would fail otherwise.
os.makedirs(ruta, exist_ok=True)

X_train_name = ruta + "BoW_X_train.csv"
X_test_name = ruta + "BoW_X_test.csv"
y_train_name = ruta + "BoW_y_train.csv"
y_test_name = ruta + "BoW_y_test.csv"

# Each CSV keeps the DataFrame index (the original row labels) as its first
# column, since to_csv is called with its default arguments.
X_train.to_csv(X_train_name)
X_test.to_csv(X_test_name)
y_train.to_csv(y_train_name)
y_test.to_csv(y_test_name)