# Cargamos las librerías básicas.
Esto será común a todos los cuadernos de este proyecto.
- Se cargan las librerías básicas
    - os
    - subprocess
    - re
    - numpy as np
    - pandas as pd
    - csv
    - string # Para usar punctuation y eliminar los signos de puntuación
    - seaborn as sns
    - matplotlib as plt
    - ast # Para poder evaluar listas
    - spacy
    - tensorflow
    - displacy
    - train_test_split
- Se importan los modelos
    - SVC
    - RandomForestClassifier
- Se cargan las Stop Words
- Se carga el diccionario de NLP `en_core_web_lg`

In [19]:
# NOTE: `!source odio_env/bin/activate` was executed in a throw-away subshell,
# so it could never change the interpreter of the already-running kernel.
# Activate `odio_env` *before* launching Jupyter (or select its kernel).
# This cell only shows which environment the kernel is actually using.
import sys
sys.prefix

In [20]:
# Shared imports (repeated in every notebook of this series): %run executes
# import.py and leaves the names it defines available in this notebook.
%run "import.py"

<spacy.lang.en.English object at 0x1614f1510>


In [21]:
# Load the dataset in its current state (JSON written with orient='split')
dataset_path = 'data/03_Despues_del_EDA.json'
df = pd.read_json(dataset_path, orient='split')
df

Unnamed: 0,Comments_Length,Lematized_comments,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism,IsHate,Sentiment
0,122,"[people, step, case, people, situation, lump, ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,16,"[law, enforcement, train, shoot, apprehend, tr...",1,1,0,0,0,0,0,0,0,0,0,0,1,1
2,44,"[not, reckon, black, life, matter, banner, hol...",1,1,0,0,1,0,0,0,0,0,0,0,1,0
3,52,"[large, number, people, like, police, officer,...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,26,"[arab, dude, absolutely, right, shoot, 6, extr...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,4,"[remember, send, national, defence]",0,0,0,0,0,0,0,0,0,0,0,0,0,1
996,29,"[stat, don`t, represent, problem, race, bait, ...",1,0,0,0,0,1,1,0,0,0,0,0,1,0
997,9,"[quote, mother, ..., wow, hit, hard, accurate]",0,0,0,0,0,0,0,0,0,0,0,0,0,0
998,2,"[video, racist]",0,0,0,0,0,0,0,0,0,0,0,0,0,1


# Aplicamos el TF-IDF

In [18]:
# Keep the labelled frame under its own name before `df` is replaced by the
# TF-IDF matrix: the train/test split cell reads the label columns from
# `original_df`. The guard makes the cell safe to re-run, when `df` already
# holds the TF-IDF features and no longer has the text column.
if 'Lematized_comments' in df.columns:
    original_df = df

# Turn each list of lemmas into a single space-separated string. The source
# column is left untouched so that re-running this cell gives the same result.
comments_text = original_df['Lematized_comments'].apply(' '.join)

# Fit TF-IDF on every comment and expand the sparse matrix into a dense frame
# with one column per vocabulary term.
# NOTE(review): the vectorizer is fitted before the train/test split, so the
# vocabulary/IDF weights also see the test rows — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(comments_text)
df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=original_df.index,  # keep feature rows aligned with their labels
)
df

Unnamed: 0,00,000,03,05,08,09,10,100,10th,11,...,youse,youth,youtu,youtube,ypu,yr,yup,zimmerman,zionist,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Separación de datos en entrenamiento y test

In [10]:
# Label columns (y): the per-category flags to predict.
# `original_df` must be the labelled frame read from the JSON file, i.e. the
# data as it was before `df` was replaced by the TF-IDF representation.
label_columns = [
    'IsToxic', 'IsAbusive', 'IsThreat', 'IsProvocative', 'IsObscene',
    'IsHatespeech', 'IsRacist', 'IsNationalist', 'IsSexist', 'IsHomophobic',
    'IsReligiousHate', 'IsRadicalism', 'IsHate',
]
y = original_df[label_columns]

# Features (X): `df` now holds the TF-IDF representation of the comments
X = df

# Hold out 20% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [11]:
# Inspect the training features returned by train_test_split
X_train

Unnamed: 0,00,000,03,05,08,09,10,100,10th,11,...,youse,youth,youtu,youtube,ypu,yr,yup,zimmerman,zionist,zone
29,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
535,0.0,0.139435,0.0,0.0,0.0,0.0,0.121983,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
695,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
557,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
836,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
435,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Inspect the test features returned by train_test_split
X_test

Unnamed: 0,00,000,03,05,08,09,10,100,10th,11,...,youse,youth,youtu,youtube,ypu,yr,yup,zimmerman,zionist,zone
521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Inspect the training labels returned by train_test_split
y_train

Unnamed: 0,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism,IsHate
29,0,0,0,0,0,0,0,0,0,0,0,0,0
535,1,1,0,1,1,0,0,0,0,0,0,0,1
695,1,1,0,0,0,0,0,0,0,0,0,0,1
557,1,1,0,0,0,0,0,0,0,0,0,0,1
836,1,1,0,1,0,1,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0,0,0,0,0,0,0,0,0,0,0,0,0
270,1,1,0,0,0,0,0,0,0,0,0,0,1
860,1,0,0,0,0,1,1,0,0,0,0,0,1
435,1,1,0,1,0,0,0,0,0,0,0,0,1


In [14]:
# Inspect the test labels returned by train_test_split
y_test

Unnamed: 0,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism,IsHate
521,0,0,0,0,0,0,0,0,0,0,0,0,0
737,0,0,0,0,0,0,0,0,0,0,0,0,0
740,0,0,0,0,0,0,0,0,0,0,0,0,0
660,1,1,0,1,0,0,0,0,0,0,0,0,1
411,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,0,0,0,0,0,0,0,0,0,0,0,0,0
332,1,1,0,0,0,0,0,0,0,0,0,0,1
208,0,0,0,0,0,0,0,0,0,0,0,0,0
613,1,1,0,0,0,0,0,0,0,0,0,0,1


In [16]:
# Save the training and test sets as CSV files
import os

ruta = "train_test/"
# to_csv does not create missing folders, so make sure the target exists
os.makedirs(ruta, exist_ok=True)

X_train_name = ruta + "TF_IDF_X_train.csv"
X_test_name = ruta + "TF_IDF_X_test.csv"
y_train_name = ruta + "TF_IDF_y_train.csv"
y_test_name = ruta + "TF_IDF_y_test.csv"

# The DataFrame index is written as the first CSV column (to_csv default)
X_train.to_csv(X_train_name)
X_test.to_csv(X_test_name)
y_train.to_csv(y_train_name)
y_test.to_csv(y_test_name)