# Cargamos las librerías básicas.
Esto será común a todos los cuadernos de este proyecto.
- Se cargan las librerías básicas
    - os
    - subprocess
    - re
    - numpy as np
    - pandas as pd
    - csv
    - string # Para usar punctuation y eliminar los signos de puntuación
    - seaborn as sns
    - matplotlib as plt
    - ast # Para poder evaluar listas
    - spacy
    - tensorflow
    - displacy
    - train_test_split
- Se importan los modelos
    - SVC
    - RandomForestClassifier
- Se cargan las Stop Words
- Se carga el diccionario de NLP en_core_web_lg

In [1]:
# Activate the project's virtual environment.
# NOTE(review): `!` hands the command to a separate shell process, so this
# presumably does not change the environment of the running kernel — confirm
# that the kernel itself was started from odio_env.
!source odio_env/bin/activate

In [2]:
# Shared imports (executed at the top of every notebook in this series).
# The script runs in this notebook's namespace, so the names it defines
# (e.g. pd, pln, train_test_split used below) become available here.
%run "import.py"

2023-11-11 12:24:29.064441: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<spacy.lang.en.English object at 0x14ec2aa10>


In [6]:
# Load the dataset in its current state (the JSON file written after the EDA),
# stored with the 'split' orientation, and display it.
df = pd.read_json(
    path_or_buf='data/03_Despues_del_EDA.json',
    orient='split',
)
df

Unnamed: 0,Comments_Length,Lematized_comments,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism,IsHate,Sentiment
0,122,"[people, step, case, people, situation, lump, ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,16,"[law, enforcement, train, shoot, apprehend, tr...",1,1,0,0,0,0,0,0,0,0,0,0,1,1
2,44,"[not, reckon, black, life, matter, banner, hol...",1,1,0,0,1,0,0,0,0,0,0,0,1,0
3,52,"[large, number, people, like, police, officer,...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,26,"[arab, dude, absolutely, right, shoot, 6, extr...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,4,"[remember, send, national, defence]",0,0,0,0,0,0,0,0,0,0,0,0,0,1
996,29,"[stat, don`t, represent, problem, race, bait, ...",1,0,0,0,0,1,1,0,0,0,0,0,1,0
997,9,"[quote, mother, ..., wow, hit, hard, accurate]",0,0,0,0,0,0,0,0,0,0,0,0,0,0
998,2,"[video, racist]",0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [7]:
# Compute one embedding vector per comment and store it in a new
# "Embeddings" column.

# Each row of 'Lematized_comments' is a list of lemmas; join it into a single
# space-separated string so the spaCy pipeline can process it as one text.
comment_texts = (" ".join(lemmas) for lemmas in df['Lematized_comments'])

# `pln.pipe` streams the texts through the pipeline in batches instead of
# invoking the pipeline once per row; `doc.vector` is the document-level
# vector of each processed comment. Order is preserved, so the i-th vector
# belongs to the i-th row of df.
embeddings = [doc.vector for doc in pln.pipe(comment_texts)]

# Insert the embeddings as the second column (position 1) of the DataFrame.
df.insert(1, 'Embeddings', embeddings)

# Drop the lemmatized text now that it has been vectorised.
# errors='ignore' keeps the original "do nothing if the column is absent"
# intent without a bare `except:` that would hide unrelated failures.
df.drop(columns='Lematized_comments', errors='ignore', inplace=True)
df

Unnamed: 0,Comments_Length,Embeddings,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism,IsHate,Sentiment
0,122,"[-0.08056078, 1.3073906, -2.4835172, 0.3724485...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,16,"[-2.1513762, 1.638841, -2.032583, 0.6298676, 0...",1,1,0,0,0,0,0,0,0,0,0,0,1,1
2,44,"[-0.8606621, 0.46393278, -2.3898466, 0.2655994...",1,1,0,0,1,0,0,0,0,0,0,0,1,0
3,52,"[-1.1302959, 0.4456414, -2.3458006, 1.073049, ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,26,"[-0.8364995, 0.821457, -3.0169377, 1.0197995, ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,4,"[-1.359565, -1.0031974, -0.6836875, 0.49616998...",0,0,0,0,0,0,0,0,0,0,0,0,0,1
996,29,"[-0.4008239, 1.793869, -2.035933, -0.13494176,...",1,0,0,0,0,1,1,0,0,0,0,0,1,0
997,9,"[1.40714, 0.27842566, -2.9605103, -0.19920565,...",0,0,0,0,0,0,0,0,0,0,0,0,0,0
998,2,"[-0.27835, 0.138705, -0.25938, -0.75906503, 1....",0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [8]:
# Expand the 'Embeddings' column into one numeric column per vector component.
# The new frame reuses df.index: pd.concat(axis=1) aligns rows on index labels,
# so a default 0..n-1 index would misalign rows (and introduce NaNs) whenever
# df's own index is not exactly 0..n-1.
embeddings_df = pd.DataFrame(df['Embeddings'].tolist(), index=df.index)
df = pd.concat([df.drop('Embeddings', axis=1), embeddings_df], axis=1)

# Target columns, listed once so that X and y cannot drift apart.
label_columns = [
    'IsToxic', 'IsAbusive', 'IsThreat', 'IsProvocative', 'IsObscene',
    'IsHatespeech', 'IsRacist', 'IsNationalist', 'IsSexist', 'IsHomophobic',
    'IsReligiousHate', 'IsRadicalism', 'IsHate',
]

# Features (X): every column that is not a label. This keeps
# 'Comments_Length', 'Sentiment' and the embedding components.
X = df.drop(label_columns, axis=1)

# Labels (y): one column per label.
y = df[label_columns]

# Split into training and test sets (20% test, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Display the training features.
X_train

Unnamed: 0,Comments_Length,Sentiment,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
29,25,2,-0.078169,1.014904,-1.393897,-0.035557,1.031280,0.286419,0.761003,2.624081,...,0.552006,-1.550069,1.295612,-2.230747,-0.568653,-0.294217,1.183427,0.255979,-1.704437,0.785924
535,59,2,-1.374714,0.194537,-3.045763,0.552885,1.236490,0.108428,2.120135,3.419171,...,1.093528,0.555372,1.726862,-1.003955,-0.813593,0.159125,1.310806,0.673054,-0.674219,1.079446
695,14,2,-1.231247,-0.358191,-2.014098,-1.324058,0.888553,3.423360,0.427054,0.771433,...,2.285002,-1.242384,0.483298,-0.552555,-0.553989,1.465698,0.537958,-0.112247,0.577168,1.551037
557,13,0,-0.663475,-0.513808,-1.191180,0.836018,0.561985,0.947455,1.590333,1.111683,...,0.531587,-0.056449,1.530689,-1.533887,-0.096738,-1.304638,0.498335,0.716216,-1.576727,0.535085
836,38,2,-1.234304,0.039585,-3.410076,1.031312,1.344128,1.063540,1.226776,3.887391,...,1.368547,0.083345,1.301350,-1.069314,-1.700184,-0.363361,0.583715,0.907543,-2.196530,0.982788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,15,1,-1.931822,0.457506,-2.209784,-0.244438,1.157932,0.804820,1.356814,2.474962,...,1.390019,-0.309239,0.886086,-0.058718,-2.855517,-0.294201,1.304509,0.493398,-0.185753,1.242638
270,16,2,-0.245048,-0.002218,-1.674300,0.022123,1.357384,-0.912624,1.750457,2.916986,...,0.994220,0.342204,1.349367,-1.366764,-1.120326,-0.991794,1.318531,1.086853,-2.598847,0.403829
860,17,2,-1.372070,0.466309,-2.397424,0.511340,2.103409,1.913207,3.249717,4.357158,...,2.559596,-0.678911,0.666304,-1.616987,-1.122050,-2.702558,0.278644,2.782267,-3.668104,2.256698
435,6,2,-2.318475,-0.749345,-0.090025,1.702400,2.247475,1.743813,2.576125,4.529092,...,2.016175,-2.501285,-3.172735,-2.625903,3.370148,3.249022,0.037800,-0.801010,-3.290950,2.508061


In [10]:
# Display the test features.
X_test

Unnamed: 0,Comments_Length,Sentiment,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
521,48,2,-0.518586,1.159976,-0.588125,0.529787,1.364762,0.622201,2.073949,2.482665,...,2.666008,-0.825330,0.951545,-0.501470,-0.944496,-1.219877,0.068035,0.764104,-2.319700,0.566765
737,8,1,1.962267,-0.177755,-3.032966,-1.013122,-1.038313,2.055150,0.909230,2.212050,...,3.526833,-0.216915,-0.509150,-4.079967,-0.219257,-2.846092,2.137930,3.520075,-3.790213,1.217525
740,15,0,0.272004,0.786909,-2.709509,-0.584413,-1.537042,0.276787,0.372159,0.982304,...,1.932599,0.118164,1.700352,-1.943591,-1.532926,-1.276428,1.223555,1.762796,-1.672380,0.716660
660,24,1,-2.203237,0.405568,-0.769692,1.025198,2.350684,-0.073694,1.449777,1.434135,...,-0.726979,-0.246090,1.255982,-1.287283,-0.868956,-0.029247,0.138565,-0.512032,-0.440078,-0.737930
411,13,1,-1.139922,0.338053,0.936536,1.659465,-1.449928,0.952671,0.082413,0.467258,...,-1.233001,-1.208476,2.217565,2.428885,-0.383229,-1.374520,0.986041,-0.989730,1.592303,2.370215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,14,0,0.417874,0.456436,-2.769601,-1.788361,1.231889,0.759069,2.431257,2.630634,...,2.214206,-1.502594,0.625421,-2.427845,-1.713357,-2.374351,0.610758,1.002061,-4.196700,2.504651
332,18,2,-2.005347,-0.194305,-2.194956,0.045685,2.495613,0.896718,3.219953,3.406517,...,1.768550,-1.419464,1.698825,-2.001278,-2.731580,-0.655788,-0.457453,1.572262,-0.229156,1.472730
208,19,2,-1.276103,1.690252,-3.335120,0.009898,1.325665,1.929965,1.937541,2.304688,...,1.576046,-1.097818,0.126927,-0.779312,-0.711143,-0.681583,-0.408286,0.112151,-1.342880,1.994187
613,2,0,-1.795350,-2.787090,-1.293865,-1.752650,-1.708075,0.448450,0.091800,0.114200,...,-0.123060,0.425918,-0.087355,-1.415450,-0.620780,-0.382910,0.109200,5.222150,-3.404770,1.040920


In [11]:
# Display the training labels.
y_train

Unnamed: 0,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism,IsHate
29,0,0,0,0,0,0,0,0,0,0,0,0,0
535,1,1,0,1,1,0,0,0,0,0,0,0,1
695,1,1,0,0,0,0,0,0,0,0,0,0,1
557,1,1,0,0,0,0,0,0,0,0,0,0,1
836,1,1,0,1,0,1,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0,0,0,0,0,0,0,0,0,0,0,0,0
270,1,1,0,0,0,0,0,0,0,0,0,0,1
860,1,0,0,0,0,1,1,0,0,0,0,0,1
435,1,1,0,1,0,0,0,0,0,0,0,0,1


In [12]:
# Display the test labels.
y_test

Unnamed: 0,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism,IsHate
521,0,0,0,0,0,0,0,0,0,0,0,0,0
737,0,0,0,0,0,0,0,0,0,0,0,0,0
740,0,0,0,0,0,0,0,0,0,0,0,0,0
660,1,1,0,1,0,0,0,0,0,0,0,0,1
411,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,0,0,0,0,0,0,0,0,0,0,0,0,0
332,1,1,0,0,0,0,0,0,0,0,0,0,1
208,0,0,0,0,0,0,0,0,0,0,0,0,0
613,1,1,0,0,0,0,0,0,0,0,0,0,1


In [13]:
# Save the training and test sets as CSV files under `ruta`.
import os

ruta = "train_test/"

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs(ruta, exist_ok=True)

X_train_name = ruta + "Embeddings_X_train.csv"
X_test_name = ruta + "Embeddings_X_test.csv"
y_train_name = ruta + "Embeddings_y_train.csv"
y_test_name = ruta + "Embeddings_y_test.csv"

# The index is written as the first CSV column (to_csv default), which
# preserves the original row labels of each split.
X_train.to_csv(X_train_name)
X_test.to_csv(X_test_name)
y_train.to_csv(y_train_name)
y_test.to_csv(y_test_name)