## 1. Importing Libraries


In [29]:
import string
import pandas as pd 
import tensorflow as tf

from keras.layers import TextVectorization
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import LabelEncoder
import keras


## 2. Loading and Exploring the Data

In [30]:
test = pd.read_csv('data/cyberbullying_test.csv')
train = pd.read_csv('data/cyberbullying_train.csv')
val = pd.read_csv('data/cyberbullying_val.csv')

In [31]:
test['target'] = 'test'
train['target'] = 'train'
val['target'] = 'val'

In [32]:
df = pd.concat([test,train,val],axis=0).reset_index(drop=True)

## 3. Data Cleaning and Preparation

In [33]:
punc= string.punctuation

df['tweet_text'] = df['tweet_text'].str.lower().str.strip().replace(f'[{punc}]','',regex=True)

In [34]:
df = df[df['tweet_text'] != '']

In [35]:
lblencoder = LabelEncoder()
df['cyberbullying_type'] = lblencoder.fit_transform(df['cyberbullying_type'])

## 4. Scaling or Normalizing Data


In [36]:
train = df[df['target'] == 'train'].drop(columns='target').reset_index(drop=True)
test = df[df['target'] == 'test'].drop(columns='target').reset_index(drop=True)
val = df[df['target'] == 'val'].drop(columns='target').reset_index(drop=True)

In [37]:
x_train = train['tweet_text'].values
y_train = train['cyberbullying_type'].values

x_test = test['tweet_text'].values
y_test = test['cyberbullying_type'].values

x_val = val['tweet_text'].values
y_val = val['cyberbullying_type'].values

In [38]:
# Crear datasets de TensorFlow
raw_train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
raw_test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
raw_val_ds = tf.data.Dataset.from_tensor_slices((x_val, y_val))

In [39]:
vocab_size = 30000

vectorize_layer = TextVectorization(
    standardize=None,
    max_tokens=vocab_size,
    output_mode='tf-idf'
)

# Obtener el texto sin etiquetas
text_ds = raw_train_ds.map(lambda x, y: x)

start = time.time()
vectorize_layer.adapt(text_ds)
print(f'Time for adapt is {time.time()-start:.4f}')

Time for adapt is 383.9579


In [40]:
batch_size = 16
raw_train_ds = raw_train_ds.shuffle(20000).batch(batch_size)
raw_val_ds = raw_val_ds.batch(batch_size)
raw_test_ds = raw_test_ds.batch(batch_size)

In [41]:
for inp, target in raw_train_ds.take(1):
    print(inp[:2], target[:2])
    
vectorize_layer(inp)

tf.Tensor(
[b'thankyou mr president we feel blessed  proud to hv u as our closest friend america is stronger amp safer under ur govt just like india is under narendra modi yes together we will beat the chinese virus amp the radical islamic terrorism god bless india amp america forever'
 b'yeah these cops probably did disrespect their parents bunch of school bullies probably they had to become a cop to fee empowered what losers'], shape=(2,), dtype=string) tf.Tensor([5 0], shape=(2,), dtype=int32)


<tf.Tensor: shape=(16, 30000), dtype=float32, numpy=
array([[ 0.       ,  2.467025 ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [ 0.       ,  0.       ,  1.3330971, ...,  0.       ,  0.       ,
         0.       ],
       [ 8.892809 ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       ...,
       [17.785618 ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [ 8.892809 ,  1.2335125,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [17.785618 ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ]], dtype=float32)>

In [42]:
def preprocess(x,y):
    x = vectorize_layer(x)  # Vectorizar el texto
    y = tf.one_hot(y, depth=6)  # Convertir las etiquetas a one-hot
    return x,y

train_ds = raw_train_ds.map(lambda x,y: preprocess(x,y))
test_ds = raw_test_ds.map(lambda x,y: preprocess(x,y))
val_ds = raw_val_ds.map(lambda x,y: preprocess(x,y))

In [43]:
next(iter(train_ds))

(<tf.Tensor: shape=(16, 30000), dtype=float32, numpy=
 array([[ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
          0.       ],
        [ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
          0.       ],
        [53.356853 ,  0.       ,  0.       , ...,  0.       ,  0.       ,
          0.       ],
        ...,
        [ 0.       ,  0.       ,  1.3330971, ...,  0.       ,  0.       ,
          0.       ],
        [ 0.       ,  1.2335125,  1.3330971, ...,  0.       ,  0.       ,
          0.       ],
        [ 0.       ,  1.2335125,  1.3330971, ...,  0.       ,  0.       ,
          0.       ]], dtype=float32)>,
 <tf.Tensor: shape=(16, 6), dtype=float32, numpy=
 array([[0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 

## 5. Model Building and Training


In [44]:
@keras.saving.register_keras_serializable()
class Linear(keras.layers.Layer):
    def __init__(self, num_outputs, activation=None):
        super().__init__()
        self.num_outputs = num_outputs
        self.activation = keras.activations.get(activation)

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(
            shape=[input_dim, self.num_outputs], 
            name="kernel", 
            regularizer=keras.regularizers.l2(0.01)  # Añadir regularización L2
        )
        self.b = self.add_weight(shape=[self.num_outputs], name="bias")


    def call(self, inputs):
        x = tf.matmul(inputs, self.w) + self.b  # Asegúrate de usar tf.matmul
        return self.activation(x)

In [45]:
@keras.saving.register_keras_serializable()
class Model(keras.Model):
    def __init__(self, activation, dropout_rate=0.7):  # Ajusta la tasa de dropout si es necesario
        super().__init__()
        self.l1 = Linear(64, activation)  # Reducir la primera capa a 64 neuronas
        self.dropout1 = keras.layers.Dropout(dropout_rate)  # Dropout después de la primera capa
        self.l2 = Linear(6, activation='softmax')  # Capa de salida para 6 clases

    def call(self, x):
        x = self.l1(x)
        x = self.dropout1(x)  # Aplicar Dropout
        return self.l2(x)


### relu - 0.001

In [46]:
model = Model('leaky_relu')

In [47]:
# Compilar el modelo
model.compile(
    loss=keras.losses.CategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.0005),  # Reducir la tasa de aprendizaje
    metrics=["accuracy"]
)


early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", 
    patience=5, 
    restore_best_weights=True
)

# Entrenar el modelo
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,  # Aumentar las épocas
    callbacks=[early_stopping],  # Añadir early stopping
)

Epoch 1/10
[1m2087/2087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 53ms/step - accuracy: 0.7460 - loss: 1.0596 - val_accuracy: 0.8320 - val_loss: 0.8245
Epoch 2/10
[1m2087/2087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 54ms/step - accuracy: 0.8461 - loss: 0.7890 - val_accuracy: 0.8235 - val_loss: 0.8272
Epoch 3/10
[1m2087/2087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 54ms/step - accuracy: 0.8457 - loss: 0.7930 - val_accuracy: 0.8260 - val_loss: 0.8044
Epoch 4/10
[1m1426/2087[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m32s[0m 49ms/step - accuracy: 0.8457 - loss: 0.7734

In [22]:
# Guardar los pesos del mejor modelo
model.save_weights("best_model.weights.h5")  # Guarda solo los pesos

## 6. Model Evaluation

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Cargar solo los pesos del mejor modelo
loaded_model = Model(activation='leaky_relu')  # Crear una nueva instancia del modelo
loaded_model.load_weights("best_model.weights.h5")  # Cargar los pesos

# Compilar el modelo antes de evaluarlo
loaded_model.compile(
    loss=keras.losses.CategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.01),
    metrics=["accuracy"]
)

# Evaluar el modelo en el conjunto de prueba
test_loss, test_accuracy = loaded_model.evaluate(test_ds)
print(f"Loss en test: {test_loss}, Accuracy en test: {test_accuracy}")

In [None]:
# Obtener las predicciones del conjunto de prueba
predictions = loaded_model.predict(test_ds)

# Convertir las predicciones a clases
predicted_classes = np.argmax(predictions, axis=1)

# Obtener las etiquetas verdaderas
true_classes = np.concatenate([y for _, y in test_ds], axis=0)  # Asegúrate de que test_ds contenga los labels
true_classes = np.argmax(true_classes, axis=1)

In [None]:
# Crear la matriz de confusión
cm = confusion_matrix(true_classes, predicted_classes)

# Mostrar la matriz de confusión
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title("Matriz de Confusión")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(test['cyberbullying_type'],predicted_classes)

In [None]:
import matplotlib.pyplot as plt

# Graficar pérdida
plt.figure(figsize=(12, 4))

# Pérdida
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Pérdida de Entrenamiento')
plt.plot(history.history['val_loss'], label='Pérdida de Validación')
plt.title('Pérdida durante el Entrenamiento')
plt.xlabel('Épocas')
plt.ylabel('Pérdida')
plt.legend()

# Precisión
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Precisión de Entrenamiento')
plt.plot(history.history['val_accuracy'], label='Precisión de Validación')
plt.title('Precisión durante el Entrenamiento')
plt.xlabel('Épocas')
plt.ylabel('Precisión')
plt.legend()

plt.tight_layout()
plt.show()
