## 1. Importing Libraries


In [95]:
import string
import pandas as pd 
import tensorflow as tf

from keras.layers import TextVectorization
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import LabelEncoder
import keras


## 2. Loading and Exploring the Data

In [44]:
# Read the three pre-split CSV files into DataFrames (test, train, validation).
test, train, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/cyberbullying_test.csv',
        'data/cyberbullying_train.csv',
        'data/cyberbullying_val.csv',
    )
)

In [45]:
# Record in a 'target' column which split each row belongs to, so the frames
# can be concatenated for shared cleaning and told apart again afterwards.
for split_name, split_frame in (('test', test), ('train', train), ('val', val)):
    split_frame['target'] = split_name

In [46]:
# Stack the three splits row-wise into one frame with a fresh 0..n-1 index.
df = pd.concat([test, train, val], axis=0, ignore_index=True)

## 3. Data Cleaning and Preparation

In [47]:
punc = string.punctuation

# Lowercase, delete every ASCII punctuation character, then trim whitespace.
#
# A translation table replaces the previous regex character class f'[{punc}]':
# inside that class the `\]` sequence of string.punctuation is parsed as an
# escaped `]`, so backslashes were never removed from the tweets.
#
# Trimming runs last so that whitespace exposed by deleting leading/trailing
# punctuation (e.g. "hello !") is stripped too, and punctuation-only tweets
# collapse to the empty string.
punc_table = str.maketrans('', '', punc)
df['tweet_text'] = (
    df['tweet_text']
    .str.lower()
    .str.translate(punc_table)
    .str.strip()
)

In [48]:
# Keep only rows that still contain text after cleaning.
# - Missing values are dropped as well: NaN compares unequal to '' and would
#   otherwise pass the filter (presumably such rows cannot be vectorized
#   later -- confirm against the data).
# - .copy() makes the result an independent frame, so assigning columns to it
#   afterwards does not trigger pandas' SettingWithCopyWarning.
has_text = df['tweet_text'].notna() & (df['tweet_text'] != '')
df = df[has_text].copy()

In [49]:
# Fit the encoder on the class names, then overwrite the column with the
# integer ids it assigns. The fitted encoder is kept for later use.
lblencoder = LabelEncoder()
lblencoder.fit(df['cyberbullying_type'])
df['cyberbullying_type'] = lblencoder.transform(df['cyberbullying_type'])

## 4. Scaling or Normalizing Data


In [50]:
# Undo the earlier concatenation: select the rows of each split through the
# 'target' marker column, drop that column, and renumber the rows from 0.
split_frames = {
    split_name: (
        df[df['target'] == split_name]
        .drop(columns='target')
        .reset_index(drop=True)
    )
    for split_name in ('train', 'test', 'val')
}
train = split_frames['train']
test = split_frames['test']
val = split_frames['val']

In [51]:
# For every split, pull out the tweet text (features, x_*) and the encoded
# class column (labels, y_*) as arrays.
x_train, y_train = train['tweet_text'].values, train['cyberbullying_type'].values
x_test, y_test = test['tweet_text'].values, test['cyberbullying_type'].values
x_val, y_val = val['tweet_text'].values, val['cyberbullying_type'].values

In [52]:
# Create TensorFlow datasets whose elements are (text, label) pairs
make_dataset = tf.data.Dataset.from_tensor_slices
raw_train_ds = make_dataset((x_train, y_train))
raw_test_ds = make_dataset((x_test, y_test))

In [None]:
vocab_size = 10000

# The tweets were already lower-cased and stripped of punctuation during
# cleaning, so the layer's built-in standardization is switched off.
# 'tf_idf' is the documented spelling of the output mode; the previous
# 'tf-idf' is only a legacy alias.
vectorize_layer = TextVectorization(
    standardize=None,
    max_tokens=vocab_size,
    output_mode='tf_idf')

# Get the text without the labels
text_ds = raw_train_ds.map(lambda x, y: x)
start = time.time()
# Feed adapt() batches instead of single elements: the vocabulary/IDF
# statistics are accumulated per document either way, but iterating one tweet
# at a time is needlessly slow. text_ds itself stays unbatched.
vectorize_layer.adapt(text_ds.batch(1024))
print(f'Time for adapt is {time.time()-start:.4f}')

In [54]:
batch_size = 16

# Only the training data is shuffled (buffer of 20000 elements) before being
# grouped into batches; the test data is batched in its existing order.
shuffled_train_ds = raw_train_ds.shuffle(20000)
raw_train_ds = shuffled_train_ds.batch(batch_size)
raw_test_ds = raw_test_ds.batch(batch_size)

In [None]:
# Peek at a single training batch: print its first two texts and labels.
one_batch = raw_train_ds.take(1)
for inp, target in one_batch:
    print(inp[:2], target[:2])

# Run the vectorizer on that batch; as the last expression of a notebook
# cell its result is what gets displayed.
vectorize_layer(inp)

In [97]:
def preprocess(x, y, num_classes=6):
    """Turn a batch of (text, label) into (vectorized text, one-hot label).

    Args:
        x: Batch of tweet strings; passed through ``vectorize_layer``.
        y: Batch of integer class ids.
        num_classes: Width of the one-hot encoding. Defaults to 6, the value
            that was previously hard-coded.

    Returns:
        Tuple ``(x, y)`` with ``x`` vectorized and ``y`` one-hot encoded.
    """
    x = vectorize_layer(x)  # Vectorize the text
    y = tf.one_hot(y, depth=num_classes)  # Convert the labels to one-hot
    return x, y


# preprocess already takes (x, y), so it can be handed to map() directly.
train_ds = raw_train_ds.map(preprocess)
test_ds = raw_test_ds.map(preprocess)

In [None]:
# Fetch the first preprocessed training batch to inspect it.
batch_iterator = iter(train_ds)
next(batch_iterator)

## 5. Model Building and Training


In [None]:
class Linear(keras.layers.Layer):
    """Fully connected layer computing ``activation(inputs @ w + b)``.

    Args:
        num_outputs: Size of the last axis of the layer's output.
        activation: Any identifier accepted by ``keras.activations.get``
            (name, callable or ``None``).
        **kwargs: Forwarded to ``keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, num_outputs, activation=None, **kwargs):
        super().__init__(**kwargs)
        self.num_outputs = num_outputs
        self.activation = keras.activations.get(activation)

    def build(self, input_shape):
        """Create the weights once the size of the last input axis is known."""
        input_dim = input_shape[-1]
        self.w = self.add_weight(
            shape=(input_dim, self.num_outputs),
            initializer="glorot_uniform",
            name="kernel",
        )
        # The bias gets its own name (it used to be registered as a second
        # "kernel") and starts at zero instead of falling back to the default
        # initializer.
        self.b = self.add_weight(
            shape=(self.num_outputs,),
            initializer="zeros",
            name="bias",
        )

    def call(self, inputs):
        """Apply the affine transform followed by the activation."""
        x = keras.ops.matmul(inputs, self.w) + self.b
        return self.activation(x)

In [None]:
class Model(keras.Model):
    """Two-layer network: ``Linear(3, activation)`` followed by a linear head.

    Args:
        activation: Activation identifier for the hidden layer.
        num_classes: Number of values the model emits per example. Defaults
            to 6 so the output lines up with the depth-6 one-hot labels built
            in this file; the head previously emitted a single value, which
            the loss then broadcast against all six label columns.
    """

    def __init__(self, activation, num_classes=6):
        super().__init__()
        self.l1 = Linear(3, activation)
        self.l2 = Linear(num_classes)

    def call(self, x):
        """Return one raw (un-activated) score per class for each input row."""
        x = self.l1(x)
        x = self.l2(x)
        return x

In [None]:
# Neither `model` nor `val_ds` is created anywhere in the visible notebook, so
# both are built here; without them this cell fails with NameError.
# NOTE(review): "relu" is a placeholder choice for the hidden activation --
# confirm the intended value.
model = Model(activation="relu")

# Validation data goes through the same batching and preprocessing as the
# training and test sets (no shuffling needed for evaluation).
val_ds = (
    tf.data.Dataset.from_tensor_slices((x_val, y_val))
    .batch(batch_size)
    .map(preprocess)
)

# NOTE(review): mean squared error is kept as in the original cell; for
# one-hot class labels a categorical cross-entropy loss is the usual choice --
# confirm before changing, since it must match the model's output shape.
model.compile(
    loss=keras.losses.MeanSquaredError(),
    optimizer=keras.optimizers.SGD(learning_rate=0.001),
    metrics=["accuracy"]
)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
)

## 6. Model Evaluation