<a href="https://colab.research.google.com/github/kevilamorais/pln/blob/main/toxic_discourse_dl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
!pip install datasets
!pip install emoji

!python -m spacy download pt_core_news_sm 

In [None]:
import spacy
import re
import numpy as np
import matplotlib.pyplot as plt
import emoji
import os

from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import ModelCheckpoint
from datasets import load_dataset
from unicodedata import normalize
from tqdm import tqdm
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

## Dataset: https://huggingface.co/datasets/told-br

In [None]:
# Fetch the dataset registered under the 'told-br' identifier; the result is
# indexed below by the 'train', 'validation' and 'test' split names.
dataset = load_dataset(path='told-br')

# Bare expression so the notebook renders a summary of the loaded object.
dataset

In [None]:
# Maximum number of examples kept per split; any value <= 0 keeps every row.
n_rows = 1000

# Pull the text and label columns out of each split.
tweets_train, labels_train = dataset['train']['text'], dataset['train']['label']
tweets_validation, labels_validation = dataset['validation']['text'], dataset['validation']['label']
tweets_test, labels_test = dataset['test']['text'], dataset['test']['label']

# Optionally keep only the first `n_rows` examples of every split.
if n_rows > 0:
  tweets_train, labels_train = tweets_train[:n_rows], labels_train[:n_rows]
  tweets_validation, labels_validation = tweets_validation[:n_rows], labels_validation[:n_rows]
  tweets_test, labels_test = tweets_test[:n_rows], labels_test[:n_rows]

# Split sizes after the optional truncation.
print(f'\nTrain: {len(tweets_train)}')
print(f'Validation: {len(tweets_validation)}')
print(f'Test: {len(tweets_test)}')

# Number of examples per label value in each split.
print(f'\n\nLabels Distribution Train: {Counter(labels_train)}')
print(f'Labels Distribution Validation: {Counter(labels_validation)}')
print(f'Labels Distribution Test: {Counter(labels_test)}')

In [None]:
# Show the first raw tweet of each split, separated by blank lines.
print(
    f'Raw Tweet Train: {tweets_train[0]}',
    f'\nRaw Tweet Validation: {tweets_validation[0]}',
    f'\nRaw Tweet Test: {tweets_test[0]}',
    sep='\n',
)

In [None]:
# Loaded spaCy pipelines keyed by model name, so repeated calls to
# preprocessar_tweets do not reload the model from disk each time.
_NLP_CACHE = {}


def _get_nlp(model_name='pt_core_news_sm'):
    """Return the spaCy pipeline `model_name`, loading it only on first use."""
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def preprocessar_tweets(tweets):
    """Clean and lemmatize a sequence of tweets.

    For every tweet, in order:
      1. remove URLs and @mentions;
      2. replace emojis by their Portuguese textual names (underscores
         become spaces);
      3. strip accents / non-ASCII characters via NFKD normalization;
      4. collapse any character repeated three or more times into one;
      5. collapse runs of whitespace;
      6. run the spaCy pipeline and keep the lower-cased lemma of every token
         that is not punctuation, not a stop word and whose lemma is longer
         than one character.

    Args:
        tweets: sized iterable of strings (``len(tweets)`` is used for the
            progress bar).

    Returns:
        list[str]: one space-joined string of lemmas per input tweet, in the
        same order. A tweet with no surviving token yields ``''``.
    """
    nlp = _get_nlp()
    # \S+ consumes the whole URL; \w+ would stop at the first '.' or '/'.
    url_pattern = re.compile(r'https?://\S+')
    mention_pattern = re.compile(r'@\w+')
    repetition_pattern = re.compile(r'(.)\1\1+')
    whitespace_pattern = re.compile(r'\s\s+')

    new_tweets = []
    for tweet in tqdm(tweets, total=len(tweets), colour='green', desc='Processando'):
        # URLs and mentions are removed before any other rewrite: replacing
        # '_' first would split handles such as @some_user and leave the
        # tail behind, and collapsing repeats first would mangle URLs.
        tweet = url_pattern.sub('', tweet)
        tweet = mention_pattern.sub(' ', tweet)
        tweet = emoji.demojize(tweet, language='pt')
        tweet = tweet.replace('_', ' ')
        tweet = normalize('NFKD', tweet).encode('ASCII', 'ignore').decode('ASCII')
        tweet = repetition_pattern.sub(r'\1', tweet)
        tweet = whitespace_pattern.sub(' ', tweet)
        doc = nlp(tweet)
        tokens = [
            t.lemma_.lower()
            for t in doc
            if t.pos_ != 'PUNCT' and not t.is_stop and len(t.lemma_) > 1
        ]
        new_tweets.append(' '.join(tokens).strip())
    return new_tweets

In [None]:
# Run the cleaning/lemmatization step over the three splits, in the order
# train -> validation -> test, replacing the raw texts with the cleaned ones.
tweets_train, tweets_validation, tweets_test = [
    preprocessar_tweets(split_tweets)
    for split_tweets in (tweets_train, tweets_validation, tweets_test)
]

# First preprocessed tweet of each split, for a quick visual check.
print(f'\n\nPreprocessed Tweet Train: {tweets_train[0]}')
print(f'Preprocessed Tweet Validation: {tweets_validation[0]}')
print(f'Preprocessed Tweet Test: {tweets_test[0]}')

In [None]:
# The tokenizer is fitted on the training tweets only; '<oov>' is the token
# configured for words it has not seen.
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(tweets_train)

# Encode every split as lists of integer word indices with the same tokenizer.
X_train, X_val, X_test = (
    tokenizer.texts_to_sequences(split_tweets)
    for split_tweets in (tweets_train, tweets_validation, tweets_test)
)

# First encoded tweet of each split.
print(f'\n\nSeq. Tweet Train: {X_train[0]}')
print(f'Seq. Tweet Validation: {X_val[0]}')
print(f'Seq. Tweet Test: {X_test[0]}')

In [None]:
# Length of the longest training sequence; every split is padded to it.
max_len = max(map(len, X_train))

print(max_len)

# Pad at the end ('post') so that all sequences share the length `max_len`.
X_train, X_val, X_test = (
    pad_sequences(sequences, maxlen=max_len, padding='post')
    for sequences in (X_train, X_val, X_test)
)

# First padded sequence of each split.
print(f'\n\nSeq. Padded Tweet Train: {X_train[0]}')
print(f'Seq. Padded Tweet Validation: {X_val[0]}')
print(f'Seq. Padded Tweet Test: {X_test[0]}')

In [None]:
def build_simple_model(max_len, num_classes):
  """Build and compile a dense classifier with one hidden layer.

  The network takes vectors of length `max_len`, applies a 256-unit ReLU
  layer and ends in a `num_classes`-unit sigmoid layer. It is compiled with
  the Adam optimizer, binary cross-entropy loss and the accuracy metric.

  Args:
    max_len: number of features per input example.
    num_classes: number of output units.

  Returns:
    The compiled Keras `Sequential` model.
  """
  # An explicit Input layer replaces the deprecated `input_shape=` argument on
  # the first Dense layer while still building the model eagerly, so
  # `model.summary()` works before training.
  # NOTE(review): sigmoid + binary_crossentropy scores each class
  # independently; if the classes are mutually exclusive, softmax +
  # categorical_crossentropy is the usual choice. Left unchanged here to keep
  # training behaviour as it was.
  model = models.Sequential([
      layers.Input(shape=(max_len,)),
      layers.Dense(units=256, activation='relu'),
      layers.Dense(units=num_classes, activation='sigmoid'),
  ])
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return model

def build_simple_model_embedding(max_len, vocab_size, embedding_dim, num_classes):
   """Build and compile an embedding + dense classifier.

   The network embeds sequences of `max_len` integer word indices into
   `embedding_dim`-dimensional vectors, flattens them, applies a 64-unit ReLU
   layer and ends in a `num_classes`-unit sigmoid layer. It is compiled with
   the Adam optimizer, binary cross-entropy loss and the accuracy metric.

   Args:
     max_len: length of every (padded) input sequence.
     vocab_size: size of the embedding table (`input_dim` of the Embedding).
     embedding_dim: size of each embedding vector.
     num_classes: number of output units.

   Returns:
     The compiled Keras `Sequential` model.
   """
   # The sequence length is declared through an explicit Input layer instead
   # of the deprecated `input_length=` Embedding argument. Flatten and Dense
   # need a known length for the model to be built before training.
   # NOTE(review): sigmoid + binary_crossentropy scores each class
   # independently; if the classes are mutually exclusive, softmax +
   # categorical_crossentropy is the usual choice. Left unchanged here to keep
   # training behaviour as it was.
   model = models.Sequential([
       layers.Input(shape=(max_len,), dtype='int32'),
       layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
       layers.Flatten(),
       layers.Dense(units=64, activation='relu'),
       layers.Dense(units=num_classes, activation='sigmoid'),
   ])
   model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
   return model

In [None]:
# One more than the number of words known to the tokenizer.
vocab_size = 1 + len(tokenizer.word_index)
num_classes = 2

print('\nVocab size:', vocab_size)

# One-hot encode the integer labels of the splits used during training.
y_train, y_val = (
    to_categorical(split_labels, num_classes=num_classes)
    for split_labels in (labels_train, labels_validation)
)

# Original label next to its one-hot encoding, for the first example.
print('\nTrain Labels:', labels_train[0], '-', y_train[0])
print('Validation Labels:', labels_validation[0], '-', y_val[0])

In [None]:
# Which architecture to train: 1 = dense baseline, 2 = embedding + dense.
# Options 3 and 4 are reserved but have no model attached yet.
model_option = 2

if model_option == 1:
  model = build_simple_model(max_len, num_classes)
elif model_option == 2:
  model = build_simple_model_embedding(max_len, vocab_size, embedding_dim=100,
                                       num_classes=num_classes)
elif model_option in (3, 4):
  # Fail loudly instead of silently falling through: otherwise `model` would
  # be undefined, or worse, a stale model left over from a previous run.
  raise NotImplementedError(f'model_option={model_option} has no model implemented yet')
else:
  raise ValueError(f'Unknown model_option: {model_option!r} (expected 1, 2, 3 or 4)')

model.summary()

In [None]:
# Directory that holds the best weights seen during training.
checkpoint_dir = '/content/model_checkpoint/'

os.makedirs(checkpoint_dir, exist_ok=True)

# ModelCheckpoint needs a file path, not a directory. With
# save_weights_only=True, current Keras requires the '.weights.h5' suffix;
# older tf.keras versions accept it too and write an HDF5 weights file.
checkpoint_path = os.path.join(checkpoint_dir, 'best.weights.h5')

# Keep only the weights of the epoch with the highest validation accuracy.
model_checkpoint = ModelCheckpoint(filepath=checkpoint_path,
                                   save_weights_only=True, monitor='val_accuracy',
                                   mode='max', save_best_only=True)

history = model.fit(X_train, y_train, batch_size=128, epochs=10,
                    validation_data=(X_val, y_val), callbacks=[model_checkpoint])

# Restore the best checkpoint so later cells evaluate those weights rather
# than the weights of the last epoch.
model.load_weights(checkpoint_path)

In [None]:
# Training vs. validation accuracy per epoch, as recorded by model.fit.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.plot(history.history['val_accuracy'])
ax.set_title('Model Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(['train', 'validation'], loc='upper left')

plt.show()

In [None]:
# Training vs. validation loss per epoch, as recorded by model.fit.
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_title('Model Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend(['train', 'validation'], loc='upper left')

plt.show()

In [None]:
# Model outputs for the test split: one row per example.
class_scores = model.predict(X_test)

# Predicted class = index of the highest output along each row.
y_pred = class_scores.argmax(axis=1)

In [None]:
# Text report comparing test labels with predictions; zero_division=0 is
# passed through to scikit-learn unchanged.
report = classification_report(y_true=labels_test, y_pred=y_pred, zero_division=0)

print(report)

In [None]:
# Confusion matrix of test labels against predicted classes.
ConfusionMatrixDisplay.from_predictions(y_true=labels_test, y_pred=y_pred)

plt.show()