<a href="https://colab.research.google.com/github/kgreed4/no_hate_transformer/blob/kgreed/bert_encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from transformers import BertForSequenceClassification
import matplotlib.pyplot as plt

In [None]:
# Load the dataset
df = pd.read_csv('cleaned_data_sw.csv')

# Drop every column that isn't tweet or class
df = df.drop(df.columns.difference(['tweet', 'class']), axis=1)

# First, we want to use the BERT tokenizer to tokenize and encode the dataset into embeddings
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# We will create a small function that will take care of tokenization and encoding
def encode_texts(tokenizer, texts, max_length):
    encoding = tokenizer.batch_encode_plus(
        texts,
        # This is required to add special tokens such as the [CLS] and [SEP] tokens that indicate the start and end of a sentence
        add_special_tokens=True,
        # Here the padding variable is responsible for padding the sequences to the same length
        padding='max_length',
        # The max length of the tokenized sequences
        max_length=max_length,
        return_attention_mask=True,
        # Here we specify that we want the output to be TensorFlow tensors
        return_tensors='tf',
        # If the sequence is longer than max_length, it will be truncated to a fixed length
        truncation=True
    )
    # The encoding['input_ids'] contains the tokenized sequences
    # The encoding['attention_mask'] contains the attention masks and tells the model which tokens to pay attention to and which ones to ignore (mask token)
    return encoding['input_ids'], encoding['attention_mask']

# Here we define the maximum length (randomly chosen per ChatGPT's recommendation)
max_length = 128

# We can then call the function to tokenize and encode the dataset
input_ids, attention_masks = encode_texts(tokenizer, df['tweet'].tolist(), max_length)

# Here we create labels from the 'class' column
# This is the target variable that we want to predict
labels = tf.convert_to_tensor(df['class'].values, dtype=tf.int32).numpy()

# For some reason, I was getting an error saying that I needed to convert to NumPy arrays instead of TensorFlow tensors
# So I converted the input_ids and attention_masks to NumPy arrays
input_ids_np = input_ids.numpy()
attention_masks_np = attention_masks.numpy()

# We can then use the train_test_split function from scikit-learn to split the dataset into training and validation sets
train_inputs_np, validation_inputs_np, train_labels_np, validation_labels_np = train_test_split(input_ids_np, labels, random_state=2021, test_size=0.1)
train_masks_np, validation_masks_np, _, _ = train_test_split(attention_masks_np, labels, random_state=2021, test_size=0.1)

# Here the BUFFER_SIZE is the number of training inputs
BUFFER_SIZE = len(train_inputs_np)
# The batch size is 32
BATCH_SIZE = 32

# We can then take the NumPy arrays and convert them to TensorFlow datasets for both the training and validation sets
train_dataset = tf.data.Dataset.from_tensor_slices(((train_inputs_np, train_masks_np), train_labels_np)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
validation_dataset = tf.data.Dataset.from_tensor_slices(((validation_inputs_np, validation_masks_np), validation_labels_np)).batch(BATCH_SIZE)

In [28]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def train_bert_sequence_classification(train_dataset, validation_dataset, num_epochs=2):
    # Initialize the BERT model for sequence classification
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

    # Define the optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)

    # Define the loss function
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # Compile the model
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    # Train the model
    model.fit(train_dataset, epochs=num_epochs, validation_data=validation_dataset)

    # Evaluate the model on training dataset
    train_loss, train_accuracy = model.evaluate(train_dataset)
    print(f"Training Accuracy: {train_accuracy}")
    print(f"Training Loss: {train_loss}")

# Train the BERT model
train_bert_sequence_classification(train_dataset, validation_dataset)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2
Training Accuracy: 0.8331689238548279
Training Loss: 0.45084381103515625
