In [None]:
#!pip install -r bert_requirements.txt

In [None]:
import torch
print(f'PyTorch version: {torch.__version__}')
print('*'*10)
print(f'_CUDA version: ')
!nvcc --version
print('*'*10)
print(f'CUDNN version: {torch.backends.cudnn.version()}')
print(f'Available GPU devices: {torch.cuda.device_count()}')
print(f'Device Name: {torch.cuda.get_device_name()}')

In [None]:
import numpy as np  # NumPy: array handling and efficient numerical operations.
import pandas as pd  # pandas: data manipulation and analysis with flexible data structures.
import time  # time: functions for working with time, e.g. measuring how long code takes to run.
import datetime  # datetime: classes for working with dates and times.
import warnings  # warnings: control how warnings are handled.
import torch  # PyTorch: open-source deep-learning framework used to build and train the model.
import torch.nn as nn  # torch.nn: classes and functions for building neural networks.
from sklearn.model_selection import train_test_split  # Splits the data into training and held-out sets.
from sklearn.metrics import classification_report  # Detailed per-class performance report for a classifier.
import transformers  # Transformers: library for natural language processing (NLP) models.
from transformers import AutoModel, BertTokenizerFast  # AutoModel loads a pretrained language model; BertTokenizerFast tokenizes text for BERT.
import pickle  # pickle: serialization / deserialization of Python objects to and from disk.
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler  # Dataset wrapper, batch loader, and random / sequential samplers.
from transformers import AdamW  # AdamW: optimizer used to update the model weights during training.
from sklearn.utils.class_weight import compute_class_weight  # Computes class weights to compensate for class imbalance.
from tqdm.auto import tqdm  # tqdm: progress bars for loops and long-running tasks.

# Silence every warning for the rest of the notebook (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Show whether PyTorch can use a CUDA device in this session.
torch.cuda.is_available()

In [None]:
# Alternative source kept for reference (same read options):
#   './Sentiment/training.1600000.processed.noemoticon.csv'
csv_path = './definitive_dataset.csv'

# Comma-separated file decoded as ISO-8859-1.
df = pd.read_csv(csv_path, sep=',', encoding='ISO-8859-1')

In [None]:
# Preview the first rows of the frame as loaded from the CSV.
df.head()

In [None]:
# Assign the column names in place; the frame must have exactly four columns.
# The commented-out layout is presumably the one for the
# training.1600000.processed.noemoticon.csv dataset — TODO confirm.
#df.columns=['sentiment','id','date','query','username','text']
df.columns=['player', 'text', 'media', 'sentiment']

In [None]:
# Preview the first rows again, now with the renamed columns.
df.head()

### 1 - not offensive
### 0 - offensive

In [None]:
# This is only needed for the training.1600000.processed.noemoticon.csv dataset, whose sentiment labels are 0 or 4;
# for the BERT model they have to be 0 or 1. The dataset which I'm using now is already labeled with 0s and 1s.
#df.loc[df['sentiment'] == 4, 'sentiment'] = 1

In [None]:
# Relative frequency of each sentiment label (normalize=True yields proportions, not counts).
df['sentiment'].value_counts(normalize = True)

In [None]:
# Stratified 70 / 15 / 15 split: first hold out 30% of the rows, then cut that
# held-out part in half to obtain the validation and test sets.
split_seed = 2018

train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['text'],
    df['sentiment'],
    test_size=0.3,
    stratify=df['sentiment'],
    random_state=split_seed,
)

val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=split_seed,
)

In [None]:
# The encoder weights and the tokenizer are loaded from the same pretrained checkpoint.
checkpoint_name = 'bert-base-uncased'

# pretrained BERT-base encoder
bert = AutoModel.from_pretrained(checkpoint_name)

# matching fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained(checkpoint_name)

In [None]:
# Whitespace-separated word count of every message in the train set
seq_len = list(map(lambda message: len(message.split()), train_text))

# Distribution of those word counts, in 30 bins
word_counts = pd.Series(seq_len)
word_counts.hist(bins=30)

In [None]:
# Every sequence is truncated / padded to exactly this many tokens.
MAX_SEQ_LEN = 25


def encode_texts(texts, max_length=MAX_SEQ_LEN):
    """Tokenize a pandas Series of strings into fixed-length BERT inputs.

    Args:
        texts: pandas Series of raw strings (converted with ``.tolist()``).
        max_length: number of tokens each sequence is truncated / padded to.

    Returns:
        The tokenizer's batch encoding; the notebook reads its ``'input_ids'``
        and ``'attention_mask'`` entries.
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=max_length,
        # padding='max_length' replaces the deprecated pad_to_max_length=True
        padding='max_length',
        truncation=True,
    )


# tokenize and encode sequences in the training, validation and test sets
tokens_train = encode_texts(train_text)
tokens_val = encode_texts(val_text)
tokens_test = encode_texts(test_text)

In [None]:
def _to_tensors(encoding, labels):
    """Return (input_ids, attention_mask, labels) as torch tensors for one data split."""
    return (
        torch.tensor(encoding['input_ids']),
        torch.tensor(encoding['attention_mask']),
        torch.tensor(labels.tolist()),
    )


# one (token ids, attention mask, labels) triple per split
train_seq, train_mask, train_y = _to_tensors(tokens_train, train_labels)
val_seq, val_mask, val_y = _to_tensors(tokens_val, val_labels)
test_seq, test_mask, test_y = _to_tensors(tokens_test, test_labels)

In [None]:
# mini-batch size shared by both loaders
# (notes from earlier runs: 160000_dataset size 256; 64 with model bert_trained3 acc 0.71
#  and with bert_trained2 acc 0.60-something)
batch_size = 32

# training set: tensors wrapped as a dataset and visited in random order
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)

# validation set: same wrapping, visited sequentially
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, sampler=val_sampler)

In [None]:
# Turn off gradient tracking for every parameter of the pretrained encoder.
for frozen_param in bert.parameters():
    frozen_param.requires_grad_(False)

In [None]:
class BERT_Arch(nn.Module):
    """Two-layer classification head on top of a BERT-style encoder.

    The encoder is called with ``return_dict=False`` and the second element of
    its tuple output (for Hugging Face ``BertModel`` this is the pooled output)
    is fed through Linear -> ReLU -> Dropout -> Linear -> LogSoftmax.

    Args:
        bert: encoder module called as ``bert(sent_id, attention_mask=mask, return_dict=False)``.
        hidden_size: width of the encoder output fed to the head. ``None``
            (default) reads ``bert.config.hidden_size`` and falls back to 768
            when the encoder exposes no such attribute.
        fc_size: width of the hidden dense layer (default 512).
        num_classes: number of output classes (default 2).
        dropout: dropout probability applied after the ReLU (default 0.1).
    """

    def __init__(self, bert, hidden_size=None, fc_size=512, num_classes=2, dropout=0.1):
        super().__init__()

        if hidden_size is None:
            # bert-base reports 768 here, which matches the previously hard-coded value
            encoder_config = getattr(bert, 'config', None)
            hidden_size = getattr(encoder_config, 'hidden_size', 768)

        self.bert = bert

        # dropout layer
        self.dropout = nn.Dropout(dropout)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(hidden_size, fc_size)

        # dense layer 2 (output layer)
        self.fc2 = nn.Linear(fc_size, num_classes)

        # log-softmax over the class dimension (pairs with nn.NLLLoss)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities of shape (batch, num_classes).

        Args:
            sent_id: token-id tensor passed to the encoder.
            mask: attention-mask tensor passed to the encoder.
        """
        # only the second element of the encoder's tuple output is used
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # log-probabilities over the classes
        return self.softmax(x)

In [None]:
# Wrap the pre-trained BERT in our architecture and move the result to the
# selected device (GPU when available, otherwise CPU).
model = BERT_Arch(bert).to(device)

In [None]:
# AdamW optimizer over the model's parameters
learning_rate = 1e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

In [None]:
# 'balanced' class weights derived from the training labels
label_values = np.unique(train_labels)
class_weights = compute_class_weight('balanced', classes=label_values, y=train_labels)

print("Class Weights:",class_weights)

In [None]:
# Print the shape of the training labels.
print(train_labels.shape)

In [None]:
# class weights as a float tensor, moved to the selected device
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# class-weighted negative log-likelihood loss
cross_entropy = nn.NLLLoss(weight=weights)

# number of training epochs (bert_trained2 used total_epochs = 15)
total_epochs = 10

In [None]:
# function to train the model
def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level ``model``, ``optimizer``, ``cross_entropy`` and
    ``device`` objects.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean of the
        per-batch losses and ``total_preds`` is a numpy array holding the model
        outputs of every batch concatenated along axis 0.
    """
    model.train()
    total_loss = 0

    # model outputs of every batch, collected as numpy arrays
    batch_preds = []
    num_batches = len(train_dataloader)

    # iterate over batches
    for step, batch in enumerate(train_dataloader):

        # progress update after every 50 batches
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

        # move the batch to the selected device
        sent_id, mask, labels = [r.to(device) for r in batch]

        # clear previously calculated gradients
        model.zero_grad()

        # forward pass and loss for the current batch
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # Store this batch's outputs on the CPU. This has to happen inside the
        # loop, otherwise only the last batch would be returned.
        batch_preds.append(preds.detach().cpu().numpy())

    # average training loss of the epoch
    avg_loss = total_loss / num_batches

    # stack the per-batch arrays into (number of samples, number of classes)
    total_preds = np.concatenate(batch_preds, axis=0)

    return avg_loss, total_preds

In [None]:
def format_time(seconds):
    """Format a duration given in seconds as an ``hh:mm:ss`` string.

    Each field is truncated to an integer and zero-padded to at least two digits.
    """
    whole_minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)
    return f"{int(hours):02d}:{int(minutes):02d}:{int(secs):02d}"

In [None]:
# function for evaluating the model
def evaluate(dataloader=None):
    """Compute the average loss and the model outputs over a dataloader.

    Uses the notebook-level ``model``, ``cross_entropy`` and ``device`` objects
    and switches the model to evaluation mode.

    Args:
        dataloader: iterable of ``(sent_id, mask, labels)`` tensor batches.
            Defaults to the notebook-level ``val_dataloader``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean of the
        per-batch losses and ``total_preds`` is a numpy array holding the model
        outputs of every batch concatenated along axis 0.
    """
    if dataloader is None:
        dataloader = val_dataloader

    print("\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    total_loss = 0

    # model outputs of every batch, collected as numpy arrays
    batch_preds = []
    num_batches = len(dataloader)

    # no gradients are needed for evaluation
    with torch.no_grad():
        for step, batch in enumerate(dataloader):

            # progress update every 50 batches
            if step % 50 == 0 and not step == 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

            # move the batch to the selected device
            sent_id, mask, labels = [t.to(device) for t in batch]

            # model outputs and loss for the current batch
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()

            batch_preds.append(preds.detach().cpu().numpy())

    # average loss over the batches
    avg_loss = total_loss / num_batches

    # stack the per-batch arrays into (number of samples, number of classes)
    total_preds = np.concatenate(batch_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# set initial loss to infinite
best_valid_loss = float('inf')

# training and validation loss of each epoch
train_losses=[]
valid_losses=[]

# for each epoch
for epoch in tqdm(range(total_epochs), total=total_epochs):

    # `total_epochs` is the configured epoch count; the former `epochs` name was never defined
    print('\n Epoch {:} / {:}'.format(epoch + 1, total_epochs))

    # train model
    train_loss, _ = train()

    # evaluate model
    valid_loss, _ = evaluate()

    # keep the weights of the epoch with the lowest validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

In [None]:
# load weights of best model
path = 'saved_weights.pt'
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load(path, map_location=device))

In [None]:
# get predictions for test data
# Dropout must be disabled for inference.
model.eval()

test_batch_preds = []
with torch.no_grad():
    # Score the test set in mini-batches so that it does not have to fit on the
    # device in a single forward pass.
    for start in range(0, len(test_seq), batch_size):
        stop = start + batch_size
        batch_out = model(test_seq[start:stop].to(device), test_mask[start:stop].to(device))
        test_batch_preds.append(batch_out.detach().cpu().numpy())

# (number of test samples, number of classes)
preds = np.concatenate(test_batch_preds, axis=0)

In [None]:
# model's performance
# The predicted class ids are stored under their own name and the raw model outputs
# in `preds` are left untouched, so this cell can be re-run without failing on an
# already-reduced array.
test_pred_labels = np.argmax(preds, axis = 1)

print(classification_report(test_y, test_pred_labels))

In [None]:
# Save the model
# NOTE: torch.save(model) pickles the whole module, so loading it requires the
# BERT_Arch class to be defined; only load such files from sources you trust.
full_model_path = './bert_trained_3_32batch'
torch.save(model, full_model_path)

# Load the model
# map_location remaps the stored tensors onto the current device, so a model saved
# on a GPU can also be restored on a CPU-only machine.
# NOTE(review): PyTorch >= 2.6 defaults torch.load to weights_only=True, which rejects
# whole-module pickles; pass weights_only=False there if this load fails.
model = torch.load(full_model_path, map_location=device)

In [None]:
# Sanity check: run the reloaded model on a stratified sample taken from the first
# 60 rows and print, for each batch, its first decoded sentence and predicted class.
# NOTE: this cell reassigns the train_* names (and batch_size) used by the real training run.
pepe_df = df.copy()[:60]

train_text, temp_text, train_labels, temp_labels = train_test_split(pepe_df['text'], pepe_df['sentiment'],
                                                                    random_state=2018,
                                                                    test_size=0.3,
                                                                    stratify=pepe_df['sentiment'])

# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length = 25,
    # padding='max_length' replaces the deprecated pad_to_max_length=True
    padding='max_length',
    truncation=True
)

print(tokens_train['attention_mask'])

train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

#define a batch size
batch_size = 32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data
train_sampler = RandomSampler(train_data)

train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# inference only: make sure dropout is switched off
model.eval()

for step, batch in enumerate(train_dataloader):
    # push the batch to the selected device
    batch = [t.to(device) for t in batch]

    sent_id, mask, labels = batch
    with torch.no_grad():
        # only the first sentence of each batch is decoded and reported
        decoded_sent = tokenizer.decode(sent_id[0], skip_special_tokens=True)
        print(decoded_sent)

        # model predictions
        preds = model(sent_id, mask)

        # normalize the model outputs into per-class probabilities
        prob = torch.softmax(preds, dim=1)[0]

        if prob[0] > prob[1]:
            print("Negative")
        else:
            print("Positive")

        preds = preds.detach().cpu().numpy()

In [None]:
def predict_offensive(text, model):
    """Classify a single piece of text as offensive or non-offensive.

    Uses the notebook-level ``tokenizer`` and ``device``. The model is moved to
    ``device`` and switched to evaluation mode before the forward pass.

    Args:
        text: raw string to classify (truncated to 25 tokens).
        model: module called as ``model(sent_id, mask)`` that returns one score
            per class for each input row; class 0 is reported as "Offensive"
            and class 1 as "Non-Offensive".

    Returns:
        dict: single-entry mapping ``{decoded_text: label}`` where
        ``decoded_text`` is the tokenizer's decoding of the encoded input
        (special tokens skipped) and ``label`` is "Offensive" or "Non-Offensive".
    """
    tokenized_text = tokenizer.batch_encode_plus([text], max_length=25, truncation=True)

    sent_id = torch.tensor(tokenized_text['input_ids'])
    mask = torch.tensor(tokenized_text['attention_mask'])

    # text as seen by the model after tokenization (used as the result key)
    initial_text = tokenizer.decode(sent_id[0], skip_special_tokens=True)

    # make sure model and inputs live on the same device
    model = model.to(device)
    mask = mask.to(device)
    sent_id = sent_id.to(device)

    # inference only: disable dropout and gradient tracking
    model.eval()
    with torch.no_grad():
        pred = model(sent_id, mask)
        # softmax yields per-class probabilities whether the model emits logits or log-probabilities
        prob = torch.softmax(pred, dim=1)[0]

    sentiment = "Offensive" if prob[0] > prob[1] else "Non-Offensive"

    return {initial_text: sentiment}

In [None]:
# Author's note: the sentence "I kill you" is predicted as non-offensive, which is wrong. This may be
# because the BERT model was trained on longer sequences, so shorter sequences may not provide enough
# context. Although the pad_to_max_length=True parameter was tried, the sentence is still too short and
# the prediction is still wrong.
# NOTE(review): predict_offensive as defined in this notebook does not pass any padding argument.
predict_offensive("I kill you", model)

In [None]:
# Second spot check on a short sentence.
predict_offensive("I don't like you", model)