### BERT

In [53]:
from tqdm import tqdm
import json

In [54]:
def load_dataset(split={"train", "test", "dev"}, domain={"rest", "laptop"}):
    with open(f'asc/{domain}/{split}.json', 'r') as file:
        dataset = json.load(file)
    return list(dataset.values())

In [55]:
rest_train = load_dataset(split="train", domain="rest") + load_dataset(split="dev", domain = "rest")
rest_test = load_dataset(split="test", domain="rest")
laptop_train = load_dataset(split="train", domain="laptop") + load_dataset(split="dev", domain = "laptop")
laptop_test = load_dataset(split="test", domain="laptop")

In [56]:
clean_dataset(rest_train)
clean_dataset(rest_test)
clean_dataset(laptop_train)
clean_dataset(laptop_test)

Processing Text: 100%|██████████████████████████████████████████████████████████| 3602/3602 [00:00<00:00, 119245.15it/s]
Processing Text: 100%|██████████████████████████████████████████████████████████| 1120/1120 [00:00<00:00, 127819.45it/s]
Processing Text: 100%|██████████████████████████████████████████████████████████| 2313/2313 [00:00<00:00, 111792.04it/s]
Processing Text: 100%|████████████████████████████████████████████████████████████| 638/638 [00:00<00:00, 127712.78it/s]


In [58]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [59]:
rest_train_sentences = [" ".join(item['sentence']) for item in rest_train]
rest_train_polarities = [item['polarity'] for item in rest_train]

rest_test_sentences = [" ".join(item['sentence']) for item in rest_test]
rest_test_polarities = [item['polarity'] for item in rest_test]


laptop_train_sentences = [" ".join(item['sentence']) for item in laptop_train]
laptop_train_polarities = [item['polarity'] for item in laptop_train]

laptop_test_sentences = [" ".join(item['sentence']) for item in laptop_test]
laptop_test_polarities = [item['polarity'] for item in laptop_test]

To get results with the different subsets (e.g. laptop of restaurant dataset) simply remove the unwanted subset from the cell below/add them.

In [65]:
train_sentences = rest_train_sentences #+ laptop_train_polarities
train_polarities = rest_train_polarities #+ laptop_train_polarities

test_sentences = rest_test_sentences #+ laptop_test_polarities
test_polarities = rest_test_polarities #+ laptop_test_polarities

In [66]:
pos = 0
neg = 0
neu = 0
for polarity in rest_train_polarities:
    if polarity == 'positive':
        pos += 1
    elif polarity == 'negative':
        neg += 1
    elif polarity == 'neutral':
        neu += 1
print(f"Positive : {pos}")
print(f"Negative : {neg}")
print(f"Neutral : {neu}")

Positive : 2164
Negative : 805
Neutral : 633


In [67]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_polarities)
y_test = label_encoder.transform(test_polarities)

In [69]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_sentences(sentences, max_length=512):
    input_ids = []
    attention_masks = []

    for sentence in sentences:
        encoded_dict = tokenizer.encode_plus(
                            sentence,                      # Sentence to encode
                            add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
                            max_length=max_length,         # Pad & truncate all sentences
                            padding='max_length',          # Pad all sentences to max length
                            truncation=True,               # Explicitly truncate to max length
                            return_attention_mask=True,    # Construct attention masks
                            return_tensors='pt',           # Return pytorch tensors
                        )
        
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Encode the sentences (X_train and X_test)
train_inputs, train_masks = encode_sentences(train_sentences)
test_inputs, test_masks = encode_sentences(test_sentences)

In [70]:
train_labels = torch.tensor(y_train)
test_labels = torch.tensor(y_test)

In [71]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 16  # Adjust this according to your GPU capacity

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our test set
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [77]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab
    num_labels=3,        # Number of output labels (3 for positive/negative/neutral)
    output_attentions=False,
    output_hidden_states=False,
)

# Tell the model to run on GPU
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [78]:
from transformers import get_linear_schedule_with_warmup
import torch
# Implement the training loop
epochs = 5
# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Total number of training steps
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

In [79]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix
import numpy as np
lowest_loss = float('inf')
for epoch in range(0, epochs):
    # Training step
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(train_dataloader, desc="Epoch {:1d}".format(epoch+1), leave=False, disable=False)
    for batch in progress_bar:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear previously calculated gradients
        model.zero_grad()        

        # Perform a forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        # Perform a backward pass
        loss.backward()

        # Update parameters and take a step using the computed gradient
        optimizer.step()
        scheduler.step()
        # Update the progress bar
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
    
    # Calculate the average loss over the training data.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    print(f"  Average training loss: {avg_train_loss:.2f}")
    model.eval()

    # Initialize variables to gather predictions and true labels
    all_predictions = []
    all_true_labels = []

    with torch.no_grad():
        total_eval_loss = 0
        for batch in tqdm(test_dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

            loss = outputs.loss
            total_eval_loss += loss.item()

            logits = outputs.logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Store predictions and true labels
            all_predictions.extend(np.argmax(logits, axis=1).flatten())
            all_true_labels.extend(label_ids.flatten())
    precision, recall, f1_score, _ = precision_recall_fscore_support(all_true_labels, all_predictions, average='weighted')
    accuracy = accuracy_score(all_true_labels, all_predictions)
    conf_matrix = confusion_matrix(all_true_labels, all_predictions)
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-Score: {f1_score:.4f}')
    print('Confusion Matrix:\n', conf_matrix)
    
    if avg_train_loss < lowest_loss:
        print(f"  Loss decreased from {lowest_loss:.2f} to {avg_train_loss:.2f}, saving model.")
        lowest_loss = avg_train_loss
        best_model_state = model.state_dict()
#torch.save(best_model_state, 'best_BERT_model.bin')

                                                                                                                        

  Average training loss: 0.95


100%|███████████████████████████████████████████████████████████████████████████████████| 70/70 [00:08<00:00,  8.23it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6500
Precision: 0.4225
Recall: 0.6500
F1-Score: 0.5121
Confusion Matrix:
 [[  0   0 196]
 [  0   0 196]
 [  0   0 728]]
  Loss decreased from inf to 0.95, saving model.


                                                                                                                        

  Average training loss: 0.95


100%|███████████████████████████████████████████████████████████████████████████████████| 70/70 [00:08<00:00,  8.26it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6500
Precision: 0.4225
Recall: 0.6500
F1-Score: 0.5121
Confusion Matrix:
 [[  0   0 196]
 [  0   0 196]
 [  0   0 728]]
  Loss decreased from 0.95 to 0.95, saving model.


                                                                                                                        

  Average training loss: 0.95


100%|███████████████████████████████████████████████████████████████████████████████████| 70/70 [00:08<00:00,  8.25it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6500
Precision: 0.4225
Recall: 0.6500
F1-Score: 0.5121
Confusion Matrix:
 [[  0   0 196]
 [  0   0 196]
 [  0   0 728]]
  Loss decreased from 0.95 to 0.95, saving model.


                                                                                                                        

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix
import numpy as np
model.eval()

# Initialize variables to gather predictions and true labels
all_predictions = []
all_true_labels = []

with torch.no_grad():
    total_eval_loss = 0
    for batch in tqdm(test_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    
        loss = outputs.loss
        total_eval_loss += loss.item()
        
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Store predictions and true labels
        all_predictions.extend(np.argmax(logits, axis=1).flatten())
        all_true_labels.extend(label_ids.flatten())
precision, recall, f1_score, _ = precision_recall_fscore_support(all_true_labels, all_predictions, average='weighted')
accuracy = accuracy_score(all_true_labels, all_predictions)
conf_matrix = confusion_matrix(all_true_labels, all_predictions)
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1_score:.4f}')
print('Confusion Matrix:\n', conf_matrix)