In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler
import torch

In [67]:
data = pd.read_csv('data.csv', header = None)
data.columns=['Sentiment', 'Review']
data['Sentiment'] = data['Sentiment'].replace(-1, 0)


# Train-Test Split

In [81]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Load the pre-trained BERT model and tokenizer

In [17]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define the input format for BERT

In [75]:
def encode_data(data):
    input_ids = []
    attention_masks = []
    labels = []
    for review, label in zip(data['Review'], data['Sentiment']):
        # Encode the review and add the special tokens
        encoded = tokenizer.encode_plus(review, add_special_tokens=True, max_length=512, truncation=True, padding='max_length', return_attention_mask=True, return_tensors='pt')
        # Add the encoded review, attention mask, and label to the lists
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append(label)
    # Convert the lists to tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    # Return tensors
    return input_ids, attention_masks, labels


# Define the hyperparameters

In [23]:
batch_size = 32
num_epochs = 4
learning_rate = 2e-5

# Define the data loaders

In [None]:
train_inputs, train_masks, train_labels = encode_data(train_data)

In [59]:
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=2)

In [76]:
test_inputs, test_masks, test_labels = encode_data(test_data)

In [77]:
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=4)

# Define the optimizer

In [24]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Move the model to the device (CPU or GPU)

In [82]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cpu


In [57]:
for epoch in range(num_epochs):
    # Set the model to training mode
    model.train()
    # Initialize the training loss and accuracy
    train_loss = 0.0
    train_acc = 0.0
    # Loop over the training batches
    for batch in train_dataloader:
        print("Hello")
        # Move the batch to the device
        for i in range(len(batch)):
            batch[i] = batch[i].to(device)
        # Get the input ids, attention masks, and labels
        input_ids = batch[0]
        attention_mask = batch[1]
        labels = batch[2]
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # Get the loss and logits
        loss = outputs[0]
        print(f'loss', loss)
        logits = outputs[1]
        print(f'logits', logits)
        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Update the training loss and accuracy
        train_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        train_acc += accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())
    # Compute the average training loss and accuracy
    train_loss = train_loss / len(train_dataloader)
    train_acc = train_acc / len(train_dataloader)
    print(f'Batch train_loss', train_loss)
    print(f'Batch test_loss', train_acc)
    # Print the training results
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}')


Hello
loss tensor(0.1804, grad_fn=<NllLossBackward0>)
logits tensor([[-0.7116,  0.8834],
        [-0.8951,  0.7527]], grad_fn=<AddmmBackward0>)
Hello
loss tensor(0.1925, grad_fn=<NllLossBackward0>)
logits tensor([[-1.1550,  0.9251],
        [ 0.1786, -1.0041]], grad_fn=<AddmmBackward0>)
Hello
loss tensor(0.2919, grad_fn=<NllLossBackward0>)
logits tensor([[ 0.7305, -0.6198],
        [ 0.1988, -0.6599]], grad_fn=<AddmmBackward0>)
Hello
loss tensor(0.1863, grad_fn=<NllLossBackward0>)
logits tensor([[-1.2014,  0.6678],
        [ 0.5991, -0.7577]], grad_fn=<AddmmBackward0>)
Hello
loss tensor(0.2466, grad_fn=<NllLossBackward0>)
logits tensor([[-0.9266,  0.5439],
        [-0.6145,  0.4892]], grad_fn=<AddmmBackward0>)
Hello
loss tensor(0.2982, grad_fn=<NllLossBackward0>)
logits tensor([[ 0.1461, -0.5785],
        [-0.9429,  0.5575]], grad_fn=<AddmmBackward0>)
Hello
loss tensor(0.2692, grad_fn=<NllLossBackward0>)
logits tensor([[ 0.2746, -0.9984],
        [-0.7728,  0.3104]], grad_fn=<AddmmBack

# Model Evaluation

In [80]:
model.eval()
# Initialize the test loss and accuracy
test_loss = 0.0
test_acc = 0.0
# Loop over the test batches
for batch in val_dataloader:
    # Move the batch to the device
    for i in range(len(batch)):
            batch[i] = batch[i].to(device)
    # Get the input ids, attention masks, and labels
    input_ids = batch[0]
    attention_mask = batch[1]
    labels = batch[2]
    # Forward pass with no gradient
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    # Get the loss and logits
    loss = outputs[0]
    logits = outputs[1]
    # Update the test loss and accuracy
    test_loss += loss.item()
    preds = torch.argmax(logits, dim=1)
    test_acc += accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())
# Compute the average test loss and accuracy
test_loss = test_loss / len(val_dataloader)
test_acc = test_acc / len(val_dataloader)
# Print the test results
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')

Test Loss: 0.3109, Test Accuracy: 0.8500
