## Sentiment classification

In [24]:
# Starting by importing useful libraries
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import pandas as pd
from sklearn.metrics import accuracy_score

In [2]:
# Make a seeding function for reproducibility
def seeder(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seeder()

In [3]:
torch.cuda.empty_cache()

In [4]:
if torch.cuda.is_available():
    print("GPU available!")
else:
    print("GPU not available")

GPU available!


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
tokeniser = BertTokenizer.from_pretrained('bert-base-uncased')
seeder()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
train_df = pd.read_csv('Train.csv')
val_df = pd.read_csv('Valid.csv')
test_df = pd.read_csv('Test.csv')

In [8]:
train_df

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [9]:
def preprocess(text):
    return tokeniser.encode_plus(
        text,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )

In [10]:
def filter_long_data(df, max_length=256):
    def is_within(text):
        tokens = tokeniser.encode(text, add_special_tokens=True)
        return len(tokens) <= max_length
    
    df_filt = df[df['text'].apply(is_within)]
    return df_filt

In [11]:
# token_lengths = train_df['text'].apply(lambda x: len(tokeniser.encode(x, add_special_tokens=True)))

# plt.hist(token_lengths, bins=50)
# plt.xlabel('Token Lengths')
# plt.ylabel('Number of Samples')
# plt.title('Distribution of Token Lengths')
# plt.show()

In [12]:
train_df = filter_long_data(train_df)
val_df = filter_long_data(val_df)
test_df = filter_long_data(test_df)

Token indices sequence length is longer than the specified maximum sequence length for this model (1384 > 512). Running this sequence through the model will result in indexing errors


In [13]:
class SentimentDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df.iloc[idx]['text']
        label = self.df.iloc[idx]['label']
        encoded = preprocess(text)
        input_ids = encoded['input_ids'].squeeze()
        attention_mask = encoded['attention_mask'].squeeze()
        label = torch.tensor(label, dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label
        }

In [14]:
seeder()
train_dataset = SentimentDataset(train_df)
val_dataset = SentimentDataset(val_df)
test_dataset = SentimentDataset(test_df)

In [15]:
print(train_dataset[0])

{'input_ids': tensor([  101,  1045,  3473,  2039,  1006,  1038,  1012,  3551,  1007,  3666,
         1998,  8295,  1996,  8505, 12887,  1012,  2035,  2026, 14711,  2012,
         2082,  3427,  1012,  2057,  2209,  1000,  8505, 12887,  1000,  2077,
         2082,  1010,  2076,  6265,  1998,  2044,  2082,  1012,  2057,  2035,
         2359,  2000,  2022, 17270,  2030,  3660,  1012,  2053,  2028,  2359,
         2000,  2022,  5070,  1012, 10320,  2091,  2013,  1019,  2150,  2019,
         2396,  2433,  1012,  1045,  2165,  2026,  2336,  2000,  2156,  1996,
         3185,  5327,  2027,  2052,  2131,  1037, 12185,  1997,  2054,  1045,
         3866,  2004,  1037,  2775,  1012,  2129, 19248, 15640,  1012,  1996,
         2069,  2152,  2391,  2001,  1996, 10245,  7685,  4323,  8694,  1012,
         2025,  2008,  2009,  2071, 12826,  2007,  1996,  2434,  3556,  1997,
         1996,  8505, 12887,  1012, 16047,  2220,  5095, 16956,  2028,  2547,
         3149,  2145,  3248,  2128, 15532,  2015, 

In [16]:
seeder()
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [17]:
def train(n_epochs, optimizer, model, train_loader, val_loader, device):

    training_losses = []
    validation_losses = []

    # Set to largest value
    best_val_loss = float('inf')

    # For early stopping
    patience_counter = 0
    patience = 2

    for epoch in range(n_epochs):
        model.train()
        total_loss = 0

        # Compute training loss and update
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Zero gradients
            optimizer.zero_grad()
            # Backward pass and gradient descent
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        training_losses.append(avg_loss)

        # Compute validation loss
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                # Forward pass
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        validation_losses.append(avg_val_loss)


        print(f'Epoch {epoch+1}/{n_epochs}, Training Loss: {avg_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss

            # Save best model state
            best_model = model.state_dict()
            patience_counter = 0 
        else:
            # Increment counter
            patience_counter += 1

        # Early stopping condition
        if patience_counter > patience:
            print(f"Stopping early at epoch {epoch+1} due to increasing validation loss.")
            break

    print('Training complete!')
    
    # Restore best model to the given model
    model.load_state_dict(best_model)
    return training_losses, validation_losses

In [18]:
opt = AdamW(model.parameters(), lr=0.001)
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [19]:
seeder()
n_epochs = 5
tl, vl = train(n_epochs, opt, model, train_loader, val_loader, device)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1/5, Training Loss: 0.7234, Validation Loss: 0.6956
Epoch 2/5, Training Loss: 0.7220, Validation Loss: 0.6933
Epoch 3/5, Training Loss: 0.7116, Validation Loss: 0.7114
Epoch 4/5, Training Loss: 0.7115, Validation Loss: 0.6931
Epoch 5/5, Training Loss: 0.7038, Validation Loss: 0.6963
Training complete!


Let's evaluate the model:

In [27]:
def evaluate_model(model, dataloader, device):
    model.eval()
    true_labels = []
    predictions = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
        
    true_labels = np.array(true_labels)
    predictions = np.array(predictions)

    f1 = accuracy_score(true_labels, predictions)

    print(f'Accuracy: {f1:.4f}')

Evaluating validation data:

In [28]:
evaluate_model(model, val_loader, device)

Accuracy: 0.4960


Evaluating test data:

In [29]:
evaluate_model(model, test_loader, device)

Accuracy: 0.4838
