In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
# Define the self-attention layer: the complete multi-head attention module.
# Note: whatever is laid out vertically in the theory is laid out horizontally in this implementation.

In [3]:
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is unpacked as ``(b, t, k)``. It is projected to queries, keys
    and values of width ``k``; each projection is cut into ``heads`` chunks of
    width ``k // heads``; attention is computed independently per chunk; the
    per-head results are concatenated back to width ``k`` and passed through
    a final linear layer.

    Args:
        k: Embedding width of the input and output. Must be divisible by ``heads``.
        heads: Number of attention heads.
        device: Device the sub-layers are moved to; inputs are moved there too.
    """

    def __init__(self, k, heads, device):
        super().__init__()
        assert k % heads == 0
        self.k = k
        self.heads = heads
        self.device = device

        # Bias-free input projections (w_k, w_q, w_v), created in this order.
        self.toKeys = nn.Linear(k, k, bias=False).to(device)
        self.toQueries = nn.Linear(k, k, bias=False).to(device)
        self.toValues = nn.Linear(k, k, bias=False).to(device)
        # Mixes the concatenated head outputs back into a single width-k vector.
        self.unifyHeads = nn.Linear(k, k).to(device)

    def _split_heads(self, projected, batch, steps, head_dim):
        """Reshape (batch, steps, k) into (batch * heads, steps, head_dim)."""
        per_head = projected.view(batch, steps, self.heads, head_dim)
        per_head = per_head.transpose(1, 2).contiguous()
        return per_head.view(batch * self.heads, steps, head_dim)

    def forward(self, x):
        """Apply self-attention to ``x`` and return a tensor of the same shape."""
        # Inputs are always moved to the device this layer was built for.
        x = x.to(self.device)

        batch, steps, _ = x.size()
        n_heads = self.heads
        head_dim = self.k // n_heads

        queries = self._split_heads(self.toQueries(x), batch, steps, head_dim)
        keys = self._split_heads(self.toKeys(x), batch, steps, head_dim)
        values = self._split_heads(self.toValues(x), batch, steps, head_dim)

        # Dot-product scores, scaled by sqrt(head_dim), normalised over the key axis.
        scores = torch.bmm(queries, keys.transpose(1, 2)) / head_dim ** (1 / 2)
        attention = F.softmax(scores, dim=2)

        # Weighted sum of values, then fold the heads back into the feature axis.
        mixed = torch.bmm(attention, values).view(batch, n_heads, steps, head_dim)
        merged = mixed.transpose(1, 2).contiguous().view(batch, steps, n_heads * head_dim)
        return self.unifyHeads(merged)

In [4]:
class TransformerBlock(nn.Module):
    """One transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is wrapped in a residual connection and
    followed by layer normalisation (post-norm arrangement).

    Args:
        k: Embedding width.
        heads: Number of attention heads handed to ``SelfAttention``.
        device: Device every sub-module is moved to.
    """

    def __init__(self, k, heads, device):
        super().__init__()

        self.attention = SelfAttention(k, heads=heads, device=device).to(device)

        self.norm1 = nn.LayerNorm(k).to(device)
        self.norm2 = nn.LayerNorm(k).to(device)

        # Position-wise MLP: widen to 4k, apply ReLU, project back to k.
        hidden = 4 * k
        widen = nn.Linear(k, hidden).to(device)
        narrow = nn.Linear(hidden, k).to(device)
        self.ff = nn.Sequential(widen, nn.ReLU(), narrow).to(device)

    def forward(self, x):
        """Run the block on ``x`` and return a tensor of the same shape."""
        # Use the attention layer's device as the single source of truth.
        x = x.to(self.attention.device)

        # Residual + norm around attention.
        normed = self.norm1(self.attention(x) + x)

        # Residual + norm around the feed-forward network.
        return self.norm2(self.ff(normed) + normed)

In [5]:
class CTransformer(nn.Module):
    """Transformer encoder for sequence classification.

    Token ids are embedded, learned position embeddings are added, the result
    passes through ``depth`` transformer blocks, is averaged over the sequence
    axis and projected to ``num_classes`` log-probabilities.

    Args:
        k: Embedding width.
        heads: Attention heads per block.
        depth: Number of stacked ``TransformerBlock`` layers.
        seq_length: Number of rows in the position-embedding table.
        num_tokens: Vocabulary size (rows in the token-embedding table).
        num_classes: Number of output classes.
        device: Device all sub-modules and inputs are moved to.
    """

    def __init__(self, k, heads, depth, seq_length, num_tokens, num_classes,device):
        super().__init__()
        self.device = device
        self.num_tokens = num_tokens

        # Lookup tables: token id -> vector, and position index -> vector.
        self.token_emb = nn.Embedding(num_tokens, k).to(device)
        self.pos_emb = nn.Embedding(seq_length, k).to(device)

        # Stack of `depth` identical-shaped transformer blocks.
        self.tblocks = nn.Sequential(
            *[TransformerBlock(k, heads, device) for _ in range(depth)]
        ).to(device)

        # Final projection from the pooled representation to class scores.
        self.toProbs = nn.Linear(k, num_classes).to(device)

    def forward(self, x):
        """Map a batch of token ids to per-class log-probabilities, shape (b, num_classes)."""
        x = x.to(self.device)

        tokens = self.token_emb(x)
        batch, steps, width = tokens.size()

        # Position indices 0..steps-1, embedded and broadcast over the batch.
        pos_index = torch.arange(steps, device=self.device)
        positions = self.pos_emb(pos_index).unsqueeze(0).expand(batch, steps, width)

        hidden = self.tblocks(tokens + positions)

        # Mean-pool over the sequence axis, then classify.
        pooled = hidden.mean(dim=1)
        logits = self.toProbs(pooled)
        return F.log_softmax(logits, dim=1)

In [6]:
# Choose the compute backend: prefer CUDA, then Apple MPS, and fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [7]:
from transformers import AutoTokenizer
# Load the pretrained "bert-base-uncased" tokenizer; handle_data() uses this
# module-level object to convert raw text into token ids.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Vocabulary size reported by the tokenizer; passed as num_tokens when the
# model is constructed further down.
vocab_size = tokenizer.vocab_size

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from torch.utils.data import DataLoader, TensorDataset

In [9]:
import os
import torch
import torch.nn.functional as F
import pandas as pd

def handle_data(file_path,batch_size=10,seq_length=512):
    """Load a CSV of labelled comments and turn it into model-ready tensors.

    Steps:
      1. Read ``file_path`` and keep rows whose ``example_very_unclear`` is False.
      2. Drop trailing rows so the row count is a multiple of ``batch_size``.
      3. Shuffle the rows.
      4. Convert the one-hot label columns (every column from index 9 onward)
         into class indices via argmax.
      5. Tokenize the ``text`` column with the module-level ``tokenizer``,
         padding/truncating every sample to exactly ``seq_length`` tokens.

    Relies on the module-level ``tokenizer`` and ``device``.

    Args:
        file_path: Path to the CSV file.
        batch_size: Row count is trimmed down to a multiple of this value.
        seq_length: Fixed token length of every returned sequence.

    Returns:
        ``(comments, indices)``: token ids of shape ``(N, seq_length)`` and
        class indices of shape ``(N,)``, both on ``device``.

    Raises:
        ValueError: If no rows remain after filtering and trimming.
    """
    # Step 1: read the data and filter out rows flagged as very unclear.
    # (Element-wise comparison on purpose: `is False` would not work on a Series.)
    df = pd.read_csv(file_path)
    df = df[df['example_very_unclear'] == False]

    # Step 2: trim the tail in one slice instead of dropping a row per iteration.
    usable_rows = len(df) - (len(df) % batch_size)
    df = df.iloc[:usable_rows]
    if len(df) == 0:
        raise ValueError(
            f"No usable rows in {file_path!r} after filtering and trimming "
            f"to a multiple of batch_size={batch_size}"
        )

    # Step 3: shuffle.
    df = df.sample(frac=1)

    texts = df['text'].tolist()

    # Step 4: one-hot labels -> class indices.
    # If a row has several 1s, argmax keeps one of them only (the task is
    # treated as single-label here).
    emotion_columns = df.columns[9:]
    labels = torch.tensor(df[emotion_columns].values, dtype=torch.float32, device=device)
    indices = labels.argmax(dim=1)

    # Step 5: tokenize. padding="max_length" guarantees a (N, seq_length)
    # result. padding=True only pads to the longest sample, which made the
    # former `.view(total_size, seq_length)` fail whenever every text was
    # shorter than seq_length.
    text_tensors = tokenizer(
        texts,
        return_tensors="pt",  # return PyTorch tensors
        padding="max_length",
        truncation=True,
        max_length=seq_length
    )
    comments = text_tensors['input_ids']
    return (comments.to(device), indices.to(device))

def train(model, epoch_completed, x_train, x_labels, epochs=20, learning_rate=0.0001, lr_warmup=10000, save_path="large2_{}.pth", validation_ratio=0.1,batch_size=64):
    """Train ``model`` with Adam and linear learning-rate warm-up, validating each epoch.

    The first ``validation_ratio`` fraction of the samples is held out for
    validation; the remainder is used for training. After every epoch the
    summed training loss and the validation accuracy are printed and the
    model's ``state_dict`` is written to ``save_path.format(epoch)``.

    Args:
        model: Module returning per-class log-probabilities of shape (batch, classes).
        epoch_completed: Index of the first epoch to run (for resuming). The
            optimizer and warm-up schedule are always created fresh.
        x_train: Input samples, indexable along dimension 0.
        x_labels: Integer class label per sample.
        epochs: Exclusive upper bound of the epoch index.
        learning_rate: Peak Adam learning rate reached after warm-up.
        lr_warmup: Warm-up length measured in samples; the learning rate ramps
            linearly from 0 over ``lr_warmup / batch_size`` optimizer steps.
            Values <= 0 disable warm-up.
        save_path: Checkpoint path template; formatted with the epoch index.
        validation_ratio: Fraction of samples held out for validation.
        batch_size: Mini-batch size for both loaders.
    """
    model.train(True)
    opt = torch.optim.Adam(lr=learning_rate, params=model.parameters())

    # The scheduler is stepped once per batch, so convert the warm-up length
    # from samples to steps using the batch size. The previous code divided by
    # the number of samples, which collapsed warm-up to about one step.
    warmup_steps = lr_warmup / batch_size

    def warmup_factor(step):
        if warmup_steps <= 0:
            return 1.0
        return min(step / warmup_steps, 1.0)

    sch = torch.optim.lr_scheduler.LambdaLR(opt, warmup_factor)

    # as_tensor keeps dtype and device of tensors that are passed in; the
    # legacy torch.Tensor(...) constructor is float-biased for other inputs.
    x_train = torch.as_tensor(x_train)
    x_labels = torch.as_tensor(x_labels).long()

    # Split the data into validation (leading slice) and training sets.
    val_count = int(validation_ratio * x_train.shape[0])
    x_val, y_val = x_train[:val_count], x_labels[:val_count]
    x_train, x_labels = x_train[val_count:], x_labels[val_count:]

    train_loader = DataLoader(TensorDataset(x_train, x_labels), batch_size=batch_size, shuffle=True)
    # An empty validation split is represented by an empty iterable.
    if val_count > 0:
        val_loader = DataLoader(TensorDataset(x_val, y_val), batch_size=batch_size, shuffle=False)
    else:
        val_loader = []

    for epoch in range(epoch_completed, epochs):
        model.train(True)
        cost = 0

        for batch_reviews, batch_labels in train_loader:
            opt.zero_grad()
            out = model(batch_reviews)
            loss = F.nll_loss(out.view(-1, out.size(-1)), batch_labels.view(-1))
            cost += loss.item()
            loss.backward()
            opt.step()
            sch.step()  # learning-rate schedule advances per batch

        print(f"Epoch {epoch}, Training cost: {cost}")

        # Validation after each epoch.
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for batch_reviews, batch_labels in val_loader:
                out = model(batch_reviews)
                predictions = torch.argmax(out, dim=-1)
                correct += (predictions == batch_labels).sum().item()
                total += batch_labels.numel()

        # Guard against an empty validation split (e.g. validation_ratio=0),
        # which previously raised ZeroDivisionError.
        if total > 0:
            accuracy = correct / total
            print(f"Epoch {epoch}, Validation Accuracy: {accuracy}")
        else:
            print(f"Epoch {epoch}, Validation skipped: validation set is empty")

        # Save the model after each epoch.
        torch.save(model.state_dict(), save_path.format(epoch))

def test(model,x_test,x_labels,batch_size=32):
    model.eval()
    train_dataset = TensorDataset(torch.Tensor(x_test), torch.Tensor(x_labels).long())
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    correct = 0
    with torch.no_grad():
        for batch_reviews,batch_labels in train_loader:
            out = model.forward(batch_reviews)
            for i,sentence in enumerate(out):
                if(torch.argmax(out[i])==batch_labels[i]):
                    correct+=1
    print(f"Accuracy: {(correct/(x_test.shape[0]*x_test.shape[1]))*100}%")

def inference(model, sentence,seq_length):
    """Predict the class index for one sentence (or a list of sentences).

    Args:
        model: Module mapping token ids of shape (batch, seq) to per-class scores.
        sentence: A single string, or a list of strings.
        seq_length: Token length every input is padded/truncated to.

    Returns:
        For a single string, a 0-dim tensor holding the predicted class index;
        for a list of strings, a 1-D tensor with one index per sentence.
    """
    # Same pretrained tokenizer as used for the training data.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    encoded_sentence = tokenizer(
        sentence,
        return_tensors="pt",  # return PyTorch tensors
        # Pad to the fixed length so inference inputs have the same length as
        # the fixed-length sequences handle_data() feeds the model in training.
        padding="max_length",
        truncation=True,
        max_length=seq_length
    )

    # nn.Module.eval() takes no arguments; the former eval(True) raised TypeError.
    model.eval()
    with torch.no_grad():
        # The model expects the token-id tensor, not the whole tokenizer output.
        out = model(encoded_sentence["input_ids"])

    # Arg-max per row so batched input gets one prediction per sentence.
    predictions = out.argmax(dim=-1)
    if isinstance(sentence, str):
        return predictions[0]
    return predictions



In [13]:
# Hyperparameters shared by data preparation and the model definition.
embedding_dim, batch_size, seq_length = 768, 64, 16

# The training data is spread over two CSV files.
file1_path = "full_dataset/goemotions_1.csv"
file2_path = "full_dataset/goemotions_2.csv"

# Load the first file, then the second (same order as before).
comments1, labels1 = handle_data(file1_path, seq_length=seq_length, batch_size=batch_size)
comments2, labels2 = handle_data(file2_path, seq_length=seq_length, batch_size=batch_size)

# Stack both files along the sample dimension (dim 0 is torch.cat's default).
train_comments = torch.cat([comments1, comments2])
train_labels = torch.cat([labels1, labels2])

In [14]:
# Build the classifier: 16 transformer blocks with 12 heads each, 28 output classes.
model = CTransformer(
    k=embedding_dim,
    heads=12,
    depth=16,
    seq_length=seq_length,
    num_tokens=vocab_size,
    num_classes=28,
    device=device,
).to(device)

In [15]:
# To resume from an earlier checkpoint, load it before training, e.g.:
# model.load_state_dict(torch.load("model_epoch_69.pth",map_location=device))

# save_path has no "{}" placeholder, so train()'s save_path.format(epoch)
# returns it unchanged and every epoch overwrites the same checkpoint file.
train(
    model,
    epoch_completed=0,
    x_train=train_comments,
    x_labels=train_labels,
    epochs=20,
    learning_rate=0.00002,
    lr_warmup=500,
    save_path="Goemotion_69.pth",
    validation_ratio=0.1,
    batch_size=batch_size,
)
# Persist the final weights once training has finished.
torch.save(model.state_dict(), "Goemotion_69.pth")

Epoch 0, Training cost: 9624.941017389297
Epoch 0, Validation Accuracy: 0.3752541388324136
Epoch 1, Training cost: 8781.10103213787
Epoch 1, Validation Accuracy: 0.39042985768225386
Epoch 2, Training cost: 8334.506732702255
Epoch 2, Validation Accuracy: 0.39456869009584666


KeyboardInterrupt: 

In [13]:
# Held-out file used for the final evaluation.
test_file_path = "full_dataset/goemotions_3.csv"

# Restore the trained weights onto the active device.
model.load_state_dict(
    torch.load("Goemotion_69.pth", map_location=device)
)

# Prepare the test tensors and report the accuracy.
test_comments, test_labels = handle_data(
    test_file_path, seq_length=seq_length, batch_size=64
)
test(model, test_comments, test_labels)

  model.load_state_dict(torch.load("large3_7.pth",map_location=device))


ValueError: not enough values to unpack (expected 3, got 2)

# Future Scope
1. Validation set.
2. Prompt the user for hyperparameters.
3. Host the model on a web app.