In [1]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.2 MB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [2]:
import numpy as np
import pandas as pd
import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel


In [3]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec

## Layer Norm

In [4]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

## Gelu Activation

In [5]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))

## FFN

In [6]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), ## Expansion
            GELU(), ## Activation
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), ## Contraction
        )

    def forward(self, x):
        return self.layers(x)

## Transformer block

In [7]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        # 2*4*768
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x

## GPT Model

In [8]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

# Load pretrained model weights

In [9]:
import numpy as np
import torch
from transformers import GPT2LMHeadModel
foundational_model = GPT2LMHeadModel.from_pretrained("gpt2")
state_dict = foundational_model.state_dict()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))
def load_weights(n_layers,gpt2):
    gpt2.tok_emb.weight = assign(gpt2.tok_emb.weight, (state_dict['transformer.wte.weight']))
    gpt2.pos_emb.weight = assign(gpt2.pos_emb.weight, (state_dict['transformer.wpe.weight']))
    for i in range(n_layers):
        gpt2.trf_blocks[i].norm1.scale = assign(gpt2.trf_blocks[i].norm1.scale,(state_dict[f'transformer.h.{i}.ln_1.weight']))
        gpt2.trf_blocks[i].norm1.shift = assign(gpt2.trf_blocks[i].norm1.shift,(state_dict[f'transformer.h.{i}.ln_1.bias']))

        q_w, k_w, v_w = np.split(state_dict[f'transformer.h.{i}.attn.c_attn.weight'],3,axis=-1)
        q_b, k_b, v_b = np.split(state_dict[f'transformer.h.{i}.attn.c_attn.bias'],3,axis=-1)
        gpt2.trf_blocks[i].att.W_query.weight = assign(gpt2.trf_blocks[i].att.W_query.weight,(q_w.T))
        gpt2.trf_blocks[i].att.W_key.weight = assign(gpt2.trf_blocks[i].att.W_key.weight, (k_w.T))
        gpt2.trf_blocks[i].att.W_value.weight = assign(gpt2.trf_blocks[i].att.W_value.weight, (v_w.T))
        gpt2.trf_blocks[i].att.W_query.bias = assign(gpt2.trf_blocks[i].att.W_query.bias, (q_b))
        gpt2.trf_blocks[i].att.W_key.bias = assign(gpt2.trf_blocks[i].att.W_key.bias, (k_b))
        gpt2.trf_blocks[i].att.W_value.bias = assign(gpt2.trf_blocks[i].att.W_value.bias, (v_b))

        gpt2.trf_blocks[i].att.out_proj.weight = assign(gpt2.trf_blocks[i].att.out_proj.weight, (state_dict[f'transformer.h.{i}.attn.c_proj.weight']).T)
        gpt2.trf_blocks[i].att.out_proj.bias = assign(gpt2.trf_blocks[i].att.out_proj.bias, (state_dict[f'transformer.h.{i}.attn.c_proj.bias']))

        gpt2.trf_blocks[i].norm2.scale = assign(gpt2.trf_blocks[i].norm2.scale,(state_dict[f'transformer.h.{i}.ln_2.weight']))
        gpt2.trf_blocks[i].norm2.bias = assign(gpt2.trf_blocks[i].norm2.scale,(state_dict[f'transformer.h.{i}.ln_2.bias']))

        gpt2.trf_blocks[i].ff.layers[0].weight = assign(gpt2.trf_blocks[i].ff.layers[0].weight,(state_dict[f'transformer.h.{i}.mlp.c_fc.weight'].T))
        gpt2.trf_blocks[i].ff.layers[0].bias = assign(gpt2.trf_blocks[i].ff.layers[0].bias,(state_dict[f'transformer.h.{i}.mlp.c_fc.bias']))
        gpt2.trf_blocks[i].ff.layers[2].weight = assign(gpt2.trf_blocks[i].ff.layers[2].weight,(state_dict[f'transformer.h.{i}.mlp.c_proj.weight'].T))
        gpt2.trf_blocks[i].ff.layers[2].bias = assign(gpt2.trf_blocks[i].ff.layers[2].bias,(state_dict[f'transformer.h.{i}.mlp.c_proj.bias']))

    gpt2.final_norm.scale = assign(gpt2.final_norm.scale,(state_dict['transformer.ln_f.weight']))
    gpt2.final_norm.shift = assign(gpt2.final_norm.shift,(state_dict['transformer.ln_f.bias']))
    gpt2.out_head.weight = assign(gpt2.out_head.weight, (state_dict['lm_head.weight']))

In [115]:
BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
BASE_CONFIG.update(model_configs["gpt2-small (124M)"])
model = GPTModel(BASE_CONFIG)
load_weights(BASE_CONFIG["n_layers"],model)

  return torch.nn.Parameter(torch.tensor(right))


In [116]:
import pandas as pd
raw_data = pd.read_csv("Sentiment_data.csv",encoding='latin-1')

In [117]:
raw_data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neutral,11118
positive,8582
negative,7781


In [118]:
raw_data

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26
...,...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,night,31-45,Ghana,31072940,227540.0,137
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,morning,46-60,Greece,10423054,128900.0,81
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,noon,60-70,Grenada,112523,340.0,331
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,night,70-100,Guatemala,17915568,107160.0,167


In [119]:
raw_data = raw_data[['text','sentiment']]
raw_data.rename(columns={"text":"Text","sentiment":"Label"},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_data.rename(columns={"text":"Text","sentiment":"Label"},inplace=True)


In [120]:
balanced_df = (
    raw_data.groupby('Label', group_keys=False)
      .apply(lambda x: x.sample(n=500, random_state=42))
      .reset_index(drop=True)
)

  .apply(lambda x: x.sample(n=500, random_state=42))


In [121]:
balanced_df['Label'] = balanced_df['Label'].map({'negative': 0, 'neutral': 1,"positive":2})

In [122]:
balanced_df

Unnamed: 0,Text,Label
0,says BAD TRIP! (angry) http://plurk.com/p/wxshi,0
1,that sounds foreboding...,0
2,_kat I`ve begged my mum to lt me get them out...,0
3,"is really, really bored... I guess I will go t...",0
4,I just stuck my finger down my throat and ther...,0
...,...,...
1495,Happy Star Wars Day!!!,2
1496,Has just finished uploading my latest chap Fi...,2
1497,Cool...a VERY productive day! I just got a s...,2
1498,Happy Mothers day to all you MI.. mothers out ...,2


In [123]:
def random_split(df, train_frac, validation_frac):
    # Shuffle the entire DataFrame
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Calculate split indices
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    # Split the DataFrame
    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df

In [124]:
train_df, validation_df, test_df = random_split(balanced_df, 0.8, 0.1)

In [125]:
class SentenceDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=None, pad_token_id=50256):
        self.data = df

        # Pre-tokenize texts
        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data["Text"]
        ]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length

            # Truncate sequences if they are longer than max_length
            self.encoded_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]

        # Pad sequences to the longest sequence
        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["Label"]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self):
        return len(self.data)

    def _longest_encoded_length(self):
        max_length = 0
        for encoded_text in self.encoded_texts:
            encoded_length = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length

In [126]:
tokenizer = tiktoken.get_encoding("gpt2")
train_dataset = SentenceDataset(
    df=train_df,
    max_length=None,
    tokenizer=tokenizer
)
validation_dataset = SentenceDataset(
    df=validation_df,
    max_length=None,
    tokenizer=tokenizer
)
test_dataset = SentenceDataset(
    df=test_df,
    max_length=None,
    tokenizer=tokenizer
)

In [127]:
num_workers = 0
batch_size = 4

torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

val_loader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [128]:
# freeze all layer weights
for param in model.parameters():
    param.requires_grad = False

In [129]:
# for classification task
num_classes = 3
model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes)

In [130]:
# Training only the last half of transformer layers, layernorm and output layer
for param in model.trf_blocks[-(BASE_CONFIG["n_layers"]//2):].parameters():
    param.requires_grad = True

for param in model.final_norm.parameters():
    param.requires_grad = True

In [131]:
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)

            with torch.no_grad():
                logits = model(input_batch)[:, -1, :]  # Logits of last output token
            predicted_labels = torch.argmax(logits, dim=-1)

            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()
        else:
            break
    return correct_predictions / num_examples

In [132]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)[:, -1, :]  # Logits of last output token
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
  total_loss = 0.
  if len(data_loader) == 0:
      return float("nan")
  elif num_batches is None:
      num_batches = len(data_loader)
  else:
      # Reduce the number of batches to match the total number of batches in the data loader
      # if num_batches exceeds the number of batches in the data loader
      num_batches = min(num_batches, len(data_loader))
  for i, (input_batch, target_batch) in enumerate(data_loader):
      if i < num_batches:
          loss = calc_loss_batch(input_batch, target_batch, model, device)
          total_loss += loss.item()
      else:
          break
  return total_loss / num_batches

In [133]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss
def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                            eval_freq, eval_iter):
    # Initialize lists to track losses and examples seen
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss gradients
            examples_seen += input_batch.shape[0] # New: track examples instead of tokens
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Calculate accuracy after each epoch
        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)

    return train_losses, val_losses, train_accs, val_accs, examples_seen

In [134]:
import time

start_time = time.time()

torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
num_epochs = 25
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=5,
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 3.598, Val loss 3.729
Ep 1 (Step 000050): Train loss 1.370, Val loss 1.271
Ep 1 (Step 000100): Train loss 1.282, Val loss 1.168
Ep 1 (Step 000150): Train loss 1.020, Val loss 1.153
Ep 1 (Step 000200): Train loss 1.203, Val loss 1.125
Ep 1 (Step 000250): Train loss 1.169, Val loss 1.147
Training accuracy: 25.00% | Validation accuracy: 35.00%
Ep 2 (Step 000300): Train loss 1.124, Val loss 1.121
Ep 2 (Step 000350): Train loss 1.098, Val loss 1.102
Ep 2 (Step 000400): Train loss 1.140, Val loss 1.100
Ep 2 (Step 000450): Train loss 1.113, Val loss 1.078
Ep 2 (Step 000500): Train loss 1.019, Val loss 1.112
Ep 2 (Step 000550): Train loss 1.045, Val loss 1.101
Training accuracy: 50.00% | Validation accuracy: 50.00%
Ep 3 (Step 000600): Train loss 1.139, Val loss 1.099
Ep 3 (Step 000650): Train loss 1.081, Val loss 1.065
Ep 3 (Step 000700): Train loss 1.084, Val loss 1.069
Ep 3 (Step 000750): Train loss 1.099, Val loss 1.061
Ep 3 (Step 000800): Train loss 1.026, Va

In [135]:
torch.save(model.state_dict(), "sentiment_analysis_gpt-small_model_weights.pth")

In [136]:
train_accuracy = calc_accuracy_loader(train_loader, model, device)
val_accuracy = calc_accuracy_loader(val_loader, model, device)
test_accuracy = calc_accuracy_loader(test_loader, model, device)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 86.00%
Validation accuracy: 73.33%
Test accuracy: 76.00%


In [138]:
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
    model.eval()

    # Prepare inputs to the model
    input_ids = tokenizer.encode(text)
    supported_context_length = model.pos_emb.weight.shape[0]
    # Note: In the book, this was originally written as pos_emb.weight.shape[1] by mistake
    # It didn't break the code but would have caused unnecessary truncation (to 768 instead of 1024)

    # Truncate sequences if they too long
    input_ids = input_ids[:min(max_length, supported_context_length)]

    # Pad sequences to the longest sequence
    input_ids += [pad_token_id] * (max_length - len(input_ids))
    input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0) # add batch dimension

    # Model inference
    with torch.no_grad():
        logits = model(input_tensor)[:, -1, :]  # Logits of the last output token
    predicted_label = torch.argmax(logits, dim=-1).item()

    # Return the classified result
    return predicted_label

In [140]:
def predict_sentiment(predicted_label):
  if predicted_label == 0:
    return "negative"
  elif predicted_label == 1:
    return "neutral"
  else:
    return "positive"

In [141]:
text_1 = (
    "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."
)

predict_sentiment(classify_review(
    text_1, model, tokenizer, device, max_length=train_dataset.max_length
))

'negative'

In [144]:
text_2 = (
    "The product met my expectations and functioned as described."
)
predict_sentiment(classify_review(
    text_2, model, tokenizer, device, max_length=train_dataset.max_length
))

'positive'

In [145]:
text_3 = (
    "The meeting was held at 3 PM as scheduled."
)
predict_sentiment(classify_review(
    text_3, model, tokenizer, device, max_length=train_dataset.max_length
))

'neutral'