In [None]:
!pip install torch numpy transformers datasets tiktoken wandb tqdm

In [None]:
import os
import pandas as pd
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import math
import inspect
from dataclasses import dataclass
import torch.nn as nn
from torch.nn import functional as F
from torch.cuda.amp import GradScaler
from contextlib import nullcontext
import time
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import tiktoken
import wandb

In [None]:
# Log in to Weights & Biases so the training cells below can report metrics
# through wandb.init / wandb.log.
wandb.login()

In [None]:
# Model architecture
# The original model architecture from the GitHub repo was built for text generation.
# Here it is modified for sentiment analysis: a classification head replaces the language-model head.
# For the pretrained GPT-2 configurations, the weights are loaded from the GPT-2 sequence-classification model instead of the text-generation model.

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with an optional bias.

    Args:
        ndim: size of the normalised (last) dimension.
        bias: when truthy a learnable zero-initialised bias is created,
            otherwise the module has no bias term at all.
    """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        """Normalise `input` with the learned scale (and bias, if any)."""
        return F.layer_norm(
            input,
            normalized_shape=self.weight.shape,
            weight=self.weight,
            bias=self.bias,
            eps=1e-5,
        )

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention where each position only sees earlier ones.

    Reads `n_embd`, `n_head`, `bias`, `dropout` and `block_size` from
    `config`. Uses `scaled_dot_product_attention` when PyTorch provides it
    and falls back to an explicit masked softmax otherwise.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # One projection produces query, key and value for all heads at once.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # Output projection applied after the heads are merged back together.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.flash = hasattr(F, 'scaled_dot_product_attention')
        if self.flash:
            return
        print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
        # Lower-triangular mask, only needed by the manual attention path.
        causal_mask = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer("bias", causal_mask.view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        """Apply causal self-attention to `x` of shape (batch, time, n_embd)."""
        batch, seq_len, width = x.size()
        head_dim = width // self.n_head

        def to_heads(t):
            # (batch, time, width) -> (batch, n_head, time, head_dim)
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        if self.flash:
            drop_p = self.dropout if self.training else 0
            heads_out = F.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=drop_p, is_causal=True)
        else:
            scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            scores = scores.masked_fill(self.bias[:, :, :seq_len, :seq_len] == 0, float('-inf'))
            probs = self.attn_dropout(F.softmax(scores, dim=-1))
            heads_out = probs @ v

        # Put the heads side by side again before the output projection.
        merged = heads_out.transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.resid_dropout(self.c_proj(merged))

class MLP(nn.Module):
    """Position-wise feed-forward network: expand 4x, GELU, project back, dropout."""

    def __init__(self, config):
        super().__init__()
        hidden = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Return the feed-forward transformation of `x` (same trailing size)."""
        activated = self.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(activated))

class Block(nn.Module):
    """One transformer layer: pre-norm attention and pre-norm MLP, each with a residual."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        """Add the attention output, then the MLP output, onto the residual stream."""
        attended = x + self.attn(self.ln_1(x))
        return attended + self.mlp(self.ln_2(attended))

@dataclass
class GPTConfig:
    """Hyper-parameters consumed by GPT and its sub-modules."""
    block_size: int = 1024  # number of positions in the position-embedding table; GPT.forward rejects longer inputs
    vocab_size: int = 50304  # number of rows in the token-embedding table
    n_layer: int = 12  # number of transformer Blocks
    n_head: int = 12  # attention heads per block; must divide n_embd
    n_embd: int = 768  # embedding / hidden width
    dropout: float = 0.0  # dropout probability used by every Dropout layer
    bias: bool = True  # whether Linear and LayerNorm layers carry a bias term
    num_classes: int = 3 # output size of the classification head

class GPT(nn.Module):
    """GPT-style transformer backbone with a linear classification head.

    The backbone consists of token and position embeddings, `n_layer`
    transformer Blocks and a final LayerNorm. The representation at the last
    sequence position is projected to `config.num_classes` logits.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))

        self.classifier = nn.Linear(config.n_embd, config.num_classes) # Modified for classification

        self.apply(self._init_weights)
        # Residual output projections get a smaller std that shrinks with depth.
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """Return the parameter count; position embeddings are excluded when `non_embedding` is true."""
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and Linear biases to zero."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Classify a batch of token-id sequences.

        Args:
            idx: integer tensor of shape (batch, time) with time <= block_size.
            targets: optional class-index tensor handed to `F.cross_entropy`.

        Returns:
            `(logits, loss)`; logits come from the classification head and
            loss is None when `targets` is None.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        # Classification head: pool by taking the final position, which (under
        # causal attention) is the only one that has seen the whole sequence.
        # NOTE(review): the notebook right-pads inputs, so this position is
        # often a padding token - confirm this pooling is intended.
        x = x[:, -1, :]
        logits = self.classifier(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def crop_block_size(self, block_size):
        """Shrink the model to a smaller maximum sequence length in place."""
        assert block_size <= self.config.block_size
        self.config.block_size = block_size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
        for block in self.transformer.h:
            # The mask buffer only exists on the non-flash attention path.
            if hasattr(block.attn, 'bias'):
                block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]

    @staticmethod
    def _copy_hf_weights(sd, sd_hf):
        """Copy matching tensors from a Hugging Face GPT-2 state dict into `sd` in place.

        Handles three differences between the checkpoints: the HF
        classification head is stored under `score.` (ours is `classifier.`),
        the four Conv1D weight matrices are stored transposed, and the HF
        position-embedding table may have more rows than ours, in which case
        the leading rows are used.

        Returns:
            The list of HF keys that could not be copied (no counterpart in
            `sd`, or a shape mismatch), so the caller can report them instead
            of leaving weights silently at their random initialisation.
        """
        transposed = ('attn.c_attn.weight', 'attn.c_proj.weight',
                      'mlp.c_fc.weight', 'mlp.c_proj.weight')
        skipped = []
        with torch.no_grad():
            for k_hf, w_hf in sd_hf.items():
                # Attention-mask buffers are not parameters.
                if k_hf.endswith(('.attn.masked_bias', '.attn.bias')):
                    continue
                if k_hf.startswith('score.'):
                    k_our = 'classifier.' + k_hf[len('score.'):]
                else:
                    k_our = k_hf
                if k_our not in sd:
                    skipped.append(k_hf)
                    continue
                target = sd[k_our]
                if k_our.endswith(transposed):
                    w_hf = w_hf.t()
                elif (k_our == 'transformer.wpe.weight'
                        and w_hf.shape[0] > target.shape[0]
                        and w_hf.shape[1:] == target.shape[1:]):
                    # Our block_size is smaller than the pretrained context:
                    # keep the embeddings of the positions we actually use.
                    w_hf = w_hf[:target.shape[0]]
                if w_hf.shape != target.shape:
                    skipped.append(k_hf)
                    continue
                target.copy_(w_hf)
        return skipped

    @classmethod
    def from_pretrained(cls, model_type, override_args=None):
        """Build a classifier initialised from pretrained GPT-2 weights.

        Args:
            model_type: one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
            override_args: optional dict; only 'dropout' and 'num_classes'
                may be overridden (num_classes defaults to 3).

        Returns:
            A GPT instance with vocab_size=50257, block_size=256, bias=True.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        override_args = override_args or {}
        assert all(k in {'dropout', 'num_classes'} for k in override_args)
        from transformers import GPT2ForSequenceClassification # GPT2 for sequence classification is added instead of texthead due to key mismatch error
        print("loading weights from pretrained gpt: %s" % model_type)

        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024),
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280),
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600),
        }[model_type]

        # One source of truth for the head size, used for both our model and
        # the Hugging Face model so their head shapes always agree.
        num_classes = override_args.get('num_classes', 3)

        print("forcing vocab_size=50257, block_size=256, bias=True")
        config_args['vocab_size'] = 50257
        config_args['block_size'] = 256
        config_args['bias'] = True
        config_args['num_classes'] = num_classes

        if 'dropout' in override_args:
            print(f"overriding dropout rate to {override_args['dropout']}")
            config_args['dropout'] = override_args['dropout']

        config = GPTConfig(**config_args)
        model = GPT(config)
        # state_dict() tensors share storage with the parameters, so in-place
        # copies into `sd` update the model itself.
        sd = model.state_dict()

        # Load the pre-trained model, this is changed for sentiment analysis
        model_hf = GPT2ForSequenceClassification.from_pretrained(model_type, num_labels=num_classes)
        sd_hf = model_hf.state_dict()

        skipped = cls._copy_hf_weights(sd, sd_hf)
        if skipped:
            print(f"WARNING: {len(skipped)} pretrained tensors were not loaded: {skipped}")

        print("Loaded GPT2 for sequence classification")
        return model

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        """Create an AdamW optimizer that decays only tensors with 2+ dimensions.

        Matrices and embeddings receive `weight_decay`; biases and LayerNorm
        parameters (fewer than 2 dims) receive none. The fused implementation
        is used when this PyTorch offers it and `device_type` is 'cuda'.
        """
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
        N = self.get_num_params()
        cfg = self.config
        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
        flops_per_token = 6*N + 12*L*H*Q*T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        flops_achieved = flops_per_iter * (1.0/dt)
        flops_promised = 312e12  # the peak FLOPS figure the ratio is taken against
        mfu = flops_achieved / flops_promised
        return mfu


The following part is for data loading and tokenizing for NanoGPT, training, and evaluation

In [None]:
# Character level data preparation and tokenization for NanoGPT

# CSV files for the three splits; each one is read with load_data below.
train_data_path = "prep_train_final.csv"
val_data_path = "prep_val_final.csv"
test_data_path = "prep_test_final.csv"
batch_size = 12  # samples per DataLoader batch
block_size = 1024  # every encoded sequence is padded or truncated to this length

def load_data(csv_file):
    """Read a split from CSV and return `(texts, labels)`.

    Texts come from the 'conversation' column converted to str; labels are
    the raw values of the 'customer_sentiment' column.
    """
    frame = pd.read_csv(csv_file)
    return (
        frame['conversation'].astype(str).to_list(),
        frame['customer_sentiment'].to_list(),
    )

# Read the three splits in one pass.
(train_texts, train_labels), (val_texts, val_labels), (test_texts, test_labels) = (
    load_data(path) for path in (train_data_path, val_data_path, test_data_path)
)

# The character vocabulary is built from the training and validation text only.
all_texts = [*train_texts, *val_texts]
data = ' '.join(all_texts)
chars = sorted(set(data))
vocab_size = len(chars)
itos = dict(enumerate(chars))               # index -> character
stoi = {ch: i for i, ch in itos.items()}    # character -> index

print("All unique characters:")
print(''.join(chars))
print(f"Vocabulary size: {vocab_size}")

def encode(text):
    """Map each character of `text` to its vocabulary index, dropping unknown characters."""
    looked_up = map(stoi.get, text)
    return [index for index in looked_up if index is not None]

# Encode every text of every split into a list of character indices.
train_ids, val_ids, test_ids = (
    [encode(text) for text in split]
    for split in (train_texts, val_texts, test_texts)
)

def pad_or_truncate(data, block_size, pad_id=0):
    """Force every sequence in `data` to exactly `block_size` items.

    Args:
        data: iterable of token-id lists.
        block_size: target length of each returned sequence.
        pad_id: id appended to sequences that are too short. Defaults to 0,
            the value that was previously hard-coded.

    Returns:
        A new list of new lists; long sequences keep their first
        `block_size` items, short ones are right-padded with `pad_id`.
    """
    # A non-positive repeat count yields [], so the same expression covers
    # truncation, exact length and padding.
    return [seq[:block_size] + [pad_id] * (block_size - len(seq)) for seq in data]

# Fix every sequence to block_size and convert each split to a LongTensor.
train_ids, val_ids, test_ids = (
    torch.tensor(pad_or_truncate(ids, block_size), dtype=torch.long)
    for ids in (train_ids, val_ids, test_ids)
)

# Sentiment names (case-insensitive) -> class indices.
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
train_labels, val_labels, test_labels = (
    torch.tensor([label_map[name.lower()] for name in names], dtype=torch.long)
    for names in (train_labels, val_labels, test_labels)
)

train_dataset = TensorDataset(train_ids, train_labels)
val_dataset = TensorDataset(val_ids, val_labels)
test_dataset = TensorDataset(test_ids, test_labels)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

for split_name, split_ids in (("training", train_ids), ("validation", val_ids), ("test", test_ids)):
    print(f"Number of {split_name} samples: {len(split_ids)}")

In [None]:
# Training configuration and loop for training NanoGPT from scratch
# train.py and prepare.py are unified, added early stopping

init_from = 'scratch' # For training NanoGPT from scratch
out_dir = 'out_sent_char'  # checkpoints are written here
os.makedirs(out_dir, exist_ok=True)

# Evaluation / logging cadence, counted in optimizer steps.
eval_interval = 100
log_interval = 50
eval_iters = 200
max_iters = 2000
eval_only = False
always_save_checkpoint = False  # when False, only improved validation loss is saved

gradient_accumulation_steps = 2
# batch_size and block_size are reused unchanged from the data-preparation cell.

# Model size.
n_layer = 4
n_head = 4
n_embd = 256
dropout = 0.0
bias = True
num_labels = 3

# AdamW settings.
learning_rate = 1e-5
weight_decay = 1e-1
decay_lr = True
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# Learning-rate schedule: linear warm-up, then cosine decay to min_lr.
warmup_iters = 500
# The decay must finish within the run; a horizon far beyond max_iters would
# leave the schedule effectively flat.
lr_decay_iters = max_iters
# The floor has to sit below the peak rate, otherwise the "decay" phase
# would raise the learning rate instead of lowering it.
min_lr = learning_rate / 10

backend = 'nccl'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'

wandb_log = True
wandb_project = 'sentiment_analysis_char'
wandb_run_name = 'classification_from_scratch' + str(time.time())

# Hyper-parameters recorded with the Weights & Biases run.
if wandb_log:
    run_config = dict(
        init_from=init_from,
        n_layer=n_layer,
        n_head=n_head,
        n_embd=n_embd,
        dropout=dropout,
        num_labels=num_labels,
        max_iters=max_iters,
        eval_interval=eval_interval,
        log_interval=log_interval,
        eval_iters=eval_iters,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        beta1=beta1,
        beta2=beta2,
        grad_clip=grad_clip,
        gradient_accumulation_steps=gradient_accumulation_steps,
        batch_size=batch_size,
        block_size=block_size,
        decay_lr=decay_lr,
        warmup_iters=warmup_iters,
        min_lr=min_lr,
    )
    wandb.init(project=wandb_project, name=wandb_run_name, config=run_config)

# Distributed (DDP) mode is enabled when the RANK environment variable is set.
ddp = int(os.environ.get('RANK', -1)) != -1
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    # Each process drives the GPU matching its local rank.
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    # Rank 0 is the only process that evaluates, logs and checkpoints.
    master_process = ddp_rank == 0
    seed_offset = ddp_rank
    # The accumulation steps are divided among the processes.
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size
else:
    # Single-process run.
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

# Seed differs per rank through seed_offset.
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device_type = 'cuda' if 'cuda' in device else 'cpu'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# Autocast is used on CUDA only; on CPU the context manager does nothing.
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# Load Data
# Persistent iterators so successive get_batch calls walk through the loaders.
train_iter = iter(train_loader)
val_iter = iter(val_loader)

def get_batch(split):
    """Return the next `(x, y)` batch of `split` on `device`.

    `split` equal to 'train' reads from the training loader, anything else
    from the validation loader. An exhausted iterator is recreated and its
    first batch is returned.
    """
    global train_iter, val_iter

    use_train = split == 'train'
    try:
        x, y = next(train_iter if use_train else val_iter)
    except StopIteration:
        # End of an epoch: start over on the matching loader.
        if use_train:
            train_iter = iter(train_loader)
            x, y = next(train_iter)
        else:
            val_iter = iter(val_loader)
            x, y = next(val_iter)

    return x.to(device), y.to(device)

# Initialize model
# Constructor arguments for GPTConfig; they are also stored in checkpoints.
# num_classes carries the configured num_labels into the model, which would
# otherwise silently fall back to the GPTConfig default.
model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd,
                  block_size=block_size, bias=bias, vocab_size=None,
                  dropout=dropout, num_classes=num_labels)

if init_from == 'scratch':
    print("Initializing a new model from scratch")
    if vocab_size is None: # Vocab size is get from the initial dataset read
        print("defaulting to vocab_size of GPT-2: 50304 (50257 rounded up for efficiency)")
    model_args['vocab_size'] = vocab_size if vocab_size is not None else 50304
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif init_from == 'resume':
    print(f"Resuming training from {out_dir}")
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    # The architecture must match the checkpoint, so these values win over the cell's config.
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = checkpoint_model_args[k]
    # Checkpoints written before num_classes was recorded lack the key.
    model_args['num_classes'] = checkpoint_model_args.get('num_classes', num_labels)
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # Strip the '_orig_mod.' prefix from checkpoint keys that carry it.
    unwanted_prefix = '_orig_mod.'
    for k, v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
    print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
    override_args = dict(dropout=dropout)
    model = GPT.from_pretrained(init_from, override_args)
    # Record the configuration the pretrained model was actually built with.
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size', 'num_classes']:
        model_args[k] = getattr(model.config, k)
# Shrink the model when the requested context is shorter than the model's.
if block_size < model.config.block_size:
    model.crop_block_size(block_size)
    model_args['block_size'] = block_size
model.to(device)

# Loss scaling is only switched on for float16 training.
use_loss_scaling = dtype == 'float16'
scaler = torch.cuda.amp.GradScaler(enabled=use_loss_scaling)

# Optimizer
adam_betas = (beta1, beta2)
optimizer = model.configure_optimizers(weight_decay, learning_rate, adam_betas, device_type)
if init_from == 'resume':
    optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None  # drop the reference to the loaded checkpoint

# wrap model into DDP container
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])

@torch.no_grad()
def estimate_loss():
    """Return the mean per-batch loss over the full train and val loaders.

    The model is put in eval mode for the measurement and back in train mode
    before returning. Result: `{'train': float, 'val': float}`.
    """
    model.eval()
    out = {}
    for split, loader in (('train', train_loader), ('val', val_loader)):
        running = 0
        for inputs, labels in loader:
            _, batch_loss = model(inputs.to(device), labels.to(device))
            running += batch_loss.item()
        out[split] = running / len(loader)
    model.train()
    return out

def get_lr(it):
    """Learning rate for iteration `it`: linear warm-up, cosine phase, then constant.

    Below `warmup_iters` the rate grows linearly from 0 to `learning_rate`;
    up to `lr_decay_iters` it follows a half cosine from `learning_rate` to
    `min_lr`; beyond that it is `min_lr`.
    """
    # Phase 1: warm-up.
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # Phase 3: past the schedule horizon.
    if it > lr_decay_iters:
        return min_lr
    # Phase 2: cosine interpolation, progress runs from 0 to 1.
    progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= progress <= 1
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))
    return min_lr + cosine_weight * (learning_rate - min_lr)

# Bookkeeping shared by the training loop and its helpers.
val_loss_list = []        # validation losses seen so far (used for early stopping)
early_stop_patience = 3
# When resuming, iter_num and best_val_loss were restored from the checkpoint
# during model initialisation and must not be reset here.
if init_from != 'resume':
    best_val_loss = float('inf')
    iter_num = 0
t0 = time.time()
local_iter_num = 0        # iterations executed by this process in this run
raw_model = model.module if ddp else model  # the GPT itself when wrapped in DDP
running_mfu = -1.0        # -1.0 means "not measured yet"
X, Y = get_batch('train') # first batch, fetched before the loop starts

def check_early_stopping(val_loss, history=None, patience=None):
    """Record `val_loss` and report whether training should stop early.

    Training stops once the most recent `patience` evaluations have all
    failed to improve on the best validation loss observed before them.

    Args:
        val_loss: validation loss of the current evaluation.
        history: list the loss is appended to; defaults to the module-level
            `val_loss_list`.
        patience: evaluations to wait without improvement; defaults to the
            module-level `early_stop_patience`.

    Returns:
        True when early stopping is triggered, otherwise False.
    """
    if history is None:
        history = val_loss_list
    if patience is None:
        patience = early_stop_patience

    history.append(val_loss)
    # Need at least one earlier evaluation to serve as the reference.
    if patience < 1 or len(history) <= patience:
        return False

    # Compare the recent window with the best loss from *before* it, so the
    # current value is never compared with itself.
    best_before = min(history[:-patience])
    if all(recent >= best_before for recent in history[-patience:]):
        print(f"Early stopping: No improvement in validation loss for {patience} evaluations.")
        return True
    return False

def save_checkpoint(val_loss, iter_num):
    """Write `ckpt.pt` into `out_dir` when the validation loss improved.

    A checkpoint is also written when `always_save_checkpoint` is set. In
    both cases `best_val_loss` is updated to `val_loss`.
    """
    global best_val_loss
    should_save = val_loss < best_val_loss or always_save_checkpoint
    if not should_save:
        return

    best_val_loss = val_loss
    checkpoint_path = os.path.join(out_dir, 'ckpt.pt')
    torch.save(
        {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'model_args': model_args,
            'iter_num': iter_num,
            'best_val_loss': best_val_loss,
        },
        checkpoint_path,
    )
    print(f"Checkpoint saved at iteration {iter_num}, validation loss: {val_loss:.4f}")

# Main training loop: one pass of the body is one optimizer step.
while iter_num <= max_iters:
    # Set the learning rate for this step on every parameter group.
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Periodic evaluation, logging, early stopping and checkpointing (master process only).
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        val_loss = losses['val']
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {val_loss:.4f}")
        if wandb_log:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "lr": lr,
                "mfu": running_mfu*100,
            })

        # Early stopping is checked before saving, so the evaluation that
        # triggers the stop does not write a checkpoint.
        if check_early_stopping(val_loss):
            break

        save_checkpoint(val_loss, iter_num)

    # Gradient accumulation: several forward/backward passes per optimizer step.
    for micro_step in range(gradient_accumulation_steps):
        if ddp:
            # Only the last micro step synchronises gradients across processes.
            model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
        with ctx:
            logits, loss = model(X, Y)
            # Scale so the accumulated gradient is the mean over micro steps.
            loss = loss / gradient_accumulation_steps
        # Fetch the next batch before running the backward pass.
        X, Y = get_batch('train')
        scaler.scale(loss).backward()

    if grad_clip != 0.0:
        # Gradients are unscaled first so clipping acts on their true values.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)
    # Wall-clock time of this iteration.
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        # Undo the accumulation scaling; this is the loss of the last micro step only.
        lossf = loss.item() * gradient_accumulation_steps
        # The first few iterations are left out of the MFU estimate.
        if local_iter_num >= 5:
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
    iter_num += 1
    local_iter_num += 1

    if iter_num > max_iters:
        break

# Close the Weights & Biases run.
if wandb_log:
    wandb.finish()

In [None]:
# Evaluation code for trained NanoGPT
# Includes Accuracy, Class-wise accuracy (3 classes), F1-score, and confusion matrix

# Restore the checkpoint written during training and switch to eval mode.
checkpoint = torch.load('out_sent_char/ckpt.pt', map_location=device)
state_dict = checkpoint['model']
model.load_state_dict(state_dict)
model.eval()

# Collect the predicted and true class index for every test sample.
predictions = []
true_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        logits, _ = model(inputs.to(device))
        predictions.extend(logits.argmax(dim=-1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
inv_label_map = {v: k for k, v in label_map.items()}
label_ids = sorted(inv_label_map)                     # [0, 1, 2]
class_labels = [inv_label_map[i] for i in label_ids]  # names in index order

# Passing the full label list keeps the matrix 3x3 even when a class never
# appears in the test labels or the predictions.
conf_matrix = confusion_matrix(true_labels, predictions, labels=label_ids)

overall_accuracy = accuracy_score(true_labels, predictions)
print(f"\nOverall Accuracy: {overall_accuracy * 100:.2f}%")

# Per-class accuracy = correct / number of true samples of that class.
# Classes without any true sample are reported as NaN instead of dividing by zero.
row_totals = conf_matrix.sum(axis=1)
class_accuracy = np.divide(
    conf_matrix.diagonal(),
    row_totals,
    out=np.full(len(label_ids), np.nan),
    where=row_totals > 0,
)

print("\nClass-wise Accuracy:")
for i, acc in enumerate(class_accuracy):
    label_name = inv_label_map[i]
    print(f"  Class {i} ({label_name}): {acc * 100:.2f}%")

print("\nClassification Report:")
report = classification_report(true_labels, predictions, labels=label_ids,
                               target_names=class_labels, zero_division=0)
print(report)

# Display order for the confusion matrix: negative -> neutral -> positive.
ordered_labels = ['negative', 'neutral', 'positive']
ordered_indices = [label_map[label] for label in ordered_labels]
reordered_conf_matrix = conf_matrix[np.ix_(ordered_indices, ordered_indices)]

print("\nConfusion Matrix:")
print("{:<10}".format(""), end="")
for label in ordered_labels:
    print("{:<10}".format(label), end="")
print()
for row_label, row in zip(ordered_labels, reordered_conf_matrix):
    print("{:<10}".format(row_label), end="")
    for count in row:
        print("{:<10}".format(count), end="")
    print()

plt.figure(figsize=(8, 6))
sns.heatmap(reordered_conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=ordered_labels, yticklabels=ordered_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()


The following part is for data loading and tokenizing for GPT-2, fine-tuning, and evaluation

In [None]:
# Data preparation and tokenization for gpt2. Added gpt2 tokenizer instead of character level tokenization

# CSV files for the three splits; each one is read with load_data below.
train_data_path = "prep_train_final.csv"
val_data_path = "prep_val_final.csv"
test_data_path = "prep_test_final.csv"

block_size = 256  # every encoded sequence is padded or truncated to this many tokens
batch_size = 24  # samples per DataLoader batch

def load_data(csv_file):
    """Read a split from CSV and return `(texts, labels)`.

    Texts come from the 'conversation' column converted to str; labels are
    the raw values of the 'customer_sentiment' column.
    """
    frame = pd.read_csv(csv_file)
    return (
        frame['conversation'].astype(str).to_list(),
        frame['customer_sentiment'].to_list(),
    )

# Read the three splits in one pass.
(train_texts, train_labels), (val_texts, val_labels), (test_texts, test_labels) = (
    load_data(path) for path in (train_data_path, val_data_path, test_data_path)
)

# GPT-2 encoding from tiktoken, used by encode and pad_or_truncate below.
enc = tiktoken.get_encoding("gpt2")

def encode(text):
    """Return the token ids that `enc.encode_ordinary` produces for `text`."""
    token_ids = enc.encode_ordinary(text)
    return token_ids

# Tokenize every text of every split.
train_ids, val_ids, test_ids = (
    [encode(text) for text in split]
    for split in (train_texts, val_texts, test_texts)
)

def pad_or_truncate(data, block_size, pad_id=None):
    """Force every sequence in `data` to exactly `block_size` tokens.

    Args:
        data: iterable of token-id lists.
        block_size: target length of each returned sequence.
        pad_id: id appended to sequences that are too short. When None
            (the default) the tokenizer's `enc.eot_token` is used, as before.

    Returns:
        A new list of new lists; long sequences keep their first
        `block_size` tokens, short ones are right-padded with `pad_id`.
    """
    if pad_id is None:
        pad_id = enc.eot_token
    # A non-positive repeat count yields [], so the same expression covers
    # truncation, exact length and padding.
    return [seq[:block_size] + [pad_id] * (block_size - len(seq)) for seq in data]

# Fix every sequence to block_size and convert each split to a LongTensor.
train_ids, val_ids, test_ids = (
    torch.tensor(pad_or_truncate(ids, block_size), dtype=torch.long)
    for ids in (train_ids, val_ids, test_ids)
)

# Sentiment names (case-insensitive) -> class indices.
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
train_labels, val_labels, test_labels = (
    torch.tensor([label_map[name.lower()] for name in names], dtype=torch.long)
    for names in (train_labels, val_labels, test_labels)
)

train_dataset = TensorDataset(train_ids, train_labels)
val_dataset = TensorDataset(val_ids, val_labels)
test_dataset = TensorDataset(test_ids, test_labels)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

for split_name, split_ids in (("training", train_ids), ("validation", val_ids), ("test", test_ids)):
    print(f"Number of {split_name} samples: {len(split_ids)}")

In [None]:
# Training configuration and loop for fine-tuning GPT2, the variables are also changed for fine-tuning
# train.py and prepare.py are unified, added early stopping

init_from = 'gpt2' # selects the pretrained-GPT-2 branch of the model initialisation below
out_dir = 'out_sent_fineTune'  # checkpoint directory for the fine-tuning run
os.makedirs(out_dir, exist_ok=True)

# Evaluation / logging cadence, counted in optimizer steps.
eval_interval = 25
log_interval = 1
eval_iters = 25
max_iters = 50

# Architecture of the 'gpt2' checkpoint (matches the table in GPT.from_pretrained).
n_layer = 12
n_head = 12
n_embd = 768

# NOTE(review): GPT.from_pretrained builds its config with bias=True regardless
# of this value - confirm False is not expected to take effect.
bias = False
learning_rate = 1e-5
weight_decay = 1e-1
decay_lr = False  # constant learning rate; get_lr is not consulted
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# Self-assignments: the values come from the data-preparation cell above.
batch_size = batch_size
block_size = block_size

always_save_checkpoint = False

# NOTE(review): 0.8 is an unusually high dropout rate for fine-tuning - confirm it is intended.
dropout = 0.8
gradient_accumulation_steps = 32

# Repeats the assignment made a few lines above with the same value.
learning_rate = 1e-5

backend = 'nccl'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'

wandb_log = True
wandb_project = 'sentiment_analysis_gpt2'
# The timestamp suffix keeps run names distinct.
wandb_run_name = 'classification_fineTune' + str(time.time())

# Hyper-parameters recorded with the Weights & Biases run.
if wandb_log:
    run_config = dict(
        init_from=init_from,
        dropout=dropout,
        max_iters=max_iters,
        decay_lr=decay_lr,
        eval_interval=eval_interval,
        log_interval=log_interval,
        eval_iters=eval_iters,
        learning_rate=learning_rate,
        gradient_accumulation_steps=gradient_accumulation_steps,
        batch_size=batch_size,
    )
    wandb.init(project=wandb_project, name=wandb_run_name, config=run_config)

# Distributed (DDP) mode is enabled when the RANK environment variable is set.
ddp = int(os.environ.get('RANK', -1)) != -1
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    # Each process drives the GPU matching its local rank.
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    # Rank 0 is the only process that evaluates, logs and checkpoints.
    master_process = ddp_rank == 0
    seed_offset = ddp_rank
    # The accumulation steps are divided among the processes.
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size
else:
    # Single-process run.
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

# Seed differs per rank through seed_offset.
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device_type = 'cuda' if 'cuda' in device else 'cpu'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# Autocast is used on CUDA only; on CPU the context manager does nothing.
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# Load Data
# Persistent iterators so successive get_batch calls walk through the loaders.
train_iter = iter(train_loader)
val_iter = iter(val_loader)

def get_batch(split):
    """Return the next `(x, y)` batch of `split` on `device`.

    `split` equal to 'train' reads from the training loader, anything else
    from the validation loader. An exhausted iterator is recreated and its
    first batch is returned.
    """
    global train_iter, val_iter

    use_train = split == 'train'
    try:
        x, y = next(train_iter if use_train else val_iter)
    except StopIteration:
        # End of an epoch: start over on the matching loader.
        if use_train:
            train_iter = iter(train_loader)
            x, y = next(train_iter)
        else:
            val_iter = iter(val_loader)
            x, y = next(val_iter)

    return x.to(device), y.to(device)

# Initialize model
# Architecture arguments; stored in every checkpoint so a run can be resumed.
model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd,
                  block_size=block_size, bias=bias, vocab_size=None,
                  dropout=dropout)

if init_from == 'scratch':
    print("Initializing a new model from scratch")
    if vocab_size is None:  # vocab_size is normally taken from the initial dataset read
        print("defaulting to vocab_size of GPT-2: 50304 (50257 rounded up for efficiency)")
    model_args['vocab_size'] = vocab_size if vocab_size is not None else 50304
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif init_from == 'resume':
    print(f"Resuming training from {out_dir}")
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    # The architecture must match the checkpoint, so these keys override the
    # values configured above; dropout keeps its configured value.
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = checkpoint_model_args[k]
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # Checkpoints are written from whatever wrapper `model` had at save time.
    # A DDP wrapper prefixes every key with 'module.' and a compiled model with
    # '_orig_mod.'; strip both (in any nesting order) so the bare GPT module
    # accepts the weights.
    wrapper_prefixes = ('_orig_mod.', 'module.')
    for k in list(state_dict):
        bare_key = k
        while bare_key.startswith(wrapper_prefixes):
            bare_key = bare_key[bare_key.index('.') + 1:]
        if bare_key != k:
            state_dict[bare_key] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
    print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
    override_args = dict(dropout=dropout)
    model = GPT.from_pretrained(init_from, override_args)
    # Record the pretrained architecture so it is stored in checkpoints.
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = getattr(model.config, k)
else:
    # Fail with a clear message instead of a NameError on `model` below.
    raise ValueError(
        f"Unknown init_from {init_from!r}: expected 'scratch', 'resume' or a name starting with 'gpt2'"
    )

# Shrink the model's context window when a smaller block_size was requested.
if block_size < model.config.block_size:
    model.crop_block_size(block_size)
    model_args['block_size'] = block_size
model.to(device)

# Gradient scaler; it is only enabled when training in float16.
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

# Optimizer
optimizer = model.configure_optimizers(
    weight_decay,
    learning_rate,
    (beta1, beta2),
    device_type,
)
if init_from == 'resume':
    # Continue from the optimizer state stored in the checkpoint.
    optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None  # drop the reference so the loaded checkpoint can be freed

# Wrap the model in a DDP container for distributed runs
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])

@torch.no_grad()
def estimate_loss():
    """Return the mean per-batch loss over the full train and val loaders.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning.

    Returns:
        dict with keys ``'train'`` and ``'val'``; each value is the sum of the
        batch losses divided by the number of batches in that loader.
    """
    model.eval()
    out = {}
    loaders = {'train': train_loader, 'val': val_loader}
    for split, loader in loaders.items():
        total_loss = 0
        for xb, yb in loader:
            # The model returns a pair; only the loss (second item) is needed.
            _, loss = model(xb.to(device), yb.to(device))
            total_loss += loss.item()

        out[split] = total_loss / len(loader)

    model.train()
    return out

def get_lr(it):
    """Return the learning rate for iteration ``it``.

    The schedule has three phases:
      * ``it < warmup_iters``: linear ramp from 0 towards ``learning_rate``;
      * ``warmup_iters <= it <= lr_decay_iters``: cosine decay from
        ``learning_rate`` down to ``min_lr``;
      * ``it > lr_decay_iters``: constant ``min_lr``.
    """
    # Phase 1: linear warmup.
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # Phase 3: past the decay horizon the rate stays at the floor.
    if it > lr_decay_iters:
        return min_lr
    # Phase 2: cosine interpolation between learning_rate and min_lr.
    progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= progress <= 1
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))  # goes 1 -> 0
    return min_lr + cosine_weight * (learning_rate - min_lr)

# Early-stopping state: recent validation losses and the window size.
val_loss_list = []
early_stop_patience = 3

# When resuming, iter_num and best_val_loss were restored from the checkpoint
# during model initialization; resetting them here would silently restart the
# iteration counter and discard the best loss seen so far.
if init_from != 'resume':
    best_val_loss = float('inf')
    iter_num = 0

local_iter_num = 0  # iterations performed by this process in this run
t0 = time.time()
raw_model = model.module if ddp else model  # unwrap the DDP container if present
running_mfu = -1.0  # -1.0 means "no MFU estimate yet"
X, Y = get_batch('train')  # first batch for the training loop

def check_early_stopping(val_loss):
    """Record ``val_loss`` and report whether training should stop early.

    Training stops when the current validation loss is no better than any of
    the ``early_stop_patience`` evaluations that preceded it. No decision is
    made until that many earlier evaluations have been recorded.

    Args:
        val_loss: validation loss of the evaluation that just finished.

    Returns:
        True if training should stop, False otherwise.
    """
    # Snapshot the earlier evaluations *before* recording the current one.
    # Comparing against a window that already contains `val_loss` makes one
    # comparison trivially true and shortens the effective patience by one.
    previous = list(val_loss_list)

    val_loss_list.append(val_loss)
    # Keep only the most recent `early_stop_patience` losses for the next call.
    overflow = len(val_loss_list) - early_stop_patience
    if overflow > 0:
        del val_loss_list[:overflow]

    if len(previous) < early_stop_patience:
        return False
    if all(val_loss >= prev for prev in previous):
        print(f"Early stopping: No improvement in validation loss for {early_stop_patience} evaluations.")
        return True
    return False

def save_checkpoint(val_loss, iter_num):
    """Write ``ckpt.pt`` into ``out_dir`` when this evaluation warrants it.

    A checkpoint is written when ``val_loss`` beats ``best_val_loss`` or when
    ``always_save_checkpoint`` is set. In either case ``best_val_loss`` is
    overwritten with ``val_loss`` before saving.

    Args:
        val_loss: validation loss of the current evaluation.
        iter_num: training iteration stored in the checkpoint.
    """
    global best_val_loss

    improved = val_loss < best_val_loss
    if not (improved or always_save_checkpoint):
        return

    best_val_loss = val_loss
    torch.save(
        {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'model_args': model_args,
            'iter_num': iter_num,
            'best_val_loss': best_val_loss,
        },
        os.path.join(out_dir, 'ckpt.pt'),
    )
    print(f"Checkpoint saved at iteration {iter_num}, validation loss: {val_loss:.4f}")

# Main training loop: one optimizer step per pass, for iterations up to and
# including max_iters, unless early stopping ends it sooner.
while iter_num <= max_iters:
    # Apply this iteration's learning rate (scheduled or constant) to every
    # parameter group.
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Periodic evaluation, logging, early stopping and checkpointing; master
    # process only.
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        val_loss = losses['val']
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {val_loss:.4f}")
        if wandb_log:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "lr": lr,
                "mfu": running_mfu*100,
            })

        # The early-stop check runs before save_checkpoint, so the evaluation
        # that triggers the stop is not checkpointed.
        # NOTE(review): this break is reached only on the master process; under
        # DDP the other ranks would keep iterating - confirm multi-GPU behavior.
        if check_early_stopping(val_loss):
            break

        save_checkpoint(val_loss, iter_num)

    # Forward/backward over gradient_accumulation_steps micro-batches.
    for micro_step in range(gradient_accumulation_steps):
        if ddp:
            # Flag is True only on the last micro-step, so that (presumably)
            # DDP synchronizes gradients once per optimizer step.
            model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
        with ctx:
            logits, loss = model(X, Y)
            # Scale down so the accumulated gradient is an average over the
            # micro-batches rather than a sum.
            loss = loss / gradient_accumulation_steps
        # Fetch the next batch before running the backward pass.
        X, Y = get_batch('train')
        scaler.scale(loss).backward()

    # Gradients must be unscaled before clipping so grad_clip applies to their
    # true magnitudes.
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)
    # Wall-clock time of this iteration (includes any evaluation done above).
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        # Undo the division above; this is the loss of the last micro-step only.
        lossf = loss.item() * gradient_accumulation_steps
        # Skip the first 5 local iterations, then keep an exponential moving
        # average (0.9 old / 0.1 new) of the model's MFU estimate.
        if local_iter_num >= 5:
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
    iter_num += 1
    local_iter_num += 1

    # Termination check; duplicates the while condition.
    if iter_num > max_iters:
        break

# Close the W&B run once training has finished.
if wandb_log:
    wandb.finish()

In [None]:
# Evaluation code for fine-tuned GPT2
# Includes Accuracy, Class-wise accuracy (3 classes), F1-score, and confusion matrix

# Load the saved fine-tuned weights into the model.
checkpoint = torch.load('out_sent_fineTune/ckpt.pt', map_location=device)
state_dict = checkpoint['model']
model.load_state_dict(state_dict)

model.eval()

# Class names and their integer ids.
# NOTE(review): presumably the same encoding used when the dataset labels were
# built - verify against the preprocessing code.
class_labels = ['positive', 'neutral', 'negative']
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
inv_label_map = {v: k for k, v in label_map.items()}
label_ids = sorted(inv_label_map)

predictions = []
true_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Called without targets; the predicted class is the arg-max logit.
        logits, _ = model(inputs)
        predicted_labels = logits.argmax(dim=-1).cpu().numpy()

        predictions.extend(predicted_labels)
        true_labels.extend(labels.cpu().numpy())

# Pass the label ids explicitly so the matrix always has one row/column per
# class, even when a class is missing from the test split or the predictions.
conf_matrix = confusion_matrix(true_labels, predictions, labels=label_ids)

overall_accuracy = accuracy_score(true_labels, predictions)
print(f"\nOverall Accuracy: {overall_accuracy * 100:.2f}%")

# Per-class accuracy = correct predictions / true samples of that class.
# A class with no true samples is reported as 0 instead of dividing by zero.
class_totals = conf_matrix.sum(axis=1)
class_accuracy = [
    correct / total if total else 0.0
    for correct, total in zip(conf_matrix.diagonal(), class_totals)
]

print("\nClass-wise Accuracy:")
for i, acc in zip(label_ids, class_accuracy):
    label_name = inv_label_map[i]
    print(f"  Class {i} ({label_name}): {acc * 100:.2f}%")

print("\nClassification Report:")
report = classification_report(
    true_labels,
    predictions,
    labels=label_ids,
    target_names=[inv_label_map[i] for i in label_ids],
    zero_division=0,
)
print(report)

# Arrange rows/columns in the display order given by class_labels.
ordered_indices = [label_map[label] for label in class_labels]
reordered_conf_matrix = conf_matrix[np.ix_(ordered_indices, ordered_indices)]

# Plain-text confusion matrix: rows are true classes, columns are predictions.
print("\nConfusion Matrix:")
print("{:<10}".format(""), end="")
for label in class_labels:
    print("{:<10}".format(label), end="")
print()
for row_label, row_counts in zip(class_labels, reordered_conf_matrix):
    print("{:<10}".format(row_label), end="")
    for count in row_counts:
        print("{:<10}".format(count), end="")
    print()

# Heatmap of the same matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(reordered_conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()
