In [28]:

import os
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AdamW
import random
from transformers import get_scheduler
from torch.nn.utils import clip_grad_norm_
import torch
from tqdm import tqdm


# Section 2: Load and Sample Data

In [29]:
data = pd.read_csv('NYT_Dataset.csv')
data = data.dropna()
data = data[['abstract', 'title']]
data = data.sample(frac=1, random_state=42).reset_index(drop=True)


# Section 3: Define Custom Dataset Class

In [30]:
class NYTDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        abstract = self.data.iloc[idx]['abstract']
        title = self.data.iloc[idx]['title']
        inputs = self.tokenizer(
            abstract,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        labels = self.tokenizer(
            title,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        ).input_ids
        
        inputs['input_ids'] = inputs['input_ids'].squeeze()
        inputs['attention_mask'] = inputs['attention_mask'].squeeze()
        labels = labels.squeeze()
        
        inputs['labels'] = labels
        
        return inputs


# Section 4: Initialize Tokenizer, Dataset, and Data Loaders

In [31]:
# Initialize tokenizer and dataset
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
dataset = NYTDataset(data, tokenizer)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


# Section 5: Load Model, Optimizer, and Set Device

In [32]:

model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
optimizer = AdamW(model.parameters(), lr=5e-5)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)




BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_lay

# Section 6: Define Checkpoint Directory and Save Function

In [33]:
# Directory to save checkpoints
checkpoint_dir = './checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)


def save_checkpoint(epoch):
    checkpoint_path = os.path.join(checkpoint_dir, f'model_epoch_{epoch}.pt')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")


# Section 7: Load Checkpoint (for Resuming Training)

In [43]:

def load_checkpoint(model, optimizer, checkpoint_dir):
    # Find the latest checkpoint file
    checkpoints = [f for f in os.listdir(checkpoint_dir) if f.endswith(".pt")]
    if not checkpoints:
        print("No checkpoint found, starting training from scratch.")
        return 0  # Start from the first epoch if no checkpoint is found
    
    latest_checkpoint = max(checkpoints, key=lambda x: int(x.split('_')[-1].split('.')[0]))
    checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
    
    try:
        # Load the checkpoint
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        print(f"Resuming training from epoch {start_epoch} using checkpoint {checkpoint_path}")
        return start_epoch
    except KeyError:
        print(f"Error loading checkpoint. The file {checkpoint_path} might not be in the expected format.")
        raise
    except RuntimeError as e:
        print(f"RuntimeError while loading checkpoint: {e}")
        raise

# Example usage
start_epoch = load_checkpoint(model, optimizer, "checkpoints")

RuntimeError while loading checkpoint: PytorchStreamReader failed reading zip archive: failed finding central directory


  checkpoint = torch.load(checkpoint_path)


RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory

# Section 8: Training Loop with Checkpoint Saving

In [None]:

from transformers import get_scheduler
from torch.nn.utils import clip_grad_norm_
import torch
from tqdm import tqdm

def train_model(
    model, train_loader, val_loader, optimizer, device, initial_lr=5e-5, patience=3, max_epochs=100
):
    # Freeze the first N encoder layers initially
    freeze_layers_until = 6  
    for name, param in model.named_parameters():
        
        parts = name.split(".")
        if len(parts) > 2 and parts[0] == "model" and parts[1] == "encoder" and parts[2].isdigit():
            layer_number = int(parts[2])
            if layer_number < freeze_layers_until:
                param.requires_grad = False

    # Scheduler setup
    total_steps = len(train_loader) * max_epochs
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # self-adjusting learning rate
    best_val_loss = float('inf')
    epochs_without_improvement = 0

    
    scaler = torch.cuda.amp.GradScaler()

    for epoch in range(max_epochs):
        model.train()
        print(f"Epoch {epoch + 1}/{max_epochs}")
        progress_bar = tqdm(train_loader, desc="Training", leave=True)

        for batch in progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}

            
            with torch.cuda.amp.autocast():
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["labels"]
                )
                loss = outputs.loss

            
            scaler.scale(loss).backward()

            # Gradient clipping
            clip_grad_norm_(model.parameters(), max_norm=1.0)

            # Optimizer step
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

            progress_bar.set_postfix(loss=loss.item())

        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["labels"]
                )
                val_loss += outputs.loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation Loss after Epoch {epoch + 1}: {avg_val_loss}")

        
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_without_improvement = 0
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch + 1  # Save the current epoch
            }, f"checkpoints/best_model_epoch_{epoch + 1}.pt")
            print(f"Saved new best model with validation loss {best_val_loss:.4f}")
        else:
            epochs_without_improvement += 1
            print(f"No improvement for {epochs_without_improvement} epoch(s).")

        
        if epochs_without_improvement >= patience:
            
            current_lr = optimizer.param_groups[0]['lr']
            new_lr = current_lr * 0.5
            for param_group in optimizer.param_groups:
                param_group['lr'] = new_lr
            print(f"Reducing learning rate to {new_lr:.6f}")
            epochs_without_improvement = 0

        # Unfreeze one additional layer after every 10 epochs
        if epoch % 10 == 0 and freeze_layers_until > 0:
            freeze_layers_until -= 1
            for name, param in model.named_parameters():
                parts = name.split(".")
                if len(parts) > 2 and parts[0] == "model" and parts[1] == "encoder" and parts[2].isdigit():
                    layer_number = int(parts[2])
                    if layer_number < freeze_layers_until:
                        param.requires_grad = True
            print(f"Unfrozen an additional layer. Now training layers >= {freeze_layers_until}.")

        # Early stopping
        if avg_val_loss > best_val_loss and epochs_without_improvement >= patience * 2:
            print("Stopping early due to lack of improvement.")
            break


In [38]:
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    initial_lr=5e-5,
    patience=5,
    max_epochs=100
)

  scaler = torch.cuda.amp.GradScaler()


Epoch 1/100


  with torch.cuda.amp.autocast():
Training: 100%|██████████| 5297/5297 [08:38<00:00, 10.22it/s, loss=0.519]


Validation Loss after Epoch 1: 0.31702273614001725
Saved new best model with validation loss 0.3170
Unfrozen an additional layer. Now training layers >= 5.
Epoch 2/100


Training: 100%|██████████| 5297/5297 [08:36<00:00, 10.26it/s, loss=0.244]


Validation Loss after Epoch 2: 0.31132992284477884
Saved new best model with validation loss 0.3113
Epoch 3/100


Training: 100%|██████████| 5297/5297 [08:34<00:00, 10.30it/s, loss=0.361]


Validation Loss after Epoch 3: 0.3066443328812437
Saved new best model with validation loss 0.3066
Epoch 4/100


Training: 100%|██████████| 5297/5297 [11:31<00:00,  7.66it/s, loss=0.461]


Validation Loss after Epoch 4: 0.3047685994067282
Saved new best model with validation loss 0.3048
Epoch 5/100


Training: 100%|██████████| 5297/5297 [11:03<00:00,  7.99it/s, loss=0.336]


Validation Loss after Epoch 5: 0.3045579712908223
Saved new best model with validation loss 0.3046
Epoch 6/100


Training: 100%|██████████| 5297/5297 [10:56<00:00,  8.07it/s, loss=0.547]


Validation Loss after Epoch 6: 0.30422682942084545
Saved new best model with validation loss 0.3042
Epoch 7/100


Training: 100%|██████████| 5297/5297 [11:15<00:00,  7.84it/s, loss=0.5]  


Validation Loss after Epoch 7: 0.303460969025234
Saved new best model with validation loss 0.3035
Epoch 8/100


Training: 100%|██████████| 5297/5297 [11:21<00:00,  7.77it/s, loss=0.254]


Validation Loss after Epoch 8: 0.3029055205493603
Saved new best model with validation loss 0.3029
Epoch 9/100


Training: 100%|██████████| 5297/5297 [10:57<00:00,  8.06it/s, loss=0.304]


Validation Loss after Epoch 9: 0.30266699174665057
Saved new best model with validation loss 0.3027
Epoch 10/100


Training: 100%|██████████| 5297/5297 [10:57<00:00,  8.06it/s, loss=0.317]


Validation Loss after Epoch 10: 0.3022134920898473
Saved new best model with validation loss 0.3022
Epoch 11/100


Training: 100%|██████████| 5297/5297 [10:58<00:00,  8.04it/s, loss=0.422]


Validation Loss after Epoch 11: 0.3022057004694669
Saved new best model with validation loss 0.3022
Unfrozen an additional layer. Now training layers >= 4.
Epoch 12/100


Training: 100%|██████████| 5297/5297 [11:17<00:00,  7.81it/s, loss=0.183]


Validation Loss after Epoch 12: 0.301158615260754
Saved new best model with validation loss 0.3012
Epoch 13/100


Training: 100%|██████████| 5297/5297 [11:36<00:00,  7.60it/s, loss=0.294]


Validation Loss after Epoch 13: 0.3009983653842278
Saved new best model with validation loss 0.3010
Epoch 14/100


Training:  33%|███▎      | 1760/5297 [03:50<07:43,  7.63it/s, loss=0.388]


KeyboardInterrupt: 

# Section 9: Prediction Function

In [42]:
def predict_random_title(data,model,tokenizer,device):
    random_sample = data.sample(1).iloc[0]
    abstract = random_sample['abstract']
    actual_title = random_sample['title']
    inputs = tokenizer(abstract, max_length=128, padding='max_length', truncation=True, return_tensors="pt").to(device)
    model.eval()
    with torch.no_grad():
        predicted_title = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=128, num_beams=5, early_stopping=True)
    predicted_title = tokenizer.decode(predicted_title[0], skip_special_tokens=True)
    # Print the results for comparison
    print(f"Abstract:\n{abstract}\n")
    print(f"Actual Title:\n{actual_title}\n")
    print(f"Predicted Title:\n{predicted_title}\n")

# Test the model with a random abstract from the dataset
predict_random_title(data, model, tokenizer, device)

Abstract:
Siamak Namazi, a consultant who has advocated better relations between the United States and Iran, was arrested in Tehran around Oct. 15, according to people close to him.

Actual Title:
Iran Is Said to Detain Another Iranian-American

Predicted Title:
Iran Arrests U.S. Diplomat

