In [2]:

import os
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AdamW
import random
from transformers import get_scheduler
from torch.nn.utils import clip_grad_norm_
import torch
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


# Section 2: Load and Sample Data

In [3]:
data = pd.read_csv('NYT_Dataset.csv')
data = data.dropna()
data = data[['abstract', 'title']]
data = data.sample(frac=1, random_state=42).reset_index(drop=True)


# Section 3: Define Custom Dataset Class

In [4]:
class NYTDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        abstract = self.data.iloc[idx]['abstract']
        title = self.data.iloc[idx]['title']
        inputs = self.tokenizer(
            abstract,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        labels = self.tokenizer(
            title,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        ).input_ids
        
        inputs['input_ids'] = inputs['input_ids'].squeeze()
        inputs['attention_mask'] = inputs['attention_mask'].squeeze()
        labels = labels.squeeze()
        
        inputs['labels'] = labels
        
        return inputs


# Section 4: Initialize Tokenizer, Dataset, and Data Loaders

In [5]:
# Initialize tokenizer and dataset
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
dataset = NYTDataset(data, tokenizer)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


# Section 5: Load Model, Optimizer, and Set Device

In [None]:

model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
optimizer = AdamW(model.parameters(), lr=5e-5)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


OSError: best_model_epoch_3.pt is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

# Section 6: Define Checkpoint Directory and Save Function

In [19]:
# Directory to save checkpoints
checkpoint_dir = './checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)


def save_checkpoint(epoch):
    checkpoint_path = os.path.join(checkpoint_dir, f'model_epoch_{epoch}.pt')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")


# Section 7: Load Checkpoint (for Resuming Training)

In [None]:

def load_checkpoint(model, optimizer, checkpoint_dir):
    # Find the latest checkpoint file
    checkpoints = [f for f in os.listdir(checkpoint_dir) if f.endswith(".pt")]
    if not checkpoints:
        print("No checkpoint found, starting training from scratch.")
        return 0  # Start from the first epoch if no checkpoint is found
    
    latest_checkpoint = max(checkpoints, key=lambda x: int(x.split('_')[-1].split('.')[0]))
    checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
    
    try:
        # Load the checkpoint
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        print(f"Resuming training from epoch {start_epoch} using checkpoint {checkpoint_path}")
        return start_epoch
    except KeyError:
        print(f"Error loading checkpoint. The file {checkpoint_path} might not be in the expected format.")
        raise
    except RuntimeError as e:
        print(f"RuntimeError while loading checkpoint: {e}")
        raise

# Example usage
start_epoch = load_checkpoint(model, optimizer, "checkpoints")

  checkpoint = torch.load(checkpoint_path)


RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory

# Section 8: Training Loop with Checkpoint Saving

In [23]:

from transformers import get_scheduler
from torch.nn.utils import clip_grad_norm_
import torch
from tqdm import tqdm

def train_model(
    model, train_loader, val_loader, optimizer, device, initial_lr=5e-5, patience=3, max_epochs=100
):
    # Freeze the first N encoder layers initially
    freeze_layers_until = 6  
    for name, param in model.named_parameters():
        
        parts = name.split(".")
        if len(parts) > 2 and parts[0] == "model" and parts[1] == "encoder" and parts[2].isdigit():
            layer_number = int(parts[2])
            if layer_number < freeze_layers_until:
                param.requires_grad = False

    # Scheduler setup
    total_steps = len(train_loader) * max_epochs
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # self-adjusting learning rate
    best_val_loss = float('inf')
    epochs_without_improvement = 0

    
    scaler = torch.cuda.amp.GradScaler()

    for epoch in range(max_epochs):
        model.train()
        print(f"Epoch {epoch + 1}/{max_epochs}")
        progress_bar = tqdm(train_loader, desc="Training", leave=True)

        for batch in progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}

            
            with torch.cuda.amp.autocast():
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["labels"]
                )
                loss = outputs.loss

            
            scaler.scale(loss).backward()

            # Gradient clipping
            clip_grad_norm_(model.parameters(), max_norm=1.0)

            # Optimizer step
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

            progress_bar.set_postfix(loss=loss.item())

        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["labels"]
                )
                val_loss += outputs.loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation Loss after Epoch {epoch + 1}: {avg_val_loss}")

        
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_without_improvement = 0
            torch.save(model.state_dict(), f"best_model_epoch_{epoch + 1}.pt")
            print(f"Saved new best model with validation loss {best_val_loss:.4f}")
        else:
            epochs_without_improvement += 1
            print(f"No improvement for {epochs_without_improvement} epoch(s).")

        
        if epochs_without_improvement >= patience:
            
            current_lr = optimizer.param_groups[0]['lr']
            new_lr = current_lr * 0.5
            for param_group in optimizer.param_groups:
                param_group['lr'] = new_lr
            print(f"Reducing learning rate to {new_lr:.6f}")
            epochs_without_improvement = 0

        # Unfreeze one additional layer after every 10 epochs
        if epoch % 10 == 0 and freeze_layers_until > 0:
            freeze_layers_until -= 1
            for name, param in model.named_parameters():
                parts = name.split(".")
                if len(parts) > 2 and parts[0] == "model" and parts[1] == "encoder" and parts[2].isdigit():
                    layer_number = int(parts[2])
                    if layer_number < freeze_layers_until:
                        param.requires_grad = True
            print(f"Unfrozen an additional layer. Now training layers >= {freeze_layers_until}.")

        # Early stopping
        if avg_val_loss > best_val_loss and epochs_without_improvement >= patience * 2:
            print("Stopping early due to lack of improvement.")
            break


In [24]:
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    initial_lr=5e-5,
    patience=5,
    max_epochs=100
)

  scaler = torch.cuda.amp.GradScaler()


Epoch 1/100


  with torch.cuda.amp.autocast():
Training:   1%|          | 49/5297 [00:05<10:36,  8.25it/s, loss=12.9] 


KeyboardInterrupt: 

# Section 9: Prediction Function

In [None]:
def predict_random_title(data,model,tokenizer,device):
    random_sample = data.sample(1).iloc[0]
    abstract = random_sample['abstract']
    actual_title = random_sample['title']
    inputs = tokenizer(abstract, max_length=128, padding='max_length', truncation=True, return_tensors="pt").to(device)
    model.eval()
    with torch.no_grad():
        predicted_title = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=128, num_beams=5, early_stopping=True)
    predicted_title = tokenizer.decode(predicted_title[0], skip_special_tokens=True)
    # Print the results for comparison
    print(f"Abstract:\n{abstract}\n")
    print(f"Actual Title:\n{actual_title}\n")
    print(f"Predicted Title:\n{predicted_title}\n")

# Test the model with a random abstract from the dataset
predict_random_title(data, model, tokenizer, device)

Abstract:
A newly disclosed exchange demonstrated the potential for conflicts to have arisen as part of Huma Abedin’s status as a special government employee.

Actual Title:
Emails Show Hillary Clinton Adviser Sidestepping Potential Conflict

Predicted Title:
A newly disclosed exchange demonstrated the potential for conflicts to have arisen as part of Huma Abedin’s status as a special government employee.

