In [16]:
import torch

# Check if CUDA is available
print("CUDA Available: ", torch.cuda.is_available())

# Get the name of the CUDA device 
print("CUDA Device Name: ", torch.cuda.get_device_name(0))

# Set PyTorch to use the CUDA device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

CUDA Available:  True
CUDA Device Name:  NVIDIA A100 80GB PCIe
Using device: cuda


#### Step 1: Setup Environment and Install Dependencies

In [17]:
import re
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments

#### Step 2: Load and Prepare Data

In [3]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
dataset = load_dataset('ccdv/cnn_dailymail', '3.0.0')

Reusing dataset cnn_dailymail (/home/azureuser/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0107f7388b5c6fae455a5661bcd134fc22da53ea75852027040d8d1e997f101f)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Randomly sample 25% of each dataset split
dataset['train'] = dataset['train'].shuffle(seed=42).select(range(int(len(dataset['train']) * 0.25)))
dataset['validation'] = dataset['validation'].shuffle(seed=42).select(range(int(len(dataset['validation']) * 0.25)))
dataset['test'] = dataset['test'].shuffle(seed=42).select(range(int(len(dataset['test']) * 0.25)))

Loading cached shuffled indices for dataset at /home/azureuser/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0107f7388b5c6fae455a5661bcd134fc22da53ea75852027040d8d1e997f101f/cache-9e4a37197c27169c.arrow
Loading cached shuffled indices for dataset at /home/azureuser/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0107f7388b5c6fae455a5661bcd134fc22da53ea75852027040d8d1e997f101f/cache-bc79b6d764210737.arrow
Loading cached shuffled indices for dataset at /home/azureuser/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0107f7388b5c6fae455a5661bcd134fc22da53ea75852027040d8d1e997f101f/cache-5e4f6217fa60443c.arrow


In [18]:
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def handle_special_content(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Optional: Remove or substitute certain other non-textual elements if present
    return text

def segment_text(tokenized_text, max_length=1024):
    # This function assumes that the text has already been tokenized and is too long
    return [tokenized_text[i:i + max_length] for i in range(0, len(tokenized_text), max_length)]

In [9]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

In [19]:
def preprocess_and_tokenize(examples):
    # Clean and handle special content
    cleaned_articles = [clean_text(article) for article in examples['article']]
    cleaned_articles = [handle_special_content(article) for article in cleaned_articles]
    
    # Tokenize articles
    model_inputs = tokenizer(cleaned_articles, max_length=1024, truncation=True, padding="max_length")
    
    # Tokenize highlights
    cleaned_highlights = [clean_text(highlight) for highlight in examples['highlights']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(cleaned_highlights, max_length=1024, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
# Apply preprocessing and tokenization to the dataset
tokenized_datasets = dataset.map(preprocess_and_tokenize, batched=True)



  0%|          | 0/72 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

#### Step 4: Define Evaluation Metrics

In [12]:
from datasets import load_metric

rouge = load_metric("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode the predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Sometimes you might need to further process decoded_preds and decoded_labels to remove padding or unwanted tokens
    # Let's assume decoded_preds and decoded_labels are lists of strings (summaries)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]  # ROUGE expects a list of references for each prediction

    # Calculate ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    # Extract a few particular scores to return
    result = {key: value.mid.fmeasure for key, value in result.items()}  # mid.fmeasure gives the F1 score
    return result

#### Step 5: Training the Model

In [13]:
from transformers import BartForConditionalGeneration

# Loading a distilled version of BART
model = BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-6-6')

In [20]:
import torch

# Clear cache
torch.cuda.empty_cache()

In [16]:
import torch

# Check if GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up training arguments with mixed precision and possible gradient accumulation
training_args = TrainingArguments(
    output_dir='./results',                    # Output directory for model checkpoints
    num_train_epochs=3,                        # Number of training epochs
    per_device_train_batch_size=16,            # Batch size per device during training
    per_device_eval_batch_size=16,             # Batch size for evaluation
    warmup_steps=500,                          # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                         # Weight decay for regularization
    logging_dir='./logs',                      # Directory for storing logs
    logging_steps=50,
    evaluation_strategy="steps",               # Evaluate at the end of each epoch
    eval_steps=500,
    save_steps=500,
    save_strategy="steps",                     # Save the model at the end of each epoch
    fp16=True,                                 # Enable mixed precision training (requires NVIDIA GPU with Tensor Cores)
    gradient_accumulation_steps=16,             # Adjust based on your GPU memory capacity and batch size
    load_best_model_at_end=True,               # Load the best model found during training at the end
    metric_for_best_model='eval_loss',         # Use eval loss to determine the best model
    greater_is_better=False,                   # Lower eval loss is better
    save_total_limit=3
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics  # Make sure to define compute_metrics function if you're using it
)

# Clear any cached memory to maximize available GPU memory before training starts
torch.cuda.empty_cache()

# Start the training process
trainer.train()

# Optionally, you can save the model manually if needed
model.save_pretrained('./final_model_summarization')
tokenizer.save_pretrained('./final_model_summarization')

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: article, highlights, id.
***** Running training *****
  Num examples = 71778
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 16
  Total optimization steps = 420


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in ./final_model_summarization/config.json
Model weights saved in ./final_model_summarization/pytorch_model.bin
tokenizer config file saved in ./final_model_summarization/tokenizer_config.json
Special tokens file saved in ./final_model_summarization/special_tokens_map.json


('./final_model_summarization/tokenizer_config.json',
 './final_model_summarization/special_tokens_map.json',
 './final_model_summarization/vocab.json',
 './final_model_summarization/merges.txt',
 './final_model_summarization/added_tokens.json')

#### 

In [None]:
from azureml.core import Workspace
ws = Workspace.from_config()  # Or use .get() with explicit parameters
compute_target = ws.compute_targets['bkathuri2']
compute_target.stop(show_output=True)

In [1]:
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from datasets import load_metric, load_dataset

# Load the tokenizer
tokenizer = BartTokenizer.from_pretrained('./final_model_summarization')

# Load the model
model = BartForConditionalGeneration.from_pretrained('./final_model_summarization')


In [20]:
def generate_summary(text, tokenizer, model, device):
    # Preprocess and tokenize the text
    text = clean_text(text)
    text = handle_special_content(text)
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding="max_length")

    # Move tensors to the appropriate device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate summary with the model
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=200, early_stopping=True)

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

# Example text
example_text = "Insert your text here that you want to summarize."

# Generate the summary
summary = generate_summary(example_text, tokenizer, model, device)
print("Generated Summary:", summary)


Generated Summary: The Daily Discussion is a written version of each day's featured news stories. Use this weekly Newsquiz to test your knowledge of stories you saw on CNN.com. Today's Daily Discussion includes the weekly newsquiz. Click here to read your story. Back to the page you came from.


In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 71778
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 3342
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 2872
    })
})

In [22]:
import random

# Load the tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Load the dataset
dataset = load_dataset('ccdv/cnn_dailymail', '3.0.0')

def get_random_article(dataset, split='test'):
    # Randomly pick an article from the specified split
    random_index = random.randint(0, len(dataset[split]) - 1)
    article = dataset[split][random_index]['article']
    return article

def generate_summary(text, tokenizer, model, device):
    # Preprocess and tokenize the text
    text = clean_text(text)
    text = handle_special_content(text)
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding="max_length")

    # Move tensors to the appropriate device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate summary with the model
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=200, early_stopping=True)

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

# Pick a random article from the test split of the dataset
random_article = get_random_article(dataset)

# Assuming the model and device are already set up and loaded correctly
summary = generate_summary(random_article, tokenizer, model, device)
print("Generated Summary:", summary)


loading file https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json from cache at /home/azureuser/.cache/huggingface/transformers/4d8eeedc3498bc73a4b72411ebb3219209b305663632d77a6f16e60790b18038.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt from cache at /home/azureuser/.cache/huggingface/transformers/0ddddd3ca9e107b17a6901c92543692272af1c3238a8d7549fa937ba0057bbcf.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/facebook/bart-large-cnn/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/facebook/bart-large-cnn/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/facebook/bart-large-cnn/resolve/main/tokenizer_config.json from cache at None
loading file https://huggingface.co/facebook/bart-large-cnn/resolve/main/tokenizer.json from cache at /home/

  0%|          | 0/3 [00:00<?, ?it/s]

Generated Summary: Jenson Button failed to complete qualifying session for the Bahrain Grand Prix. The British driver was set to start from the back of the grid. McLaren chairman Ron Dennis said the engine is not broken. Button took to Twitter to update his fans following yet another car failure. He then gave his verdict as the race unfolded tweeting live updates.
