# BART and Whisper for Audio Summarization

## A. Load BART

In [2]:
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')


In [3]:
import pandas as pd
import os

datasets = '../data/dataset'
files_list = os.listdir(datasets)

print(files_list)

['dataset_20240704_150822.csv', 'dataset_20240704_151424.csv', 'dataset_20240704_151901.csv', 'dataset_20240704_152357.csv', 'dataset_20240704_152812.csv', 'dataset_20240704_154639.csv', 'dataset_20240704_155436.csv', 'dataset_20240704_160014.csv', 'dataset_20240704_160759.csv', 'dataset_20240704_161256.csv', 'dataset_20240704_162030.csv', 'dataset_20240704_164120.csv', 'dataset_20240704_165808.csv', 'dataset_20240704_172021.csv', 'dataset_20240704_173948.csv', 'dataset_20240704_175152.csv', 'dataset_20240704_180117.csv', 'dataset_20240704_181213.csv', 'dataset_20240704_182558.csv', 'dataset_20240704_184053.csv', 'dataset_20240704_185236.csv', 'dataset_20240704_190449.csv', 'dataset_20240704_191929.csv', 'dataset_20240704_192809.csv', 'dataset_20240704_193055.csv', 'dataset_20240704_193834.csv', 'dataset_20240704_194959.csv', 'dataset_20240704_200424.csv', 'dataset_20240704_201639.csv', 'dataset_20240704_202421.csv', 'dataset_20240704_203553.csv', 'dataset_20240704_203820.csv', 'datase

In [4]:
data = pd.DataFrame()

for file in files_list:
    filepath = os.path.join(datasets, file)
    df = pd.read_csv(filepath)
    data = pd.concat([data, df], ignore_index=True)

data.insert(0,'id','')
data['id'] = data.audio_path.str[-10:]
data.head(5)

Unnamed: 0,id,audio_path,transcript,summary
0,000000.mp3,../data/audio\000000.mp3,"\n\tON the north-east coast of Scotland, in th...",\n \n\tThe history of the family of the...
1,000001.mp3,../data/audio\000001.mp3,\n\tALLEYN was no where to be found. The Earl ...,\n \n\tThere is an attack and an impend...
2,000002.mp3,../data/audio\000002.mp3,\n\tTHE Count was walking on the ramparts of t...,\n \n\tMalcolm reveals an important sec...
3,000003.mp3,../data/audio\000003.mp3,\n\tMEANWHILE the Earl remained a solitary pri...,\n \n\tMatilda falls into despair over ...
4,000004.mp3,../data/audio\000004.mp3,"\n\tMARY, in the mean time, suffered all the t...",\n \n\tIdentities are revealed and the ...


In [5]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [6]:
from datasets import Dataset

dataset = Dataset.from_pandas(data)

def preprocess_function(examples):
    inputs = [
        f"Summarize the following conversation.\n\n### Input:\n{transcript}\n\n### Summary:\n"
        for transcript in examples['transcript']
    ]
    
    targets = examples['summary']
    model_inputs = tokenizer(inputs, padding='max_length', max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, padding='max_length', max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

Map:   0%|          | 0/390 [00:00<?, ? examples/s]



In [8]:
example = tokenized_dataset[0]
input_text = tokenizer.decode(example['input_ids'], skip_special_tokens=True)
summary_text = tokenizer.decode(example['labels'], skip_special_tokens=True)

print("Training Prompt Example:")
print("Input Text:\n", input_text)

Training Prompt Example:
Input Text:
 Summarize the following conversation.

### Input:

	ON the north-east coast of Scotland, in the most romantic part of the Highlands, stood the Castle of Athlin; an edifice built on the summit of a rock whose base was in the sea. This pile was venerable from its antiquity, and from its Gothic structure; but more venerable from the virtues which it enclosed. It was the residence of the still beautiful widow, and the children of the noble Earl of Athlin, who was slain by the hand of Malcolm, a neighbouring chief, proud, oppressive, revengeful; and still residing in all the pomp of feudal greatness, within a few miles of the castle of Athlin. Encroachment on the domain of Athlin, was the occasion of the animosity which subsisted between the chiefs. Frequent broils had happened between their clans, in which that of Athlin had generally been victorious. Malcolm, whose pride was touched by the defeat of his people; whose ambition was curbed by the authori

In [9]:
import torch

def generate_summary(example):
    input_ids = example['input_ids']
    input_ids = torch.tensor(input_ids).unsqueeze(0)
    generated_ids = model.generate(input_ids, num_beams=4, max_length=150, early_stopping=True)
    generated_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_summary

In [10]:
example = tokenized_dataset[0]

generated_summary = generate_summary(example)
print("Generated Summary:\n", generated_summary)

Generated Summary:
 The story is based on the life of Matilda, the widow of the Earl of Athlin. The Earl was slain by the hand of Malcolm, a neighbouring chief, proud, oppressive, revengeful. Matilda had withdrawn from the public eye, into this ancient seat of feudal government. She devoted herself to the education of her children.


In [11]:
dash_line = "----"*15

print(dash_line)
print("HUMAN SUMMARY:")
print(data['summary'][0])

print(dash_line)
print("MODEL SUMMARY:")
print(generated_summary)

------------------------------------------------------------
HUMAN SUMMARY:

        
	The history of the family of the Earl of Athlin and the ongoing feud with Malcolm is introduced.      
------------------------------------------------------------
MODEL SUMMARY:
The story is based on the life of Matilda, the widow of the Earl of Athlin. The Earl was slain by the hand of Malcolm, a neighbouring chief, proud, oppressive, revengeful. Matilda had withdrawn from the public eye, into this ancient seat of feudal government. She devoted herself to the education of her children.


In [12]:
import evaluate 

rouge = evaluate.load('rouge')

In [13]:
summary = data['summary'][0]

if len(generated_summary) > len(summary):
    generated_summary = generated_summary[:len(summary)]
else:
    summary = summary[:len(generated_summary)]

model_results = rouge.compute(
    predictions=generated_summary,
    references=summary,
    use_aggregator=True,
    use_stemmer=True,
)

print("----"*20)
print("BART MODEL:")
print(model_results)

--------------------------------------------------------------------------------
BART MODEL:
{'rouge1': 0.04424778761061947, 'rouge2': 0.0, 'rougeL': 0.04424778761061947, 'rougeLsum': 0.04424778761061947}


In [14]:
generated_summaries = []

for example in tokenized_dataset:
    generated_summary = generate_summary(example)
    generated_summaries.append(generated_summary)


KeyboardInterrupt: 

In [15]:
for idx, summary in enumerate(generated_summaries[:5]):
    print(f"Generated Summary {idx + 1}:")
    print(summary)
    print()

output_file = "generated_summaries.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    for summary in generated_summaries:
        f.write(summary + "\n")

Generated Summary 1:
The story is based on the life of Matilda, the widow of the Earl of Athlin. The Earl was slain by the hand of Malcolm, a neighbouring chief, proud, oppressive, revengeful. Matilda had withdrawn from the public eye, into this ancient seat of feudal government. She devoted herself to the education of her children.

Generated Summary 2:
The Earl was on his way to the castle when he saw two people cross the platform. He called to them, but at the sound of his voice they quickened their pace and disappeared in the darkness of the ramparts. After he had stood some time surveying the rampart, he heard the low restrained voice of a person unknown, but the distance prevented his distinguishing the subject of the conversation. He drew his sword, and watched in silence their motions. They continued to advance, till, suddenly stopping, they turned, and took a long survey of the fabric.

Generated Summary 3:
The Count was walking on the ramparts of the castle, involved in thoug

## WHISPER

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
import whisper

def get_audio_transcript(directory, file_name):
    model = whisper.load_model("base")
    audio = os.path.join(directory, file_name)
    result = model.transcribe(audio)

    transcript = result["text"]

    return transcript

def tokenize_transcript(transcript):
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    inputs = tokenizer("Summarize the following conversation.\n\n### Input:\n" + transcript + "\n\n### Summary:\n", return_tensors="pt", max_length=1024, truncation=True)
    
    return inputs

def generate_summary(inputs):
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
    generated_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=150, early_stopping=True)
    generated_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
    return generated_summary

def process_audio_and_summarize(directory, file_name):

    transcript = get_audio_transcript(directory, file_name)

    inputs = tokenize_transcript(transcript)

    generated_summary = generate_summary(inputs)

    return generated_summary