In [4]:
import pandas as pd

# Load the CSV file
data = pd.read_csv('test_videodata_description.csv')

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Load the tokenizer and model
t5_tokenizer = T5Tokenizer.from_pretrained('./T5_Trained')
t5_model = T5ForConditionalGeneration.from_pretrained('./T5_Trained').to('cuda')

# Define a custom Dataset class
class VideoDescriptionDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        inputs = self.tokenizer(
            row['generated_description'], max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt'
        )
        return {key: val.squeeze(0) for key, val in inputs.items()}

# Initialize dataset and dataloader
t5_dataset = VideoDescriptionDataset(data, t5_tokenizer)
t5_dataloader = DataLoader(t5_dataset, batch_size=4, shuffle=False)

# Generate summaries
t5_model.eval()
t5_summaries = []

for batch in tqdm(t5_dataloader, desc="Generating T5 Summaries"):
    input_ids = batch['input_ids'].to('cuda')
    attention_mask = batch['attention_mask'].to('cuda')
    with torch.no_grad():
        outputs = t5_model.generate(input_ids, attention_mask=attention_mask)
    t5_summaries.extend(t5_tokenizer.batch_decode(outputs, skip_special_tokens=True))

# Add the summaries to the dataframe
data['t5_summary'] = t5_summaries

Generating T5 Summaries: 100%|██████████| 25/25 [00:13<00:00,  1.86it/s]


In [11]:
import torch
import gc

def clear_gpu_memory():
    torch.cuda.empty_cache()
    gc.collect()
import gc

def clear_ram_memory():
    gc.collect()

In [12]:
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
from tqdm import tqdm

# Load the tokenizer and model
bart_tokenizer = BartTokenizer.from_pretrained('./saved_model')
bart_model = BartForConditionalGeneration.from_pretrained('./saved_model').to('cuda')

# Define the generate_summary function with batch processing, mixed precision, and clearing cache
def generate_summary(texts, batch_size=2):
    bart_model.eval()
    summaries = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating Summaries"):
        batch_texts = texts[i:i + batch_size]
        inputs = bart_tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True).to('cuda')
        torch.cuda.empty_cache()  # Clear the cache
        with torch.no_grad():
            with torch.cuda.amp.autocast():
                outputs = bart_model.generate(**inputs)
        batch_summaries = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True)
        summaries.extend(batch_summaries)
    return summaries

# Generate summaries in smaller batches with mixed precision and cache clearing
bart_summaries = generate_summary(data['generated_description'].tolist(), batch_size=2)

# Add the summaries to the dataframe
data['bart_summary'] = bart_summaries

# Save the results to a new CSV file
data.to_csv('summarized_videodata.csv', index=False)

Generating Summaries: 100%|██████████| 50/50 [00:27<00:00,  1.79it/s]


In [15]:
# Remove the "generated_description" column
data = data.drop(columns=['generated_description'])

# Save the results to a new CSV file
data.to_csv('model_comparison.csv', index=False)

In [16]:
data

Unnamed: 0,video_id,description,t5_summary,bart_summary
0,1006792915,Process of serving margarita cocktail. bartend...,Close up of a glass of vodka.,Close-up shot of bartender pouring cocktail in...
1,1007017096,Dolly shot of cook at the kitchen cutting mush...,Cute sourdough sourdough on a wooden table,Close-up shot of chef cutting mushrooms on a w...
2,1007829319,Small boy feeding pigeons on the street,Little boy in red jacket and red pants is feed...,Little boy feeding pigeons in the park
3,1008414787,Aerial view beachfront destination usa picnic ...,Aerial view of a small island in the mediterra...,Aerial view of a small island in the middle of...
4,1010428679,"Aerial view dubrovnik old town in dalmatia, cr...","Aerial view of kosteli town, croatia","Aerial view of the old town of kosteler, greece."
...,...,...,...,...
95,8096140,Moose in autumn - sweden - rutting season,White moose in the forest,Mountain moose in the forest
96,8800768,Hong kong - circa october 2014: day light city...,"Hong hong kong - june 15, 2019: hong kong city...","Hong kong, thailand - september 16, 2018: hong"
97,9138830,Circa 1960s - the japanese navy is welcomed to...,Aerial view of a ship docking in a port,Circa 1940s - the first world war was held in ...
98,9256856,"San pablo city, laguna, philippines - january ...",Aerial view of people dancing in a red and yel...,"Shanghai, china - september 16, 2018: people i..."


#### Model Scoring

In [17]:
# Set display options to show all content without clipping
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [18]:
data

Unnamed: 0,video_id,description,t5_summary,bart_summary
0,1006792915,Process of serving margarita cocktail. bartender pouring alcohol drink into a glass close-up.,Close up of a glass of vodka.,Close-up shot of bartender pouring cocktail into glass
1,1007017096,Dolly shot of cook at the kitchen cutting mushrooms,Cute sourdough sourdough on a wooden table,Close-up shot of chef cutting mushrooms on a wooden cutting board
2,1007829319,Small boy feeding pigeons on the street,Little boy in red jacket and red pants is feeding pigeons in the park.,Little boy feeding pigeons in the park
3,1008414787,"Aerial view beachfront destination usa picnic island blvd, tampa, fl 33616. indian rocks beach is a city in pinellas county, florida, united states.",Aerial view of a small island in the mediterranean sea.,Aerial view of a small island in the middle of the sea
4,1010428679,"Aerial view dubrovnik old town in dalmatia, croatia - prominent travel destination of croatia. dubrovnik old town was listed as unesco world heritage sites in 1979.","Aerial view of kosteli town, croatia","Aerial view of the old town of kosteler, greece."
5,1011212219,Bat-eared fox resting on ground.,Kangaroo with white eyes and brown fur,Kangaroo in the wild
6,1011773219,"Side view of a giant sci-fi interplanetary spaceship flying on neptune background, 3d animation. texture of planet was created in graphic editor without photos.",Spacecraft flying over a planet,Space station in space
7,1012764110,Bartender is stirring cocktails on the bar,Bartender making a cocktail at the bar.,Bartender pouring a cocktail in a glass
8,1013564174,Business people working hd animation,Close up of young young man talking to classmates in a classroom,Businessman talking to a woman at the office
9,1017093517,Felodipine - male doctor with mobile phone opens and touches hologram active ingrident of medicine,Doctor using a smartphone and a tablet.,Doctor using a tablet in a hospital


BLEU Score

The BLEU (Bilingual Evaluation Understudy) score measures the quality of machine-generated translations or text summaries by comparing n-grams (word sequences) in the generated text to those in reference texts, with higher scores indicating greater similarity and relevance.

In [19]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Initialize lists to store BLEU scores
t5_bleu_scores = []
bart_bleu_scores = []

# Iterate through each row in the DataFrame
for index, row in data.iterrows():
    reference = row['description'].split()  # Tokenize the reference description
    t5_summary = row['t5_summary'].split()  # Tokenize the T5 summary
    bart_summary = row['bart_summary'].split()  # Tokenize the BART summary

    # Calculate BLEU score for T5 summary
    t5_bleu = sentence_bleu([reference], t5_summary)
    t5_bleu_scores.append(t5_bleu)

    # Calculate BLEU score for BART summary
    bart_bleu = sentence_bleu([reference], bart_summary)
    bart_bleu_scores.append(bart_bleu)

# Add BLEU scores to the DataFrame
data['t5_bleu'] = t5_bleu_scores
data['bart_bleu'] = bart_bleu_scores

# Calculate average BLEU scores
average_t5_bleu = sum(t5_bleu_scores) / len(t5_bleu_scores)
average_bart_bleu = sum(bart_bleu_scores) / len(bart_bleu_scores)

# Print average BLEU scores
print(f'Average BLEU score for T5 model: {average_t5_bleu:.4f}')
print(f'Average BLEU score for BART model: {average_bart_bleu:.4f}')

# Save the DataFrame with BLEU scores to a new CSV file
output_file_path = 'output_with_bleu_scores.csv'
data.to_csv(output_file_path, index=False)

print("BLEU scores calculated and saved to", output_file_path)

Average BLEU score for T5 model: 0.0068
Average BLEU score for BART model: 0.0000
BLEU scores calculated and saved to output_with_bleu_scores.csv


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


ROUGE Score

ROUGE scores evaluate the quality of generated summaries by measuring the overlap of n-grams (words or sequences of words) between the generated and reference summaries, with ROUGE-1 focusing on unigrams, ROUGE-2 on bigrams, and ROUGE-3 on trigrams.

In [20]:
import pandas as pd
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Initialize lists to store ROUGE scores
t5_rouge_scores = []
bart_rouge_scores = []

# Iterate through each row in the DataFrame
for index, row in data.iterrows():
    reference = row['description']  # Reference description
    t5_summary = row['t5_summary']  # T5 summary
    bart_summary = row['bart_summary']  # BART summary

    # Calculate ROUGE score for T5 summary
    t5_rouge = scorer.score(reference, t5_summary)
    t5_rouge_scores.append(t5_rouge)

    # Calculate ROUGE score for BART summary
    bart_rouge = scorer.score(reference, bart_summary)
    bart_rouge_scores.append(bart_rouge)

# Calculate average ROUGE scores
def average_rouge_scores(rouge_scores, metric):
    precision = sum(score[metric].precision for score in rouge_scores) / len(rouge_scores)
    recall = sum(score[metric].recall for score in rouge_scores) / len(rouge_scores)
    fmeasure = sum(score[metric].fmeasure for score in rouge_scores) / len(rouge_scores)
    return precision, recall, fmeasure

average_t5_rouge1 = average_rouge_scores(t5_rouge_scores, 'rouge1')
average_t5_rouge2 = average_rouge_scores(t5_rouge_scores, 'rouge2')
average_t5_rougeL = average_rouge_scores(t5_rouge_scores, 'rougeL')

average_bart_rouge1 = average_rouge_scores(bart_rouge_scores, 'rouge1')
average_bart_rouge2 = average_rouge_scores(bart_rouge_scores, 'rouge2')
average_bart_rougeL = average_rouge_scores(bart_rouge_scores, 'rougeL')

# Print average ROUGE scores
print('Average ROUGE scores for T5 model:')
print(f'ROUGE-1: Precision={average_t5_rouge1[0]:.4f}, Recall={average_t5_rouge1[1]:.4f}, F-measure={average_t5_rouge1[2]:.4f}')
print(f'ROUGE-2: Precision={average_t5_rouge2[0]:.4f}, Recall={average_t5_rouge2[1]:.4f}, F-measure={average_t5_rouge2[2]:.4f}')
print(f'ROUGE-L: Precision={average_t5_rougeL[0]:.4f}, Recall={average_t5_rougeL[1]:.4f}, F-measure={average_t5_rougeL[2]:.4f}')

print('Average ROUGE scores for BART model:')
print(f'ROUGE-1: Precision={average_bart_rouge1[0]:.4f}, Recall={average_bart_rouge1[1]:.4f}, F-measure={average_bart_rouge1[2]:.4f}')
print(f'ROUGE-2: Precision={average_bart_rouge2[0]:.4f}, Recall={average_bart_rouge2[1]:.4f}, F-measure={average_bart_rouge2[2]:.4f}')
print(f'ROUGE-L: Precision={average_bart_rougeL[0]:.4f}, Recall={average_bart_rougeL[1]:.4f}, F-measure={average_bart_rougeL[2]:.4f}')


Average ROUGE scores for T5 model:
ROUGE-1: Precision=0.3132, Recall=0.1964, F-measure=0.2234
ROUGE-2: Precision=0.0871, Recall=0.0589, F-measure=0.0644
ROUGE-L: Precision=0.2712, Recall=0.1722, F-measure=0.1944
Average ROUGE scores for BART model:
ROUGE-1: Precision=0.3093, Recall=0.2086, F-measure=0.2300
ROUGE-2: Precision=0.0728, Recall=0.0528, F-measure=0.0557
ROUGE-L: Precision=0.2678, Recall=0.1840, F-measure=0.2002


In [22]:
import pandas as pd
from bert_score import score

# Convert DataFrame columns to lists
references = data['description'].tolist()
t5_summaries = data['t5_summary'].tolist()
bart_summaries = data['bart_summary'].tolist()

# Calculate BERTScore for T5 summaries
P_t5, R_t5, F1_t5 = score(t5_summaries, references, lang='en', verbose=True)

# Calculate BERTScore for BART summaries
P_bart, R_bart, F1_bart = score(bart_summaries, references, lang='en', verbose=True)

# Print average scores
print("T5 Summaries BERTScore:")
print("Precision:", P_t5.mean().item())
print("Recall:", R_t5.mean().item())
print("F1 Score:", F1_t5.mean().item())

print("\nBART Summaries BERTScore:")
print("Precision:", P_bart.mean().item())
print("Recall:", R_bart.mean().item())
print("F1 Score:", F1_bart.mean().item())




tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 10.74 seconds, 9.31 sentences/sec


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 12.77 seconds, 7.83 sentences/sec
T5 Summaries BERTScore:
Precision: 0.8828864097595215
Recall: 0.8506925106048584
F1 Score: 0.8661977648735046

BART Summaries BERTScore:
Precision: 0.8861337900161743
Recall: 0.8550518155097961
F1 Score: 0.8699721693992615
