In [None]:
!pip install bert-score
!pip install moverscore
!pip install pycocoevalcap
!pip install pyemd

In [None]:
import os
import gc
import nltk
import pandas as pd
import torch
from bert_score import score as bert_score

# Compute BERTScore F1 between the reference captions ('title_multi_objects')
# and the generated captions ('generated_captions') of the annotations CSV,
# streaming the file chunk by chunk, then report per-pair scores and the
# dataset average.
nltk.download('wordnet')

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Per-pair results; only pairs that were actually scored are recorded so the
# three lists always stay aligned.
bert_f1_list = []
ref_texts = []
cand_texts = []

total_samples = 0      # number of scored pairs
skipped_samples = 0    # rows dropped because one side was NaN or blank
sum_bert_f1 = 0.0

chunksize = 1000  # Adjust chunksize based on available memory
csv_file = '/content/drive/MyDrive/updated_annotations.csv' # your CSV file which contains original and generated captions
count = 0

for chunk in pd.read_csv(csv_file, chunksize=chunksize):
    # pandas reads empty CSV fields as NaN (a float), which is not valid text
    # for the scorer. Keep only pairs where both sides are real, non-blank
    # text -- the same validity rule used by the notebook's CLIP data-loading
    # cell.
    chunk_refs = []
    chunk_cands = []
    for ref, cand in zip(chunk['title_multi_objects'].tolist(),
                         chunk['generated_captions'].tolist()):
        if pd.notna(ref) and pd.notna(cand) and str(ref).strip() and str(cand).strip():
            chunk_refs.append(str(ref).strip())
            chunk_cands.append(str(cand).strip())
        else:
            skipped_samples += 1

    # The raw chunk is no longer needed once the texts are extracted.
    del chunk
    gc.collect()

    # A chunk may contain no valid pair at all; do not call the scorer on
    # empty input.
    if chunk_refs:
        # Calculate BERTScore for the valid pairs of this chunk, on CUDA if available
        _, _, bert_f1_chunk = bert_score(
            chunk_cands,
            chunk_refs,
            lang="en",
            model_type="bert-base-uncased",
            device=device,
            batch_size=128  # Increase batch_size if memory permits
        )

        # Convert tensor to list and update cumulative values
        bert_f1_chunk = bert_f1_chunk.tolist()
        ref_texts.extend(chunk_refs)
        cand_texts.extend(chunk_cands)
        bert_f1_list.extend(bert_f1_chunk)
        sum_bert_f1 += sum(bert_f1_chunk)
        total_samples += len(bert_f1_chunk)

    print("Processed chunk:", count)
    count += 1

if skipped_samples:
    print("Skipped pairs with missing/empty text:", skipped_samples)

# Create a DataFrame from the accumulated lists
result_df = pd.DataFrame({
    'title_multi_objects': ref_texts,
    'generated_captions': cand_texts,
    'BERT-F1': bert_f1_list
})

print(result_df[['title_multi_objects', 'generated_captions', 'BERT-F1']].head())

# Calculate average BERT-F1 value; guard against a CSV with no scorable pair
# so the cell reports the situation instead of raising ZeroDivisionError.
if total_samples:
    avg_bert_f1 = sum_bert_f1 / total_samples
    print("Dataset Average Values:")
    print(f"  BERT-F1 : {avg_bert_f1:.4f}")
else:
    avg_bert_f1 = float("nan")
    print("No valid caption pairs found; average BERT-F1 is undefined.")

In [None]:
import os
import gc
import nltk
import pandas as pd
import torch
import random
from transformers import CLIPProcessor, CLIPModel

# Fetch the WordNet corpus up front (it is not referenced within this cell).
nltk.download('wordnet')

# Prefer the GPU whenever PyTorch can see one.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# CSV holding the original and the generated captions; it is read in slices
# of `chunksize` rows and the two text columns are accumulated in full.
csv_file = '/content/drive/MyDrive/updated_annotations.csv'
chunksize = 1000
ref_texts, cand_texts = [], []
count = 0

for chunk in pd.read_csv(csv_file, chunksize=chunksize):
    ref_texts += chunk['title_multi_objects'].tolist()
    cand_texts += chunk['generated_captions'].tolist()
    # Release the slice before the next one is read.
    del chunk
    gc.collect()
    print("Processed chunk:", count)
    count += 1

# Keep a pair only when both sides are non-NaN and non-blank; the surviving
# texts are stored as stripped strings.
valid_pairs = [
    (str(ref).strip(), str(cand).strip())
    for ref, cand in zip(ref_texts, cand_texts)
    if pd.notna(ref) and pd.notna(cand) and str(ref).strip() and str(cand).strip()
]
filtered_refs = [ref for ref, _cand in valid_pairs]
filtered_cands = [cand for _ref, cand in valid_pairs]

print("Number of valid pairs:", len(filtered_refs))

In [None]:
# Score each (reference, generated) caption pair by the cosine similarity of
# their CLIP *text* embeddings, then report the average over all pairs.
# Expects `filtered_refs`, `filtered_cands` and `device` to be defined already.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_processor.tokenizer.model_max_length = 128

# Allow captions of up to `new_max_length` tokens. If the pretrained text
# encoder has fewer positions (limit read from its config), its position table
# is enlarged: pretrained rows are kept, and every extra position is filled
# with a copy of the last pretrained row.
new_max_length = 128
old_max_length = clip_model.config.text_config.max_position_embeddings
hidden_size = clip_model.text_model.embeddings.position_embedding.weight.shape[1]
old_pos_embed = clip_model.text_model.embeddings.position_embedding.weight.data
if new_max_length > old_max_length:
    print(f"Extending positional embeddings from {old_max_length} to {new_max_length}")
    new_pos_embed = torch.zeros(new_max_length, hidden_size, device=old_pos_embed.device)
    new_pos_embed[:old_max_length, :] = old_pos_embed
    new_pos_embed[old_max_length:, :] = old_pos_embed[-1, :].unsqueeze(0).expand(new_max_length - old_max_length, hidden_size)
    new_embedding = torch.nn.Embedding(new_max_length, hidden_size).to(device)
    new_embedding.weight.data = new_pos_embed
    clip_model.text_model.embeddings.position_embedding = new_embedding
    clip_model.config.text_config.max_position_embeddings = new_max_length

batch_size = 32
clip_scores = []
num_batches = (len(filtered_refs) + batch_size - 1) // batch_size

for i in range(0, len(filtered_refs), batch_size):
    batch_refs = filtered_refs[i:i+batch_size]
    batch_cands = filtered_cands[i:i+batch_size]

    # Interleave the texts as [ref0, cand0, ref1, cand1, ...] so one forward
    # pass embeds both sides of every pair.
    texts = [text for pair in zip(batch_refs, batch_cands) for text in pair]
    inputs = clip_processor(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Pass explicit position ids covering the padded sequence length of this
    # batch, matching the (possibly enlarged) position table above.
    input_ids = inputs["input_ids"]
    position_ids = torch.arange(input_ids.shape[1], dtype=torch.long, device=device).unsqueeze(0).expand_as(input_ids)
    inputs["position_ids"] = position_ids
    with torch.no_grad():
        text_features = clip_model.get_text_features(**inputs)

    # Even rows are references, odd rows are candidates: score all pairs of
    # the batch in a single vectorized call instead of one `.item()` round
    # trip per pair.
    pair_scores = torch.nn.functional.cosine_similarity(
        text_features[0::2], text_features[1::2], dim=1
    )
    clip_scores.extend(pair_scores.tolist())
    print(f"Processed batch {i//batch_size + 1}/{num_batches}")

# Guard against an empty pair list so the cell reports it instead of raising
# ZeroDivisionError.
if clip_scores:
    avg_clip_score = sum(clip_scores) / len(clip_scores)
    print("Average CLIPScore: {:.4f}".format(avg_clip_score))
else:
    avg_clip_score = float("nan")
    print("No valid pairs to score; average CLIPScore is undefined.")