In [1]:
import os
import re
import pandas as pd
import google.generativeai as genai
from tqdm import tqdm

In [2]:
# Gemini client configuration.
# The key is read from the API_KEY environment variable; os.environ.get yields
# None when the variable is unset.
# NOTE(review): confirm how genai.configure behaves when given a None key.
MODEL_NAME = "gemini-1.5-flash"

api_key = os.environ.get("API_KEY")
genai.configure(api_key=api_key)
model = genai.GenerativeModel(MODEL_NAME)

In [11]:
# Where the processed CSVs are read from, and where the scored copies are written.
input_dir = os.path.join("2024_dataset", "results")
# The scored files live in a "scored" sub-folder of the input directory.
output_dir = os.path.join(input_dir, "scored")
# Create the output folder up front; an already-existing folder is not an error.
os.makedirs(output_dir, exist_ok=True)

# A standalone number: not glued to a word character or a preceding dot, and not
# the prefix of a longer dotted number (so "1" is never carved out of "1.5" or "10").
_SCORE_NUMBER = r"(?<![\w.])(\d+(?:\.\d+)?)(?![\w]|\.\d)"
# A number that appears shortly after the word "score" (e.g. "Similarity Score: 0.4").
_LABELLED_SCORE_RE = re.compile(r"score\D{0,40}?" + _SCORE_NUMBER, re.IGNORECASE)
# Fallback: any standalone number in the text.
_ANY_NUMBER_RE = re.compile(_SCORE_NUMBER)


def extract_similarity_score_and_rationale(response_text):
    """Pull a similarity score in [0, 1] out of a free-text model reply.

    The reply is searched in two passes:

    1. numbers that follow the word "score" (case-insensitive), so that an
       explicit "Similarity Score: 0.2" wins over unrelated digits such as a
       leading list marker "1." or the phrase "sentence 1";
    2. otherwise, any standalone number in the text.

    In both passes only values between 0 and 1 inclusive are accepted, so
    tokens like "1.5" or "2024" are never returned as a score.

    Args:
        response_text: Raw text of the model reply. ``None`` or an empty
            string is treated as a reply without a score.

    Returns:
        A ``(score, rationale)`` tuple. ``score`` is a float in [0, 1], or
        ``None`` when no acceptable number was found. ``rationale`` is the
        whole reply with surrounding whitespace stripped.
    """
    rationale = (response_text or "").strip()

    for pattern in (_LABELLED_SCORE_RE, _ANY_NUMBER_RE):
        for match in pattern.finditer(rationale):
            value = float(match.group(1))
            # Reject numbers outside the range the prompt asks for.
            if 0.0 <= value <= 1.0:
                return value, rationale

    return None, rationale

def get_similarity_score(original_caption, response, model, max_retries=3, retry_delay=1.0):
    """Ask the model how closely a generated response matches the gold caption.

    A prompt containing both texts is sent with temperature 0, and the reply is
    parsed with ``extract_similarity_score_and_rationale``. The request is
    repeated when it raises or when no score can be parsed from the reply.

    Args:
        original_caption: Gold-standard caption/answer inserted into the prompt.
        response: Generated response inserted into the prompt.
        model: Object exposing ``generate_content(prompt, generation_config=...)``
            whose result has a ``.text`` attribute.
        max_retries: Maximum number of attempts. Values below 1 mean no request
            is made at all.
        retry_delay: Base pause in seconds between failed attempts; the pause
            grows linearly with the attempt number. Use 0 to retry immediately.

    Returns:
        ``(score, rationale)`` on success, or ``(None, None)`` once every
        attempt has failed (including when ``max_retries`` is below 1).
    """
    # Local import keeps this block self-contained without touching the
    # notebook's import cell.
    import time

    prompt = f"""
    Answer the question based on the context.

    Context: I will give you two sentences. The first sentence will be an original caption/gold-standard answer. 
    The second sentence is the generated response. Please provide a similarity score between [0,1] inclusive of how closely the diagnoses in the two sentences precisely match, and also explain the reasoning behind the score.

    Original Caption: {original_caption}
    Response: {response}
    """

    for attempt in range(1, max_retries + 1):
        try:
            # Named `reply` so the `response` argument is not shadowed.
            reply = model.generate_content(prompt, generation_config={"temperature": 0})
            score, rationale = extract_similarity_score_and_rationale(reply.text.strip())
            if score is not None:
                return score, rationale
            print(f"Attempt {attempt}/{max_retries} failed: No valid score found in response.")
        except Exception as e:
            # Best effort: report the failure and move on to the next attempt.
            print(f"Attempt {attempt}/{max_retries} failed: {e}")

        # Pause before the next attempt, but not after the final one.
        if attempt < max_retries and retry_delay > 0:
            time.sleep(retry_delay * attempt)

    # Reached when all attempts failed, and also when max_retries < 1, so the
    # caller can always unpack a 2-tuple.
    return None, None


In [12]:
# Score every row of each processed dataset and write a "scored_" copy.
datasets = ["processed_val_data.csv", "processed_test_data.csv"]

for dataset in datasets:
    input_file = os.path.join(input_dir, dataset)
    output_file = os.path.join(output_dir, f"scored_{dataset}")

    if os.path.exists(input_file):
        df = pd.read_csv(input_file)

        # One (score, rationale) pair per row, in row order.
        scored_pairs = [
            get_similarity_score(record["original_caption"], record["response"], model)
            for _, record in tqdm(df.iterrows(), total=len(df), desc=f"Scoring {dataset}")
        ]

        df["similarity_score"] = [pair[0] for pair in scored_pairs]
        df["rationale"] = [pair[1] for pair in scored_pairs]

        df.to_csv(output_file, index=False)
        print(f"Saved results with scores and rationales to: {output_file}")
    else:
        # Nothing to score for this dataset; carry on with the next one.
        print(f"Skipping {dataset}: File not found.")

Scoring processed_val_data.csv: 100%|██████████| 34/34 [01:02<00:00,  1.85s/it]


Saved results with scores and rationales to: 2024_dataset\results\scored\scored_processed_val_data.csv


Scoring processed_test_data.csv: 100%|██████████| 64/64 [02:00<00:00,  1.88s/it]

Saved results with scores and rationales to: 2024_dataset\results\scored\scored_processed_test_data.csv





In [13]:
# Print the first three scored rows of each dataset for a quick manual check.
scored_dir = os.path.join("2024_dataset", "results", "scored")

datasets = ["scored_processed_val_data.csv", "scored_processed_test_data.csv"]

# (label shown, CSV column) for the fields printed one per line.
preview_fields = [
    ("Query", "query"),
    ("Original Caption", "original_caption"),
    ("Response", "response"),
    ("Similarity Score", "similarity_score"),
]

for dataset in datasets:
    file_path = os.path.join(scored_dir, dataset)

    if os.path.exists(file_path):
        df = pd.read_csv(file_path)

        print(f"\nShowing first 3 rows of {dataset}:\n")

        for _, row in df.head(3).iterrows():
            for label, column in preview_fields:
                print(f"{label}: {row[column]}")
            # The rationale is followed by a blank line and a separator rule.
            print(f"Rationale: {row['rationale']}\n")
            print("-" * 80)
    else:
        print(f"File not found: {dataset}")


Showing first 3 rows of scored_processed_val_data.csv:

Query: Why do I have this raised horizontal skin/bump? on my forearm
Original Caption: Some group of people has a genetic predisposition to these lines. It is more common in the dominant arm. Research studies are limited on this condition but the more fat that you carry on your arms, the more likely creases are to form on your skin. If you don’t want this crease, then first lose some body fat to rule out that cause. If the crease still persists, then you’ll know that they’re a permanent fixture on your arms due to your genetics. If they’re not causing you any pain or if they don’t look too abnormal for your liking , then you don't need to worry about them
Response: Based on the image description, the raised horizontal skin bump you describe is likely not a bump at all, but rather a shallow scar or crease. The description mentions a "slightly depressed area," which contradicts the initial description of a raised bump.  Poor image 

In [14]:
# Report the total and mean similarity score per scored dataset.
scored_dir = os.path.join("2024_dataset", "results", "scored")

datasets = ["scored_processed_val_data.csv", "scored_processed_test_data.csv"]

for dataset in datasets:
    file_path = os.path.join(scored_dir, dataset)

    if not os.path.exists(file_path):
        print(f"File not found: {dataset}")
        continue

    # Rows without a score are excluded from both the total and the row count.
    df = pd.read_csv(file_path).dropna(subset=["similarity_score"])

    max_possible_score = len(df)
    total_score = df["similarity_score"].sum()

    # With no scored rows the average is reported as 0 rather than dividing by zero.
    if max_possible_score > 0:
        average_score = total_score / max_possible_score
    else:
        average_score = 0

    print(f"\nDataset: {dataset}")
    print(f"Total Similarity Score: {total_score:.2f} out of {max_possible_score}")
    print(f"Average Similarity Score: {average_score:.4f}\n")


Dataset: scored_processed_val_data.csv
Total Similarity Score: 10.65 out of 34
Average Similarity Score: 0.3132


Dataset: scored_processed_test_data.csv
Total Similarity Score: 24.95 out of 64
Average Similarity Score: 0.3898

