In [None]:
# Path to the source Excel workbook; every sheet it contains is processed.
file_path = "Large_Model_Questions_WITH_BERTSCORE_PRF_debertaxlargemnli.xlsx"

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter

In [None]:
# Load the gtr-t5-large sentence-embedding model ONCE; the same instance is
# reused for every sheet.
model = SentenceTransformer('sentence-transformers/gtr-t5-large')

# Read the sheet names inside a context manager so the workbook handle is
# released immediately instead of staying open for the life of the kernel.
# `xls` remains bound afterwards, but it is closed.
with pd.ExcelFile(file_path) as xls:
    sheet_names = xls.sheet_names

# Intermediate workbook: processed sheets are written here before any cell merging.
temp_path = 'temp_all_sheets.xlsx'
# Final workbook: saved after the merged-cell formatting has been applied.
output_path = "grouped_output_merged_ALLSHEETS_WITH_BERTSCORE_PRF_debertaxlargemnli.xlsx"

In [None]:
# -----------------------------
# Part 1: process + write all sheets
# -----------------------------
# Size of one row block: used for the repeating 1..6 Bloom's-level labels and
# for chunking each model's rows into groups that are averaged together.
# NOTE(review): assumes every model contributes exactly this many consecutive
# rows per prompt -- confirm against the input layout.
QUESTIONS_PER_GROUP = 6

# Output column -> source column averaged within each (group_id, Model) block.
# Insertion order determines the order of the appended columns.
GROUP_AVERAGES = {
    'avg_similarity_per_prompt': 'similarity',
    'Avg Grade Level': 'Grade level',
    'Avg Reading Ease': 'Reading Ease',
}

with pd.ExcelWriter(temp_path, engine="openpyxl") as writer:
    for sheet_name in sheet_names:
        # Load the sheet, strip stray whitespace from the headers, and
        # forward-fill Prompt so every row carries its prompt text.
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        df.columns = df.columns.str.strip()
        df['Prompt'] = df['Prompt'].ffill()

        # Embed both text columns with the shared model.
        prompt_embeddings = model.encode(df['Prompt'].tolist(), convert_to_tensor=True)
        question_embeddings = model.encode(df['Questions'].tolist(), convert_to_tensor=True)

        # Cosine similarity of each prompt with the question on the SAME row.
        # pairwise_cos_sim evaluates only those N matched pairs, rather than
        # building the full N x N matrix and keeping just its diagonal.
        cosine_similarities = util.pairwise_cos_sim(prompt_embeddings, question_embeddings).cpu().numpy()
        df['similarity'] = cosine_similarities.round(2)

        # Per-model mean similarity for this sheet, best model first.
        avg_similarity = df.groupby('Model')['similarity'].mean().sort_values(ascending=False)
        print(f"\n=== {sheet_name} ===")
        print(avg_similarity)

        # Work on a copy and label rows 1..6 repeating, purely by row position.
        df_ps1 = df.copy()
        df_ps1.insert(
            3,
            "Bloom's level",
            [(row % QUESTIONS_PER_GROUP) + 1 for row in range(len(df))],
        )

        # Temporary key: index of each block of consecutive rows within a model.
        df_ps1['group_id'] = df_ps1.groupby('Model').cumcount() // QUESTIONS_PER_GROUP

        # Broadcast each block's mean back onto all of its rows.
        for avg_column, source_column in GROUP_AVERAGES.items():
            df_ps1[avg_column] = (
                df_ps1.groupby(['group_id', 'Model'])[source_column].transform('mean')
            )

        # The helper key is not part of the output.
        df_ps1 = df_ps1.drop(columns=['group_id'])

        # Write processed sheet
        df_ps1.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# -----------------------------
# Part 2: reopen and merge cells in EACH sheet
# -----------------------------
wb = load_workbook(temp_path)

# (column header, rows per merged block) pairs applied to every sheet.
columns_to_merge = [
    ('Prompt', 48),
    ('Model', 6),
    ('avg_similarity_per_prompt', 6),
    ('Avg Grade Level', 6),
    ('Avg Reading Ease', 6),
]

for sheet_name in wb.sheetnames:
    ws = wb[sheet_name]
    last_row = ws.max_row

    # Header text -> 1-based column index, taken from the first row.
    header = {cell.value: position for position, cell in enumerate(ws[1], start=1)}

    for col_name, group_size in columns_to_merge:
        col_idx = header.get(col_name)
        if col_idx is None:
            # This sheet does not have the column; nothing to merge.
            continue

        # Step through the data rows (row 1 is the header) one block at a time;
        # the final block is clipped to the last populated row.
        for start_row in range(2, last_row + 1, group_size):
            end_row = min(start_row + group_size - 1, last_row)
            if end_row > start_row:  # a single-row block needs no merge
                ws.merge_cells(
                    start_row=start_row,
                    start_column=col_idx,
                    end_row=end_row,
                    end_column=col_idx,
                )

wb.save(output_path)
print(f"\nSaved merged workbook: {output_path}")