In [None]:
import pandas as pd
import re

# -----------------------------------------------------------------
# Input / output locations
# -----------------------------------------------------------------
excel_file = "Questions_generated.xlsx"
csv_file = "Model_Responses_NewModels.csv"
output_file = "Questions_generated_UPDATED.xlsx"

# -----------------------------------------------------------------
# Layout constants
# -----------------------------------------------------------------
# Rows are processed in fixed-size chunks: ROWS_PER_MODEL rows per model and
# MODELS_PER_PROMPT_EXCEL models per prompt in the Excel workbook.
ROWS_PER_MODEL = 6
MODELS_PER_PROMPT_EXCEL = 7
ROWS_PER_PROMPT_EXCEL = MODELS_PER_PROMPT_EXCEL * ROWS_PER_MODEL

# Position of each model's chunk inside one prompt's worth of CSV rows.
CSV_MODEL_ORDER = ["gpt-oss:latest", "granite4:latest", "mistral-small3.2:latest"]

# Excel model name -> model name (and CSV chunk) that replaces it.
EXCEL_REPLACEMENTS = dict(
    [
        ("granite3.2:2b", "granite4:latest"),
        ("mistral-small:22b", "mistral-small3.2:latest"),
    ]
)

# =============================
# Cleaning functions
# =============================

def fix_encoding(text):
    """Repair text whose UTF-8 bytes were decoded as Latin-1.

    The value is encoded back to Latin-1 bytes and those bytes are decoded as
    UTF-8. If either step fails, or the value has no ``encode`` method
    (e.g. NaN or None), the input is returned unchanged.
    """
    try:
        raw_bytes = text.encode("latin1")
        repaired = raw_bytes.decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError, AttributeError):
        # Not repairable this way; keep whatever we were given.
        return text
    return repaired

def clean_text(text: str) -> str:
    """Repair garbled characters and normalise spacing in one cell value.

    Steps, in order:
      1. Replace known mojibake sequences with the intended quote/dash.
      2. Replace every run of characters outside the allowed set with a space.
      3. Collapse whitespace runs to a single space and strip the ends.
      4. Remove whitespace directly before a curly apostrophe/single quote.
      5. Insert a space after ``: ; , . ! ?`` when a letter (or an opening
         curly double quote) follows and the punctuation is not itself
         preceded by a double quote.

    Args:
        text: The cell value to clean.

    Returns:
        The cleaned string. Values that are not strings (NaN/None as well as
        numbers or other objects read from a spreadsheet) are returned
        unchanged.
    """
    # Only strings can be cleaned. Missing values and e.g. numeric cells are
    # passed through instead of failing on ``.replace`` below.
    if not isinstance(text, str):
        return text

    # Step 1: known replacements. Dict order matters: the three-character
    # "â€“" / "â€”" sequences must be handled before the bare "â€" prefix.
    replacements = {
        "‚Äú": "“",
        "‚Äù": "”",
        "‚Äì": "–",
        "â€“": "–",
        "â€”": "—",
        "â€": "\"",
        "Äô": "’",
        "Äò": "‘",
    }
    for old, new in replacements.items():
        text = text.replace(old, new)

    # Step 2: replace remaining junk with space
    allowed_chars = r"A-Za-z0-9\s.,:;!?()\"'\-–—\u00A0-\u00FF\u2018\u2019\u201C\u201D"
    text = re.sub(f"[^{allowed_chars}]+", " ", text)

    # Step 3: collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    # Step 4: remove space before apostrophes (step 2 can leave a space where
    # a junk character sat in front of the apostrophe)
    text = re.sub(r"\s+’", "’", text)
    text = re.sub(r"\s+‘", "‘", text)

    # Step 5: add space after punctuation only if not immediately after a quote
    text = re.sub(r'(?<!["“”])([:;,.!?])([A-Za-z“])', r'\1 \2', text)

    return text

# =============================
# Load CSV and clean
# =============================
# The file is decoded as Latin-1; the question text is repaired further down
# with fix_encoding.
csv_df = pd.read_csv(csv_file, encoding="latin1")
csv_df.columns = csv_df.columns.str.strip().str.lower()

# Accept either spelling of the question column, preferring the plural form.
_question_candidates = [
    name for name in ("questions", "question") if name in csv_df.columns
]
if not _question_candidates:
    raise ValueError(f"No question column found in CSV: {csv_df.columns.tolist()}")
QUESTION_COL = _question_candidates[0]

# Use old method for CSV
csv_df[QUESTION_COL] = csv_df[QUESTION_COL].apply(fix_encoding)

# Position of the next unread CSV row; shared across all sheets.
csv_pointer = 0

# =============================
# Load Excel and clean
# =============================
# sheet_name=None loads every sheet as {sheet name: DataFrame}.
excel_sheets = pd.read_excel(excel_file, sheet_name=None)
updated_sheets = {}

# CSV rows consumed per prompt: one ROWS_PER_MODEL chunk for each CSV model.
csv_rows_per_prompt = ROWS_PER_MODEL * len(CSV_MODEL_ORDER)

for sheet_name, sheet_df in excel_sheets.items():
    sheet_df.columns = sheet_df.columns.str.strip().str.lower()

    required = {"prompt", "model", QUESTION_COL}
    if not required.issubset(sheet_df.columns):
        raise ValueError(
            f"Sheet {sheet_name} missing columns. Found: {sheet_df.columns.tolist()}"
        )

    # Clean Excel questions
    sheet_df[QUESTION_COL] = sheet_df[QUESTION_COL].apply(clean_text)

    sheet_df = sheet_df.reset_index(drop=True)
    updated_rows = []

    num_prompts, leftover_rows = divmod(len(sheet_df), ROWS_PER_PROMPT_EXCEL)

    # Without a single complete prompt block there is nothing to build;
    # fail with a readable message instead of pd.concat's
    # "No objects to concatenate".
    if num_prompts == 0:
        raise ValueError(
            f"Sheet {sheet_name} has {len(sheet_df)} rows; at least "
            f"{ROWS_PER_PROMPT_EXCEL} are needed for one prompt block."
        )

    # Rows past the last complete block are not processed; say so rather
    # than dropping them silently.
    if leftover_rows:
        print(
            f"WARNING: sheet {sheet_name}: the last {leftover_rows} row(s) do not "
            f"form a complete prompt block and are not written to the output."
        )

    for p in range(num_prompts):
        excel_start = p * ROWS_PER_PROMPT_EXCEL
        excel_end = excel_start + ROWS_PER_PROMPT_EXCEL
        prompt_block = sheet_df.iloc[excel_start:excel_end].copy()

        # The first row's prompt is applied to every row of the block.
        prompt_value = prompt_block.iloc[0]["prompt"]

        # -----------------------------
        # Pull CSV rows for this prompt
        # -----------------------------
        csv_prompt_block = csv_df.iloc[
            csv_pointer : csv_pointer + csv_rows_per_prompt
        ].copy()

        # iloc slicing past the end returns a short (or empty) frame without
        # raising, so check explicitly that the CSV still had enough rows.
        if len(csv_prompt_block) < csv_rows_per_prompt:
            raise ValueError(
                f"CSV ran out of rows at sheet {sheet_name}, prompt block {p + 1}: "
                f"needed {csv_rows_per_prompt} rows starting at position "
                f"{csv_pointer}, found {len(csv_prompt_block)}."
            )
        csv_pointer += csv_rows_per_prompt

        # Split the CSV chunk by position: the i-th ROWS_PER_MODEL rows belong
        # to the i-th model of CSV_MODEL_ORDER.
        csv_blocks = {
            model: csv_prompt_block.iloc[
                i * ROWS_PER_MODEL : (i + 1) * ROWS_PER_MODEL
            ][QUESTION_COL].values
            for i, model in enumerate(CSV_MODEL_ORDER)
        }

        # -----------------------------
        # Walk Excel models IN ORDER
        # -----------------------------
        for row_start in range(0, len(prompt_block), ROWS_PER_MODEL):
            model_rows = prompt_block.iloc[
                row_start : row_start + ROWS_PER_MODEL
            ].copy()
            # The model name is taken from the first row of each chunk.
            model = model_rows.iloc[0]["model"]

            model_rows["prompt"] = prompt_value

            if model in EXCEL_REPLACEMENTS:
                # Swap in the replacement model's name and its CSV questions.
                new_model = EXCEL_REPLACEMENTS[model]
                model_rows["model"] = new_model
                model_rows[QUESTION_COL] = csv_blocks[new_model]
            # else: Excel questions already cleaned

            updated_rows.append(model_rows)

        # -----------------------------
        # Append gpt-oss:latest LAST
        # -----------------------------
        gpt_rows = pd.DataFrame({
            "prompt": [prompt_value] * ROWS_PER_MODEL,
            "model": ["gpt-oss:latest"] * ROWS_PER_MODEL,
            QUESTION_COL: csv_blocks["gpt-oss:latest"]
        })

        updated_rows.append(gpt_rows)

    # -----------------------------
    # Finalize sheet
    # -----------------------------
    final_sheet = pd.concat(updated_rows, ignore_index=True)
    # Renumber rows from 1 across the rebuilt sheet.
    final_sheet["index"] = range(1, len(final_sheet) + 1)

    final_sheet = final_sheet.rename(columns={
        "index": "Index",
        "prompt": "Prompt",
        "model": "Model",
        QUESTION_COL: "Questions",
    })

    # Keep only the four output columns, in this order.
    final_sheet = final_sheet[["Index", "Prompt", "Model", "Questions"]]
    updated_sheets[sheet_name] = final_sheet

# =============================
# Write output
# =============================
# Each rebuilt sheet goes to a worksheet of the same name; the DataFrame's own
# index is not written.
with pd.ExcelWriter(output_file, engine="openpyxl") as xlsx_writer:
    for name, frame in updated_sheets.items():
        frame.to_excel(xlsx_writer, sheet_name=name, index=False)

print(f"DONE — Excel and CSV cleaned, merged output written to: {output_file}")


In [None]:
from openpyxl import load_workbook

ROWS_PER_PROMPT_FINAL = 48  # 8 models × 6 rows

wb = load_workbook("Questions_generated_UPDATED.xlsx")

for ws in wb.worksheets:
    last_row = ws.max_row

    # Prompt column is column B (Index=A, Prompt=B)
    prompt_col = 2

    # Row 1 is the header, so blocks start at row 2 and repeat every
    # ROWS_PER_PROMPT_FINAL rows; the final block is clipped to the last row.
    for block_start in range(2, last_row + 1, ROWS_PER_PROMPT_FINAL):
        block_end = min(block_start + ROWS_PER_PROMPT_FINAL - 1, last_row)
        ws.merge_cells(
            start_row=block_start,
            start_column=prompt_col,
            end_row=block_end,
            end_column=prompt_col,
        )

wb.save("Questions_generated_UPDATED.xlsx")
print("Prompt cells merged successfully")
