In [1]:
import os
import torch
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import pipeline
from datasets import load_dataset
import evaluate
from tqdm import tqdm
import textstat

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ----------------------------
# PREPROCESSING FUNCTIONS
# ----------------------------
def preprocess_texts(texts):
    return [" ".join(t.strip().split()) for t in texts]

In [14]:
# ----------------------------
# CONFIGURATION & MODELS
# ----------------------------
GRAMMAR_MODELS = [
    "google/flan-t5-small",
    "facebook/bart-base"
]
STYLE_MODELS = [
    # "google/flan-t5-base",
    "sshleifer/distilbart-cnn-12-6",
    "rajistics/informal_formal_style_transfer"
]

DEVICE = 0 if torch.cuda.is_available() else -1
BATCH_SIZE = 8
PIPELINE_MAX_LENGTH = 256
DATASET_SUBSET_LENGTH = 300

In [4]:
# ----------------------------
# LOAD STYLE DATASET (WikiAuto-Manual)
# ----------------------------
dataset = load_dataset("chaojiang06/wiki_auto", "manual")

# Randomly sample 100 items from the test split for evaluation
test_split_full = dataset["test"]
test_split = test_split_full.shuffle(seed=42).select(range(min(DATASET_SUBSET_LENGTH, len(test_split_full))))

if 'normal_sentence' in test_split.column_names and 'simple_sentence' in test_split.column_names:
    sources = test_split['normal_sentence']
    references = [[r] for r in test_split['simple_sentence']]
else:
    raise KeyError(f"Expected 'normal_sentence' and 'simple_sentence' in dataset columns: {test_split.column_names}")

In [20]:
len(sources)

300

In [15]:
# ----------------------------
# BUILD INFERENCE PIPELINES
# ----------------------------
# grammar_pipes = {m: pipeline("text2text-generation", model=m, device=DEVICE) for m in GRAMMAR_MODELS}
# style_pipes   = {m: pipeline("text2text-generation", model=m, device=DEVICE) for m in STYLE_MODELS}

grammar_pipes = {
    m: pipeline(
        "text2text-generation",
        model=m,
        device=DEVICE,
        do_sample=False,
        num_beams=4,
        max_length=PIPELINE_MAX_LENGTH
    )
    for m in GRAMMAR_MODELS
}

style_pipes = {
    m: pipeline(
        "text2text-generation",
        model=m,
        device=DEVICE,
        do_sample=True,
        temperature=0.9,
        top_p=0.8,
        num_beams=4,
        max_length=PIPELINE_MAX_LENGTH / 2
    )
    for m in STYLE_MODELS
}

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cuda:0


In [19]:
# ----------------------------
# LOAD METRICS
# ----------------------------
bleu       = evaluate.load("bleu")
bertscore  = evaluate.load("bertscore")
perplexity = evaluate.load("perplexity", module_type="metric")
sari       = evaluate.load("sari")

In [25]:
def run_pipe_with_prefix(
    pipe,
    texts: list[str],
    model_name: str,
    is_grammar: bool,
    batch_size: int = BATCH_SIZE,
    max_len: int = PIPELINE_MAX_LENGTH,
):
    
    # Determine model family → choose prefixes
    t5 = "t5" in model_name.lower()
    if t5:
        gram_pref  = "Fix all grammar and spelling errors> "
        style_pref = "Paraphase and Simplify> "
    else:
        gram_pref  = "Fix all grammar and spelling errors> "
        style_pref = "Paraphase and Simplify> "

    prefix = gram_pref if is_grammar else style_pref
    strip_prefixes = (gram_pref, style_pref, "Paraphrase: ", "Grammar correction: ")

    outputs_all = []
    
    for i in tqdm(
        range(0, len(texts), batch_size),
        desc=f"{'Grammar' if is_grammar else 'Style'} pass ({model_name})",
        leave=False,
    ):
        chunk = texts[i : i + batch_size]

        # 1) add prefix
        inp_with_pref = [prefix + txt for txt in chunk]

        # 2) pipeline call
        outs = pipe(
            inp_with_pref,
            max_length=max_len,
            batch_size=batch_size,
        )

        # 3) decode & strip prefixes
        for out in outs:
            decoded = out["generated_text"].strip()
            for p in strip_prefixes:
                if decoded.startswith(p):
                    decoded = decoded[len(p):].lstrip()
            outputs_all.append(decoded)

    return outputs_all


In [None]:
def build_eval_df(
    df_out: pd.DataFrame,
    text_column: str,
    src_clean,
    references
) -> pd.DataFrame:
    
    results = []

    # iterate over every system column (skip original + score columns)
    for col in df_out.columns:
        if col == text_column or col.startswith("score_"):
            continue

        preds = df_out[col].astype(str).tolist()

        # ---- metrics -------------------------------------------------------
        sari_score = sari.compute(
            sources=src_clean,
            predictions=preds,
            references=references
        )["sari"]

        bleu_score = bleu.compute(
            predictions=preds,
            references=references
        )["bleu"]

        bert_res  = bertscore.compute(
            predictions=preds,
            references=[r[0] for r in references],
            lang="en"
        )
        bert_f1   = float(np.mean(bert_res["f1"]))

        fkgl_vals = [textstat.flesch_kincaid_grade(p) for p in preds]
        fre_vals  = [textstat.flesch_reading_ease(p) for p in preds]
        fkgl      = float(np.mean(fkgl_vals))
        fre       = float(np.mean(fre_vals))

        ppl_vals  = perplexity.compute(
            model_id="gpt2",
            predictions=preds
        ).get("perplexities")
        ppl       = float(np.mean(ppl_vals))

        # split column name back into grammar / style
        gm, sm = col.split("__")

        results.append({
            "grammar_model": gm,
            "style_model":   sm,
            "sari":  sari_score,
            "bleu":  bleu_score,
            "bert_f1": bert_f1,
            "fkgl":  fkgl,
            "flesch": fre,
            "perplexity": ppl
        })

    return pd.DataFrame(results).sort_values("sari", ascending=False).reset_index(drop=True)


In [23]:
# ----------------------------
# EVALUATION FUNCTIONS
# ----------------------------
def evaluate_models(sources, references):
    # Preprocess and correct grammar
    text_column = "sentences"
    clean_texts = preprocess_texts(sources)
    corrected_cache = {}
    df_out      = pd.DataFrame({text_column: clean_texts})

    # --- Grammar stage ----------------------------------------------------------
    for gm in GRAMMAR_MODELS:
        print(f"Grammar pass – {gm}")
        g_pipe  = grammar_pipes[gm]
        g_fixed = run_pipe_with_prefix(
            g_pipe, clean_texts, gm, is_grammar=True
        )
        corrected_cache[gm] = preprocess_texts(g_fixed)
    print("Grammar done!")

    # --- Style stage ------------------------------------------------------------
    for gm in GRAMMAR_MODELS:
        g_clean = corrected_cache[gm]
        for sm in STYLE_MODELS:
            col_name = f"{gm.split('/')[-1]}__{sm.split('/')[-1]}"
            print(f"Style pass – {gm} → {sm}")
            s_pipe = style_pipes[sm]
            preds  = run_pipe_with_prefix(
                s_pipe, g_clean, sm, is_grammar=False
            )
            df_out[col_name] = preds
    print("Styling done!")

    return build_eval_df(df_out, text_column, clean_texts, references)

In [None]:
# ----------------------------
# RUN EVALUATION
# ----------------------------
df = evaluate_models(sources, references)

Grammar pass – google/flan-t5-small


                                                                                    

Grammar pass – facebook/bart-base


                                                                                  

Grammar done!
Style pass – google/flan-t5-small → sshleifer/distilbart-cnn-12-6


                                                                                           

Style pass – google/flan-t5-small → rajistics/informal_formal_style_transfer


                                                                                                      

Style pass – facebook/bart-base → sshleifer/distilbart-cnn-12-6


Style pass (sshleifer/distilbart-cnn-12-6):  84%|████████▍ | 32/38 [05:22<01:10, 11.78s/it]

In [12]:
# Sort and display
df = df.sort_values('sari', ascending=False).reset_index(drop=True)
print(df)

   grammar_model          style_model       sari      bleu   bert_f1  \
0  flan-t5-small  distilbart-cnn-12-6  35.846313  0.004268  0.827771   
1      bart-base  distilbart-cnn-12-6  35.844019  0.002804  0.827132   
2      bart-base      bart-paraphrase  34.876300  0.000000  0.841546   
3  flan-t5-small      bart-paraphrase  34.846586  0.000000  0.841755   

        fkgl     flesch  perplexity  
0   9.382333  57.733933   43.264453  
1   9.524333  57.252900   43.018335  
2  12.062333  47.044800  108.447141  
3  11.978333  47.335033  111.588177  


In [13]:
# ----------------------------
# SAVE RESULTS
# ----------------------------
csv_path = os.path.join(os.getcwd(), 'pipeline_evaluation_results_huggingface.csv')
df.to_csv(csv_path, index=False)
print(f"Results saved to {csv_path}")

Results saved to c:\Users\jonat\Documents\UTS\2025 Autumn Sem\42850 NLP\Assignment 3\NLPAssignment\Jonathan Workspace\pipeline_evaluation_results_huggingface.csv


In [None]:
# ----------------------------
# PLOT RESULTS
# ----------------------------
pivot_sari = df.pivot(index='grammar_model', columns='style_model', values='sari')
pivot_bleu = df.pivot(index='grammar_model', columns='style_model', values='bleu')
labels = pivot_sari.index.tolist()
x = np.arange(len(labels))
width = 0.8 / len(STYLE_MODELS)

plt.figure(figsize=(8,5))
for i, sm in enumerate(pivot_sari.columns):
    plt.bar(x + i*width, pivot_sari[sm], width, label=sm)
plt.xticks(x + width*(len(STYLE_MODELS)-1)/2, labels, rotation=45)
plt.ylabel('SARI')
plt.title('SARI by Pipeline Combo')
plt.legend(title='Style Model')
plt.tight_layout()
plt.show()

plt.figure(figsize=(8,5))
for i, sm in enumerate(pivot_bleu.columns):
    plt.bar(x + i*width, pivot_bleu[sm], width, label=sm)
plt.xticks(x + width*(len(STYLE_MODELS)-1)/2, labels, rotation=45)
plt.ylabel('BLEU')
plt.title('BLEU by Pipeline Combo')
plt.legend(title='Style Model')
plt.tight_layout()
plt.show()

In [17]:
# ----------------------------
# HUMAN‑EVAL CSV GENERATOR
# ----------------------------
import pandas as pd

def generate_human_eval_csv(input_csv: str, text_column: str, output_csv: str):
    """Read a CSV, run every Grammar→Style combo on each row, and write a new CSV
    with side‑by‑side outputs ready for human scoring.

    Parameters
    ----------
    input_csv : str
        Path to the source CSV that contains raw presentation sentences.
    text_column : str
        Column name that holds the text to be rewritten.
    output_csv : str
        Destination path for the human‑evaluation CSV.
    """
    df_in = pd.read_csv(input_csv)
    if text_column not in df_in.columns:
        raise KeyError(f"Column '{text_column}' not found in {input_csv}.")

    raw_texts = df_in[text_column].astype(str).tolist()
    # Pre-clean once
    clean_texts = preprocess_texts(raw_texts)
    df_out      = pd.DataFrame({text_column: raw_texts})
    corrected_cache = {}

    # --- Grammar stage ----------------------------------------------------------
    for gm in GRAMMAR_MODELS:
        print(f"Grammar pass – {gm}")
        g_pipe  = grammar_pipes[gm]
        g_fixed = run_pipe_with_prefix(
            g_pipe, clean_texts, gm, is_grammar=True
        )
        corrected_cache[gm] = preprocess_texts(g_fixed)

    # --- Style stage ------------------------------------------------------------
    for gm in GRAMMAR_MODELS:
        g_clean = corrected_cache[gm]
        for sm in STYLE_MODELS:
            col_name = f"{gm.split('/')[-1]}__{sm.split('/')[-1]}"
            print(f"Style pass – {gm} → {sm}")
            s_pipe = style_pipes[sm]
            preds  = run_pipe_with_prefix(
                s_pipe, g_clean, sm, is_grammar=False
            )
            df_out[col_name] = preds

    # Optional blank columns for human scores
    # for gm in GRAMMAR_MODELS:
    #     for sm in STYLE_MODELS:
    #         df_out[f"score_{gm}__{sm}"] = ""  # empty cell to fill manually

    df_out.to_csv(output_csv, index=False)
    print(f"Human‑eval CSV saved to {output_csv}")


In [18]:
generate_human_eval_csv('verbose_samples.csv', 'sentence', 'human_eval_outputs10.csv')

Grammar pass – google/flan-t5-small


                                                                                  

Grammar pass – facebook/bart-base


                                                                                

Style pass – google/flan-t5-small → sshleifer/distilbart-cnn-12-6


                                                                                         

Style pass – google/flan-t5-small → rajistics/informal_formal_style_transfer


                                                                                                    

Style pass – facebook/bart-base → sshleifer/distilbart-cnn-12-6


                                                                                         

Style pass – facebook/bart-base → rajistics/informal_formal_style_transfer


                                                                                                    

Human‑eval CSV saved to human_eval_outputs10.csv




In [None]:
def prepare_human_eval(input_csv: str, output_csv: str, mapping_csv: str):
    """
    Randomize columns A–D per row and prepare a CSV for blind human evaluation.

    Params:
    - input_csv: CSV with columns: sentence, A, B, C, D
    - output_csv: randomized file for human raters
    - mapping_csv: stores the randomized column origin per row (for decoding later)
    """
    df = pd.read_csv(input_csv)
    cols = df.columns.tolist()
    if len(cols) < 5:
        raise ValueError(f"Input CSV must have at least 5 columns, found {len(cols)}: {cols}")
    sentence_col = cols[0]
    model_cols = cols[1:5]
    
    randomized_rows = []
    mappings = []

    for _, row in df.iterrows():
        options = [row[c] for c in model_cols]
        zipped = list(zip(model_cols, options))
        random.shuffle(zipped)

        labels, shuffled_outputs = zip(*zipped)
        randomized_rows.append({
            'sentence': row[sentence_col],
            'option_1': shuffled_outputs[0],
            'option_2': shuffled_outputs[1],
            'option_3': shuffled_outputs[2],
            'option_4': shuffled_outputs[3],
            'chosen': ''  # to be filled by rater (1–4)
        })

        mappings.append({
            'row_id': len(mappings),
            'option_1': labels[0],
            'option_2': labels[1],
            'option_3': labels[2],
            'option_4': labels[3]
        })

    pd.DataFrame(randomized_rows).to_csv(output_csv, index=False)
    pd.DataFrame(mappings).to_csv(mapping_csv, index=False)
    print(f"Human evaluation CSV saved to: {output_csv}")
    print(f"Mapping CSV saved to: {mapping_csv}")

In [None]:
def analyze_human_eval(eval_csv: str, mapping_csv: str):
    """
    Read the human evaluation and determine which model (A/B/C/D) was preferred most.

    Params:
    - eval_csv: the CSV with human ratings (option_1, ..., chosen)
    - mapping_csv: the shuffled mapping created earlier
    """
    df_eval = pd.read_csv(eval_csv)
    df_map = pd.read_csv(mapping_csv)

    vote_counter = {'A': 0, 'B': 0, 'C': 0, 'D': 0}
    total_votes = 0

    for idx, row in df_eval.iterrows():
        chosen = row.get('chosen')
        if pd.isna(chosen) or str(chosen).strip() not in {'1', '2', '3', '4'}:
            continue  # skip unscored rows

        chosen_idx = int(chosen.strip())
        col_origin = df_map.loc[idx, f'option_{chosen_idx}']
        vote_counter[col_origin] += 1
        total_votes += 1

    print("Human Evaluation Results (model preference counts):")
    for model, count in vote_counter.items():
        pct = (count / total_votes * 100) if total_votes else 0
        print(f"  {model}: {count} votes ({pct:.1f}%)")
    
    best = max(vote_counter, key=vote_counter.get)
    print(f"\nBest performing model (by human preference): {best}")

In [None]:
# Step 1: Create blinded evaluation sheet
prepare_human_eval("results.csv", "for_human_rating.csv", "shuffle_map.csv")

# Step 2 (after rating): Analyze winner
analyze_human_eval("for_human_rating.csv", "shuffle_map.csv")