In [36]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import pipeline
from datasets import load_dataset
import evaluate
from tqdm import tqdm
import textstat

In [37]:
# ----------------------------
# PREPROCESSING FUNCTIONS
# ----------------------------
def preprocess_texts(texts):
    """Normalise whitespace in every string of ``texts``.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space.

    Parameters
    ----------
    texts : iterable of str
        Strings to clean.

    Returns
    -------
    list of str
        One cleaned string per input, in the same order.
    """
    normalised = []
    for raw in texts:
        words = raw.strip().split()
        normalised.append(" ".join(words))
    return normalised

def add_control_token(texts, style_model, token="<simplify>",
                      token_models=("google/flan-t5-base",)):
    """Prefix each text with a control token when the style model expects one.

    Parameters
    ----------
    texts : list of str
        Inputs about to be sent to the style model.
    style_model : str
        Name of the style model that will receive ``texts``.
    token : str, optional
        Control token placed (followed by one space) in front of every text.
    token_models : collection of str, optional
        Style-model names that should receive the control token. Defaults to
        the single model that was previously hard-coded, so existing calls
        behave exactly as before.

    Returns
    -------
    list of str
        A new list of prefixed texts when ``style_model`` is in
        ``token_models``; otherwise ``texts`` itself, unchanged.
    """
    if style_model in token_models:
        return [f"{token} {t}" for t in texts]
    return texts

In [49]:
# ----------------------------
# CONFIGURATION & MODELS
# ----------------------------
# Model names tried for the first (grammar-correction) stage of the pipeline.
GRAMMAR_MODELS = [
    "google/flan-t5-small",
    "facebook/bart-base"
]
# Model names tried for the second (style-simplification) stage.
STYLE_MODELS = [
    "google/flan-t5-base",
    "sshleifer/distilbart-cnn-12-6"
]

# Device index handed to pipeline(): 0 when CUDA is available, otherwise -1.
DEVICE = 0 if torch.cuda.is_available() else -1
# Number of texts sent to a pipeline per call (also passed as batch_size).
BATCH_SIZE = 8
# Passed as max_length to every pipeline call.
PIPELINE_MAX_LENGTH = 256
# Upper bound on how many test-split examples are evaluated.
DATASET_SUBSET_LENGTH = 1000

In [50]:
# ----------------------------
# LOAD STYLE DATASET (WikiAuto-Manual)
# ----------------------------
# Load the "manual" configuration of WikiAuto.
dataset = load_dataset("chaojiang06/wiki_auto", "manual")

# Shuffle the test split with a fixed seed and keep at most
# DATASET_SUBSET_LENGTH examples (all of them if the split is smaller).
test_split_full = dataset["test"]
test_split = (
    test_split_full
    .shuffle(seed=42)
    .select(range(min(DATASET_SUBSET_LENGTH, len(test_split_full))))
)

# Fail early when the expected text columns are missing.
if 'normal_sentence' not in test_split.column_names or 'simple_sentence' not in test_split.column_names:
    raise KeyError(f"Expected 'normal_sentence' and 'simple_sentence' in dataset columns: {test_split.column_names}")

# NOTE(review): the BLEU scores recorded further down are below 0.01, which is
# unusually low. Presumably some pairs in this configuration are not true
# simplification pairs -- check whether the split carries an alignment label
# that should be filtered on before trusting the reference-based metrics.
sources = test_split['normal_sentence']
# Each reference is wrapped in its own list: one reference per source.
references = [[simple] for simple in test_split['simple_sentence']]

In [51]:
# Sanity check: how many source sentences were selected for evaluation.
len(sources)

1000

In [52]:
# ----------------------------
# BUILD INFERENCE PIPELINES
# ----------------------------
# One text2text-generation pipeline per model name, keyed by that name so the
# evaluation code can look a pipeline up with the same string it reports.
grammar_pipes = {
    model_name: pipeline("text2text-generation", model=model_name, device=DEVICE)
    for model_name in GRAMMAR_MODELS
}
style_pipes = {
    model_name: pipeline("text2text-generation", model=model_name, device=DEVICE)
    for model_name in STYLE_MODELS
}

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


In [53]:
# ----------------------------
# LOAD METRICS
# ----------------------------
# Metric objects loaded once here; evaluate_combination() calls .compute() on each.
bleu       = evaluate.load("bleu")        # predictions vs. lists of references
bertscore  = evaluate.load("bertscore")   # predictions vs. a single reference each
# module_type="metric" selects the metric variant of "perplexity", which is
# called with predictions=... below.
perplexity = evaluate.load("perplexity", module_type="metric")
sari       = evaluate.load("sari")        # needs sources, predictions and references

In [54]:
# ----------------------------
# EVALUATION FUNCTIONS
# ----------------------------
def _generate_in_batches(pipe, texts, desc):
    """Run a text2text pipeline over ``texts`` in slices of BATCH_SIZE.

    Parameters
    ----------
    pipe : callable
        A pipeline taken from ``grammar_pipes`` or ``style_pipes``.
    texts : list of str
        Inputs to rewrite.
    desc : str
        Label shown on the tqdm progress bar.

    Returns
    -------
    list of str
        The ``'generated_text'`` of every pipeline output, in input order.
    """
    generated = []
    for start in tqdm(range(0, len(texts), BATCH_SIZE), desc=desc, leave=False):
        outs = pipe(texts[start:start + BATCH_SIZE],
                    max_length=PIPELINE_MAX_LENGTH,
                    # Ask the tokenizer to cut inputs that exceed the model's
                    # maximum input length instead of passing them through.
                    truncation=True,
                    batch_size=BATCH_SIZE)
        generated.extend(o['generated_text'] for o in outs)
    return generated


def evaluate_combination(gm, sm, sources, references):
    """Run one grammar-model -> style-model pipeline and score its outputs.

    Parameters
    ----------
    gm : str
        Key into ``grammar_pipes`` (grammar-correction model name).
    sm : str
        Key into ``style_pipes`` (style-simplification model name).
    sources : sequence of str
        Original sentences.
    references : sequence of list of str
        Reference simplifications, one list per source. Only the first
        reference of each list is used for BERTScore.

    Returns
    -------
    dict
        Keys ``grammar_model``, ``style_model``, ``sari``, ``bleu``,
        ``bert_f1`` (mean F1), ``fkgl`` and ``flesch`` (textstat readability,
        averaged over all outputs) and ``perplexity`` (mean GPT-2 perplexity
        of the non-blank outputs; NaN if every output is blank).

    Raises
    ------
    ValueError
        If ``sources`` is empty or its length differs from ``references``.
        Checked before any model runs so a bad call fails immediately rather
        than after minutes of generation.
    """
    if len(sources) == 0:
        raise ValueError("evaluate_combination() needs at least one source sentence.")
    if len(sources) != len(references):
        raise ValueError(
            f"sources and references must have the same length "
            f"(got {len(sources)} and {len(references)})."
        )

    # Normalise whitespace before anything is sent to a model.
    clean_src = preprocess_texts(sources)

    # Stage 1: grammar correction.
    g_texts = _generate_in_batches(grammar_pipes[gm], clean_src, f"Grammar {gm}")
    g_clean = preprocess_texts(g_texts)
    print("Grammar done!")

    # Stage 2: style simplification (control token added only where required).
    style_inputs = add_control_token(g_clean, sm)
    preds = _generate_in_batches(style_pipes[sm], style_inputs, f"Style {sm}")
    print("Styling done!")

    # Reference-based metrics.
    sari_score = sari.compute(sources=clean_src, predictions=preds, references=references)["sari"]
    bleu_score = bleu.compute(predictions=preds, references=references)["bleu"]
    bert_res   = bertscore.compute(predictions=preds, references=[r[0] for r in references], lang="en")
    bert_f1    = float(np.mean(bert_res['f1']))

    # Readability metrics via textstat, averaged over all outputs.
    fkgl = sum(textstat.flesch_kincaid_grade(p) for p in preds) / len(preds)
    fre  = sum(textstat.flesch_reading_ease(p) for p in preds) / len(preds)

    # Only non-blank outputs are scored for perplexity. NOTE(review): the
    # metric presumably rejects zero-token inputs, which would abort the run
    # after all generation is finished -- confirm against the installed
    # `evaluate` version.
    scorable = [p for p in preds if p.strip()]
    if scorable:
        ppl_res = perplexity.compute(model_id="gpt2", predictions=scorable)
        ppl = float(ppl_res['mean_perplexity'])
    else:
        ppl = float('nan')

    return {
        'grammar_model': gm,
        'style_model': sm,
        'sari': sari_score,
        'bleu': bleu_score,
        'bert_f1': bert_f1,
        'fkgl': fkgl,
        'flesch': fre,
        'perplexity': ppl
    }

In [55]:
def evaluate_all(sources, references):
    """Score every grammar-model / style-model pairing on the same data.

    Parameters
    ----------
    sources : sequence of str
        Original sentences, forwarded unchanged to ``evaluate_combination``.
    references : sequence of list of str
        Reference simplifications, forwarded unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per pairing, in ``GRAMMAR_MODELS`` x ``STYLE_MODELS`` order,
        built from the dicts returned by ``evaluate_combination``.
    """
    rows = [
        evaluate_combination(grammar_name, style_name, sources, references)
        for grammar_name in GRAMMAR_MODELS
        for style_name in STYLE_MODELS
    ]
    return pd.DataFrame(rows)

In [56]:
# ----------------------------
# RUN EVALUATION
# ----------------------------
# Evaluate every grammar -> style pairing; the result has one row per pairing.
df = evaluate_all(sources, references)

                                                                               

Grammar done!


                                                                            

Styling done!


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 63/63 [04:04<00:00,  3.88s/it]
                                                                               

Grammar done!


                                                                                      

Styling done!


100%|██████████| 63/63 [01:09<00:00,  1.10s/it]
                                                                             

Grammar done!


                                                                            

Styling done!


100%|██████████| 63/63 [02:45<00:00,  2.63s/it]
                                                                             

Grammar done!


                                                                                      

Styling done!


100%|██████████| 63/63 [01:10<00:00,  1.12s/it]


In [57]:
# Rank the pairings from highest to lowest SARI, renumber the rows from 0,
# and print the resulting table.
df = df.sort_values(by='sari', ascending=False, ignore_index=True)
print(df)

          grammar_model                    style_model       sari      bleu  \
0  google/flan-t5-small            google/flan-t5-base  43.714773  0.005583   
1  google/flan-t5-small  sshleifer/distilbart-cnn-12-6  42.296866  0.003587   
2    facebook/bart-base            google/flan-t5-base  40.245766  0.006873   
3    facebook/bart-base  sshleifer/distilbart-cnn-12-6  35.818627  0.004208   

    bert_f1     fkgl    flesch  perplexity  
0  0.832651  11.5476  42.58575  180.930183  
1  0.826500   7.8150  66.08709   27.124497  
2  0.825945  13.1190  27.05256  365.127896  
3  0.827489   9.3895  59.00918   34.519559  


In [58]:
# ----------------------------
# SAVE RESULTS
# ----------------------------
# Write the results table to pipeline_evaluation_results.csv in the current
# working directory, without the index column, and report the full path.
csv_path = os.path.join(os.getcwd(), 'pipeline_evaluation_results.csv')
df.to_csv(csv_path, index=False)
print(f"Results saved to {csv_path}")

Results saved to c:\Users\jonat\Documents\UTS\2025 Autumn Sem\42850 NLP\Assignment 3\NLPAssignment\Jonathan Workspace\pipeline_evaluation_results.csv


In [None]:
# ----------------------------
# PLOT RESULTS
# ----------------------------
def _plot_grouped_bars(pivot, ylabel):
    """Draw one grouped bar chart: a group per grammar model, a bar per style model.

    Parameters
    ----------
    pivot : pandas.DataFrame
        Metric values with grammar models as the index and style models as
        the columns.
    ylabel : str
        Metric name, used for the y-axis label and the chart title.
    """
    labels = pivot.index.tolist()
    x = np.arange(len(labels))
    # Size the bars from the columns actually being drawn, so the chart stays
    # aligned even if the table holds a different set of style models than
    # STYLE_MODELS.
    n_series = len(pivot.columns)
    width = 0.8 / n_series

    plt.figure(figsize=(8,5))
    for i, sm in enumerate(pivot.columns):
        plt.bar(x + i*width, pivot[sm], width, label=sm)
    # Centre each tick under its group of bars.
    plt.xticks(x + width*(n_series-1)/2, labels, rotation=45)
    plt.ylabel(ylabel)
    plt.title(f'{ylabel} by Pipeline Combo')
    plt.legend(title='Style Model')
    plt.tight_layout()
    plt.show()


# Reshape the results so rows are grammar models and columns are style models.
pivot_sari = df.pivot(index='grammar_model', columns='style_model', values='sari')
pivot_bleu = df.pivot(index='grammar_model', columns='style_model', values='bleu')

_plot_grouped_bars(pivot_sari, 'SARI')
_plot_grouped_bars(pivot_bleu, 'BLEU')

In [None]:
# ----------------------------
# HUMAN‑EVAL CSV GENERATOR
# ----------------------------
import pandas as pd

def generate_human_eval_csv(input_csv: str, text_column: str, output_csv: str):
    """Read a CSV, run every Grammar→Style combo on each row, and write a new CSV
    with side‑by‑side outputs ready for human scoring.

    Rows whose text cell is missing or blank are skipped: converting them with
    ``astype(str)`` would feed the literal text "nan" to the models and put
    meaningless rows in front of the human raters.

    The output CSV contains the text column, then one ``"{gm}__{sm}"`` column
    of rewritten text per combination, then one empty ``"score_{gm}__{sm}"``
    column per combination for manual scores.

    Parameters
    ----------
    input_csv : str
        Path to the source CSV that contains raw presentation sentences.
    text_column : str
        Column name that holds the text to be rewritten.
    output_csv : str
        Destination path for the human‑evaluation CSV.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``input_csv``.
    """
    df_in = pd.read_csv(input_csv)
    if text_column not in df_in.columns:
        raise KeyError(f"Column '{text_column}' not found in {input_csv}.")

    # Keep only rows that actually contain text.
    text_series = df_in[text_column].dropna().astype(str)
    text_series = text_series[text_series.str.strip() != ""]
    raw_texts = text_series.tolist()
    # Pre-clean once; every grammar model receives the same inputs.
    clean_texts = preprocess_texts(raw_texts)

    def _rewrite(pipe, texts, desc):
        """Run ``pipe`` over ``texts`` in BATCH_SIZE slices; return the generated strings."""
        generated = []
        for start in tqdm(range(0, len(texts), BATCH_SIZE), desc=desc, leave=False):
            outs = pipe(texts[start:start + BATCH_SIZE],
                        max_length=PIPELINE_MAX_LENGTH,
                        # Transcript rows can be long: let the tokenizer cut
                        # inputs that exceed the model's maximum input length.
                        truncation=True,
                        batch_size=BATCH_SIZE)
            generated.extend(o['generated_text'] for o in outs)
        return generated

    # Run each grammar model once and reuse its output for every style model.
    corrected_cache = {
        gm: preprocess_texts(_rewrite(grammar_pipes[gm], clean_texts, f"Grammar pass ({gm})"))
        for gm in GRAMMAR_MODELS
    }

    # Build the new DataFrame starting with the original text.
    df_out = pd.DataFrame({text_column: raw_texts})
    combos = [(gm, sm) for gm in GRAMMAR_MODELS for sm in STYLE_MODELS]

    # One output column per Grammar→Style combo.
    for gm, sm in combos:
        style_inputs = add_control_token(corrected_cache[gm], sm)
        df_out[f"{gm}__{sm}"] = _rewrite(style_pipes[sm], style_inputs,
                                         f"Style pass ({gm}->{sm})")

    # Blank score columns, placed after all output columns, to fill in manually.
    for gm, sm in combos:
        df_out[f"score_{gm}__{sm}"] = ""

    df_out.to_csv(output_csv, index=False)
    print(f"Human‑eval CSV saved to {output_csv}")


In [None]:
# Rewrite the 'sentence' column of input_transcripts.csv with every model
# combination and save the sheet for human scoring to human_eval_outputs.csv.
generate_human_eval_csv('input_transcripts.csv', 'sentence', 'human_eval_outputs.csv')