In [1]:
# %%capture
# %%script echo skipping
# Notebook environment setup: IPython extensions, pip dependencies and the
# Hugging Face CLI login. The two commented-out cell magics above are kept as
# the very first lines so they can be re-enabled in place.
%load_ext autoreload
# Load and invoke the dotenv extension — presumably this reads a local .env
# file that defines HUGGING_FACE_TOKEN (TODO confirm).
%load_ext dotenv
%dotenv
%pip install transformers evaluate
%pip install nltk absl-py rouge_score
# NOTE(review): `bleu` is listed in both of the following install lines.
%pip install bleu sacrebleu
%pip install bleu sacremoses
# The token comes from the HUGGING_FACE_TOKEN variable instead of being hard-coded.
!huggingface-cli login --token $HUGGING_FACE_TOKEN


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/izlobin/.cache/huggingface/token
Login successful


In [2]:
import os
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
)
from utils.metric import calculate_scores

### Loading model

In [3]:
# Checkpoint selection: exactly one `model_name` line is active; the others are
# alternatives that were tried.
# model_name = "grammarly/coedit-large"
model_name = "google/flan-t5-large"
# model_name = "google/gemma-2b-it"
# model_name = "google/gemma-7b-it"
# model_name = "google/gemma-7b"

# Filesystem-friendly form of the checkpoint name; later cells use it in the
# names of the files they write under samples/ and results/.
model_alias = model_name.replace("/", "_")

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_name)

# The active loader is the T5 seq2seq class; the commented block below is the
# causal-LM loader that goes with the commented-out gemma checkpoints.
model = T5ForConditionalGeneration.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.bfloat16,
#     # torch_dtype=torch.float16,
#     # revision="float16",
#     device_map="auto",
# )
model = model.to(device)

'Device: cuda'


In [4]:
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

# Report the model size and the current memory utilization, then cap how much
# GPU memory this process may claim.
total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

# `calculate_utilization` returns raw numbers (used for the arithmetic below);
# `format_utilization_narrow` returns the same keys pre-formatted for printing.
utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
# Inner keys use single quotes so the f-strings also parse on Python < 3.12.
print(
    f"total/used/cuda/res/ram(Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

actual_fraction = 0.95
available_memory = utilization["total_memory"] - utilization["memory_used"]
# Share of total memory that is currently free; printed for comparison only.
recommended_fraction = available_memory / utilization["total_memory"]
# The device cell falls back to CPU when CUDA is missing, so only touch the
# CUDA allocator when a GPU is actually available.
if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

print(
    f"Total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/"
    f"{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}"
)
print(f"Recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}")

# torch.cuda.empty_cache()
# torch.empty(utilization["total_memory"] // 2, dtype=torch.int8, device='cuda')
# print_utilization()

Total/trainable params: 783150080/783150080
total/used/cuda/res/ram(Gb): 10.00/4.30/3.06/3.06/12.29
Total/used/available memory (Gb): 10.00/4.30/5.70
Recommended/actual fraction: 0.57/0.95


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# reference = " Some even provided gateways, such as UFGATE, by which members could send / receive e-mail to and from the Internet via UUCP, and many FidoNet discussion groups were shared via gateway to Usenet."
# Single-prompt smoke test of the loaded model before any dataset is processed.
input_text = "Fix grammatical errors in this sentence: I does work."
# input_ids = tokenizer(input_text, add_special_tokens=False, return_tensors="pt")
# Tokenize and move the whole encoding to the model's device in one expression.
input_ids = tokenizer(input_text, return_tensors="pt").to(device)

outputs = model.generate(**input_ids, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

I does work.


### Datasets

In [6]:
# api = HfApi()
# coedit_info = api.dataset_info("grammarly/coedit")
# pprint(coedit_info)

# Load the CoEdIT dataset and list its splits and task labels.
grammarly_dataset = load_dataset("grammarly/coedit")
pprint(grammarly_dataset)

# Iterating the loaded DatasetDict yields its split names.
unique_categories = set(grammarly_dataset)
pprint(unique_categories)

# Distinct values of the "task" column in the train split.
unique_tasks = set(grammarly_dataset["train"]["task"])
pprint(unique_tasks)

DatasetDict({
    train: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 69071
    })
    validation: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 1712
    })
})
{'validation', 'train'}
{'coherence', 'paraphrase', 'gec', 'clarity', 'simplification', 'neutralize'}


In [7]:
def get_samples(dataset, category="validation", task="gec", num_samples=1, seed=42):
    """Return the first ``num_samples`` rows of one task from a shuffled split.

    Args:
        dataset: Mapping from split name to a dataset offering ``shuffle``,
            ``filter`` and ``select``.
        category: Name of the split to draw from.
        task: Value the row's "task" field must equal.
        num_samples: Number of leading rows to keep after filtering.
        seed: Seed handed to ``shuffle``.

    Returns:
        Whatever ``select`` returns for the first ``num_samples`` matching rows.
    """
    shuffled_split = dataset[category].shuffle(seed=seed)
    task_rows = shuffled_split.filter(lambda row: row["task"] == task)
    return task_rows.select(range(num_samples))

def print_samples(samples) -> None:
    """Print the task, instruction prefix, source text and target of each sample.

    Each item's "src" is expected to look like "<instruction>: <text>"; the
    part before the first ": " is shown next to the task and the remainder as
    the source. A "src" without that separator is printed whole as the source
    with an empty instruction instead of raising.

    Args:
        samples: Iterable of mappings with "task", "src" and "tgt" entries.
    """
    for item in samples:
        # partition() always returns three parts, so a missing separator cannot
        # break the unpacking the way split(": ", 1) would.
        prefix, separator, source = item["src"].partition(": ")
        if not separator:
            prefix, source = "", item["src"]
        print(f"[{item['task']}] {prefix}")
        print(f"src: {source}")
        print(f"tgt: {item['tgt']}")


# Preview two samples using get_samples' defaults (validation split, "gec" task).
print_samples(get_samples(grammarly_dataset, num_samples=2))

# Leftover sketch of a per-item generation step; not executed.
# input_ids = tokenizer(item["task"], return_tensors="pt").input_ids.to(device)
# outputs = model.generate(input_ids, max_length=256)
# corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
# return {"processed": corrected}


[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
[gec] Fix grammaticality in this sentence
src: Dear friends, I hope you should correctly but I can gives you some opinion, I guess that is a good idea if you go to a small schools, under you can met a lot on people and there are more closed friend of course you cannot like that opcion if you like the biggest once, so in that ways you can go from the other school.
tgt: Dear friend, I hope you choose correctly but I can give you my opinion. I guess that it's a good idea if you go to a small school, because you can meet a lot of people and make more close friends of course you won't like that option if you like the bigger one, so in that case you should go to the other school.


### Metrics

In [8]:
# Load the `evaluate` metrics that the following cells try out one by one.
rouge_metric = evaluate.load("rouge")
# glue_metric = evaluate.load("glue", "stsb")
sacreblue_metric = evaluate.load("sacrebleu")
sari_metric = evaluate.load("sari")
em_metric = evaluate.load("exact_match")

#### Rouge metric

In [7]:
# ROUGE between the unedited source column and the target column for 100
# validation "gec" samples (no model output is involved here).
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

# NOTE(review): `src` still contains the instruction prefix that print_samples
# splits off for display, so the prefix is part of what gets scored.
score = rouge_metric.compute(
    predictions=samples['src'], references=samples['tgt']
)
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'rouge1': 0.7150747966331101,
 'rouge2': 0.5167050375929942,
 'rougeL': 0.7005677840072502,
 'rougeLsum': 0.7007083564381789}


#### _GLUE metric_

In [8]:
# GLUE experiment: only the tokenization of one source/target pair is shown;
# the metric computation itself is commented out below.
samples = get_samples(grammarly_dataset, task="gec", num_samples=2)
pprint(object=samples)
print_samples([samples[0]])

# Token ids of the first sample's source and target strings.
src_input_ids = tokenizer(samples["src"][0], return_tensors="pt", padding=True).input_ids
tgt_input_ids = tokenizer(samples["tgt"][0], return_tensors="pt", padding=True).input_ids
pprint(src_input_ids[0])
pprint(tgt_input_ids[0])

# score = glue_metric.compute(predictions=src_input_ids[0], references=tgt_input_ids[0])
# score = glue_metric.compute(predictions=samples["src"], references=samples["tgt"])
# pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 2
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
tensor([14269, 19519,    16,    48,  7142,    10,   493,  6195,   388,    55,
            1])
tensor([ 493, 6195,    6,  388,   55,    1])


#### SacreBLEU metric

In [12]:
# SacreBLEU between the unedited source column and the target column for 100
# validation "gec" samples (no model output is involved here).
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

# NOTE(review): as in the ROUGE cell, `src` includes the instruction prefix.
score = sacreblue_metric.compute(predictions=samples["src"], references=samples["tgt"])
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'bp': 1.0,
 'counts': [3886, 2743, 1965, 1419],
 'precisions': [70.79613773000547,
                50.899981443681575,
                37.152580828133864,
                27.346309500867218],
 'ref_len': 5090,
 'score': 43.74251258938969,
 'sys_len': 5489,
 'totals': [5489, 5389, 5289, 5189]}


#### SARI metric

In [13]:
# SARI for 100 validation "gec" samples, using the source text as both the
# source and the prediction.
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

# Each row's single target is wrapped in a list and stored as "tgts", which is
# then passed as `references`.
new_samples = samples.map(lambda item: {"tgts": [item["tgt"]]})
# NOTE(review): this bare expression is not the cell's last statement, so its
# value does not appear in the cell output.
new_samples["tgts"][:5]

# sources=["About 95 species are currently accepted.","About 95 species are currently accepted."]
# predictions=["About 95 you now get in.","About 95 you now get in."]
# references=[["About 95 species are currently known.","About 95 species are now accepted.","95 species are now accepted."],["About 95 species are currently known.","About 95 species are now accepted.","95 species are now accepted."]]

score = sari_metric.compute(
  sources=new_samples['src'],
  predictions=new_samples['src'],
  references=new_samples['tgts']
)
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'sari': 52.48853096503606}


#### Exact match (EM) metric

In [9]:
# Exact-match sanity check on 100 validation "gec" samples.
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

# Predictions and references are both the target column, so every pair is
# identical.
score = em_metric.compute(
    predictions=samples['tgt'], references=samples['tgt']
)
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'exact_match': 1.0}


### Datasets

#### IteraTeR
* https://huggingface.co/datasets/wanyu/IteraTeR_v2
* https://huggingface.co/datasets/wanyu/IteraTeR_full_sent

In [9]:
# Load the sentence-level IteraTeR dataset with all of its splits.
# iterater_dataset = load_dataset("wanyu/IteraTeR_v2") # human in the loop
iterater_dataset = load_dataset("wanyu/IteraTeR_full_sent")
pprint(iterater_dataset)
# Take the validation split from the DatasetDict that is already in memory
# instead of calling load_dataset a second time for the same data.
iterater_validation_dataset = iterater_dataset["validation"]
pprint(iterater_validation_dataset)
# pprint(iterater_validation_dataset[0])

DatasetDict({
    train: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
        num_rows: 157579
    })
    validation: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
        num_rows: 19705
    })
    test: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
        num_rows: 19703
    })
})
Dataset({
    features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
    num_rows: 19705
})


In [10]:
# Instruction phrasings ("verbs") that replace an intent token in a prompt.
# Each entry maps a verbalizer name to the tokens to look for and the list of
# phrasings to rotate through. A ":" is appended at substitution time, so the
# verbs themselves carry no trailing punctuation.
verbolizers = {
    "gce": {
        "tokens": ["<fluency>"],
        "verbs": [
            "Fix grammar",
            "Fix grammar in this sentence",
            "Fix grammar in the sentence",
            "Fix grammar errors",
            "Fix grammatical errors",
            "Fix grammaticality",
            "Fix all grammatical errors",
            "Fix grammatical errors in this sentence",
            "Fix grammar errors in this sentence",
            "Fix grammatical mistakes in this sentence",
            "Fix grammaticality in this sentence",
            "Fix grammaticality of the sentence",
            "Fix disfluencies in the sentence",
            "Make the sentence grammatical",
            "Make the sentence fluent",
            "Fix errors in this text",
            "Update to remove grammar errors",
            "Remove all grammatical errors from this text",
            "Improve the grammar of this text",
            "Improve the grammaticality",
            "Improve the grammaticality of this text",
            # Stray trailing comma removed: it produced "...this sentence,:".
            "Improve the grammaticality of this sentence",
            "Grammar improvements",
            "Remove grammar mistakes",
            "Remove grammatical mistakes",
            "Fix the grammar mistakes",
            "Fix grammatical mistakes",
        ],
    }
}

In [11]:
def substitute_verbolizer(text, verbolizer, count=[0]):
    """Replace every intent token in ``text`` with the next instruction phrasing.

    The phrasings of ``verbolizers[verbolizer]`` are used round-robin: each call
    takes the one at the current counter position, substitutes ``"<verb>:"`` for
    every configured token, then advances the counter (wrapping at the end).

    Args:
        text: Prompt text that may contain the verbalizer's tokens.
        verbolizer: Key into the module-level ``verbolizers`` mapping.
        count: One-element list holding the rotation position. The mutable
            default is deliberate: it is the state shared between calls. Pass
            your own list to get an independent rotation.

    Returns:
        ``text`` with all tokens replaced; unchanged if no token occurs.
    """
    entry = verbolizers[verbolizer]
    verbs = entry["verbs"]

    # Modulo keeps a counter that was advanced against a longer verb list valid.
    verb = verbs[count[0] % len(verbs)]

    replaced_text = text
    for token in entry["tokens"]:
        # Chain on the running result so every token's replacement is kept,
        # not just the one for the last token.
        replaced_text = replaced_text.replace(token, f"{verb}:")

    count[0] = (count[0] + 1) % len(verbs)

    return replaced_text


def get_iterater_samples(label, category="validation", num_samples=0, seed=42, confidence_threshold=0.9):
    """Select IteraTeR rows for one label and reshape them into prompt/reference form.

    Rows of ``iterater_dataset[category]`` are shuffled, then kept when their
    "labels" equals ``label`` and their "confidence" (parsed as float) is at
    least ``confidence_threshold``.

    Args:
        label: Required value of the "labels" column.
        category: Split name inside ``iterater_dataset``.
        num_samples: Number of rows to return; 0 means all matching rows. A
            request larger than the number of matching rows is clamped to what
            is available instead of failing in ``select``.
        seed: Seed handed to ``shuffle``.
        confidence_threshold: Minimum confidence a row needs to be kept.

    Returns:
        Dataset with the columns "task" (prompt with the intent token replaced
        via the "gce" verbalizer), "source", "reference" and "references" (the
        reference wrapped in a one-element list).
    """
    filtered_samples = (
        iterater_dataset[category]
        .shuffle(seed=seed)
        .filter(lambda item: item["labels"] == label and float(item["confidence"]) >= confidence_threshold)
    )
    max_samples = len(filtered_samples)
    selected = max_samples if num_samples == 0 else min(num_samples, max_samples)
    print(f"max_samples: {max_samples}, selected: {selected}, num_samples: {num_samples}")
    samples = filtered_samples.select(range(selected))

    return samples.map(
        lambda item: {
            "task": substitute_verbolizer(item["before_sent_with_intent"], "gce"),
            "source": item["before_sent"],
            "reference": item["after_sent"],
            "references": [item["after_sent"]],
        },
        remove_columns=[
            "before_sent_with_intent",
            "before_sent",
            "after_sent",
            "labels",
            "confidence",
            "doc_id",
            "revision_depth",
        ],
    )


# Preview: five high-confidence "fluency" rows and the first two generated prompts.
samples = get_iterater_samples(label="fluency", num_samples=5)
pprint(samples)
pprint(samples["task"][:2])

max_samples: 5078, selected: 5, num_samples: 5
Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 5
})
["Fix grammar:  We don't have enough good Open Source games -- it's a waste to "
 'pour all the resources we have into one. :) Wesnoth has dwarves with guns, '
 "World of Warcraft'' has gnomes and goblins with explosives and flying "
 'machines -- where do you, personally, define the limits of the fantasy '
 'genre?',
 'Fix grammar in this sentence:  In 2001, they successfully nominated Bohemian '
 'Hall, still a vibrant community center/beer garden started by Czech '
 'immigrants in Astoria, Queens, and the Casa Amadeo Music Store, the oldest, '
 'continuously occupied Latin music store in New York City,  as census sites '
 'to the National Register of Historic Places.']


### GPU processing

In [12]:
%%time

import time

# Number of "fluency" rows to run through the model; also used further down in
# this cell for the progress line and the overall samples-per-second figure.
total_samples = 100
samples = get_iterater_samples(label="fluency", num_samples=total_samples)
# samples = get_iterater_samples(label="fluency")
pprint(samples)

# Placeholder binding; it is reassigned to the result of samples.map(...) later
# in this cell.
processed_samples = samples


def model_process(batch, idx, **kwargs):
    """Generate model output for one batch and print throughput and memory use.

    Written as the callback for ``samples.map(..., batched=True,
    with_indices=True, fn_kwargs=...)``.

    Args:
        batch: Mapping whose "task" entry is the list of prompt strings.
        idx: Row indices of this batch; the first and last are shown in the
            progress line.
        **kwargs: Must provide "model", "tokenizer" and "total_samples".

    Returns:
        ``{"processed": [...]}`` with one decoded generation per prompt.
    """
    num_samples = len(batch["task"])
    start_time = time.time()

    model = kwargs.get("model")
    tokenizer = kwargs.get("tokenizer")
    total_samples = kwargs.get("total_samples")

    # Keep the whole encoding (input ids AND attention mask): the batch is
    # padded, and the mask is what tells generate() which positions are padding.
    encoded = tokenizer(batch["task"], padding=True, return_tensors="pt").to(device)
    outputs = model.generate(**encoded, max_length=512)
    processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    elapsed_time = time.time() - start_time
    # Guard against a zero reading from a coarse clock.
    sps = num_samples / elapsed_time if elapsed_time > 0 else float("inf")
    sps_str = f"{sps:.2f}"

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    # Inner keys use single quotes so the f-strings also parse on Python < 3.12.
    print(
        f"{idx[0]}-{idx[-1]}/{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"batch/sps: {num_samples}/{sps_str}"
    )

    return {"processed": processed}


# Memory snapshot before generation starts.
utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
# Inner keys use single quotes so the f-strings also parse on Python < 3.12.
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

start_time = time.time()

# Run the model over all samples in batches of 20 in a single process;
# model_process receives the batch, its row indices and the fn_kwargs below.
# processed_samples = samples.map(model_process, num_proc=torch.cuda.device_count())
processed_samples = samples.map(
    model_process,
    fn_kwargs={
        "model": model,
        "tokenizer": tokenizer,
        "total_samples": total_samples,
    },
    num_proc=1,
    batched=True,
    batch_size=20,
    with_indices=True,
)

end_time = time.time()
elapsed_time = end_time - start_time
# Overall throughput, including the map() overhead around generation.
processed_sps = total_samples / elapsed_time
print(f"processed_sps: {processed_sps}")

pprint(processed_samples)
pprint(processed_samples["processed"][:2])

# Persist prompts, references and generations; the list-valued "references"
# column is dropped before the rows are turned into a flat DataFrame.
saved_samples = processed_samples.remove_columns(["references"])
flat_df = pd.DataFrame.from_records(saved_samples)
# to_json does not create missing directories, so make sure the target exists.
os.makedirs("samples", exist_ok=True)
flat_df.to_json(f"samples/{model_alias}_frames.json", orient="records")

max_samples: 5078, selected: 100, num_samples: 100
Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 100
})
total/used/cuda/res/ram (Gb): 10.00/4.34/3.07/3.08/12.36


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

0-19/100 | total/used/cuda/res/ram (Gb): 10.00/5.88/3.07/4.65/14.04 | batch/sps: 20/6.64
20-39/100 | total/used/cuda/res/ram (Gb): 10.00/7.02/3.07/5.81/14.05 | batch/sps: 20/4.76
40-59/100 | total/used/cuda/res/ram (Gb): 10.00/7.01/3.07/5.81/14.05 | batch/sps: 20/4.96
60-79/100 | total/used/cuda/res/ram (Gb): 10.00/9.75/3.07/9.17/14.06 | batch/sps: 20/2.84
80-99/100 | total/used/cuda/res/ram (Gb): 10.00/9.86/3.07/9.17/14.06 | batch/sps: 20/7.20
processed_sps: 3.864121324541734
Dataset({
    features: ['task', 'source', 'reference', 'references', 'processed'],
    num_rows: 100
})
["We don't have enough good Open Source games -- it's a waste to pour all the "
 'resources we have into one. :) Wesnoth has dwarves with guns, World of '
 "Warcraft'' has gnomes and goblins with explosives and flying machines -- "
 'where do you, personally, define the limits of the fantasy genre?',
 'In 2001, they successfully nominated Bohemian Hall, still a vibrant '
 'community center/beer garden started 

In [23]:
# Score the generations and store the results as per-model files plus one
# cumulative CSV that grows by a row on each run.
scores = calculate_scores(processed_samples)
# pprint(scores)

# Only these "<metric>.<field>" entries are copied into the flat summary row.
score_paths = [
    "rouge.rouge1",
    # "rouge.rouge2",
    # "rouge.rougeL",
    # "rouge.rougeLsum",
    "sacreblue.score",
    "sari.sari",
    "em.exact_match",
]

base_frame = {
    "model": model_name,
    "total_samples": total_samples,
    "sps": processed_sps,
    "task": "fluency",
    "total_params": total_params,
}

# Flatten the selected scores into "score.<metric>.<field>" columns. The loop
# treats `scores` as a mapping of metric name -> mapping of field -> value and
# skips list-valued fields.
normalized_scores = {}
for k, v in scores.items():
    for k2, v2 in v.items():
        if not isinstance(v2, list):
            path = f"{k}.{k2}"
            if path in score_paths:
                normalized_scores[f"score.{k}.{k2}"] = v2
# pprint(normalized_scores)

flat_frame = base_frame.copy()
flat_frame.update(normalized_scores)

# The writers below do not create missing directories, so make sure the
# output directory exists before anything is saved.
os.makedirs("results", exist_ok=True)

# Continue from the cumulative CSV when an earlier run has already written it.
all_flat_frames = []
if os.path.exists("results/all-flat-frames.csv"):
    all_flat_frames = pd.read_csv("results/all-flat-frames.csv").to_dict("records")

flat_df = pd.DataFrame.from_records([flat_frame])
# print(flat_df.head().to_markdown(index=True))
# flat_df.to_csv(f"results/{model_alias}_flat-frame.csv", index=False, float_format="%.2f")
flat_df.to_csv(f"results/{model_alias}_flat-frame.csv", index=False)

all_flat_frames.append(flat_frame)
all_flat_dfs = pd.DataFrame.from_records(all_flat_frames)
all_flat_dfs.to_csv("results/all-flat-frames.csv", index=False)


# The full frame keeps the complete nested score structure and is saved as JSON.
full_frame = base_frame.copy()
full_frame.update({"scores": scores})
full_df = pd.DataFrame.from_records([full_frame])

pprint(full_df)
full_df.to_json(f"results/{model_alias}_full-frame.json", orient="records")

                  model  total_samples       sps     task  total_params  \
0  google/flan-t5-large            100  3.864121  fluency     783150080   

                                              scores  
0  {'rouge': {'rouge1': 0.971745426579361, 'rouge...  
