In [1]:
%%capture
# Setup cell: %%capture (which must remain the first line) hides all output produced below.
# Load the dotenv extension and read environment variables through it
# (presumably this is where HUGGING_FACE_TOKEN comes from — confirm against the local .env).
%load_ext dotenv
%dotenv
# Install model, metric and tokenisation dependencies into the kernel's environment.
# NOTE(review): the versions are unpinned and `bleu` is requested by two of the lines below.
%pip install transformers evaluate
%pip install nltk absl-py rouge_score
%pip install bleu sacrebleu
%pip install bleu sacremoses
# Log in to the Hugging Face Hub with the token taken from the HUGGING_FACE_TOKEN variable.
!huggingface-cli login --token $HUGGING_FACE_TOKEN


In [2]:
from datasets import load_dataset
from huggingface_hub import HfApi
from pprint import pprint
from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer
import evaluate
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# pprint() of a str shows its repr, which is why the message appears wrapped in quotes.
pprint(f"Device: {device}")
# torch.cuda.empty_cache()


'Device: cuda'


### Language Models

In [27]:
# Load the CoEdIT-large tokenizer and weights, then move the model to the selected device.
coedit_large_tokenizer = AutoTokenizer.from_pretrained("grammarly/coedit-large")
coedit_large_model = T5ForConditionalGeneration.from_pretrained("grammarly/coedit-large")
coedit_large_model = coedit_large_model.to(device)

# The CUDA allocator counters only accept CUDA devices, so skip the report when
# `device` fell back to the CPU instead of raising on the calls below.
if device.type == "cuda":
    print(f"Allocated: {torch.cuda.memory_allocated(device)/1024**3:.2f} GB")
    print(f"Reserved: {torch.cuda.memory_reserved(device)/1024**3:.2f} GB")

Allocated: 7.62 GB
Reserved: 7.62 GB


In [32]:
# NOTE: despite the `flan_t5_large_*` variable names, the checkpoint loaded here is
# "google/flan-t5-xl". The names are kept because later cells refer to them.
flan_t5_large_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl", return_attention_mask=False)
# device_map="auto" is used instead of an explicit .to(device); the explicit variant is kept
# commented out below for reference.
flan_t5_large_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto")
# flan_t5_large_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl")
# flan_t5_large_model = flan_t5_large_model.to(device)

# Report CUDA memory in GiB (bytes / 1024**3).
# NOTE(review): these calls are made unconditionally — presumably they require a CUDA `device`; confirm.
print(f"Allocated: {torch.cuda.memory_allocated(device)/1024**3:.2f} GB")
print(f"Reserved: {torch.cuda.memory_reserved(device)/1024**3:.2f} GB")

# Count all parameters, then only those that have requires_grad set.
total_params = sum(p.numel() for p in flan_t5_large_model.parameters())
print(f"Total Parameters: {total_params}")
total_trainable_params = sum(p.numel() for p in flan_t5_large_model.parameters() if p.requires_grad)
print(f"Trainable Parameters: {total_trainable_params}")
# Optional diagnostics (disabled): 4-bytes-per-parameter memory estimate and a state_dict shape dump.
# total_memory_GB = total_params * 4 / (1024**3)
# print(f"Estimated model memory: {total_memory_GB:.2f} GB")
# for param_tensor in flan_t5_large_model.state_dict():
#     print(param_tensor, "\t", flan_t5_large_model.state_dict()[param_tensor].size())


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Allocated: 7.63 GB
Reserved: 7.64 GB
Total Parameters: 2849757184
Trainable Parameters: 2849757184


In [33]:
# %%script echo skipping

# Smoke test: one short translation prompt through the FLAN-T5 checkpoint.
prompt = "translate English to German: How old are you?"
# Send the prompt to wherever the model actually lives. The model was placed with
# device_map="auto", so a hard-coded "cuda" is not guaranteed to match (and fails on CPU-only hosts).
input_ids = flan_t5_large_tokenizer(prompt, return_tensors="pt").input_ids.to(flan_t5_large_model.device)
outputs = flan_t5_large_model.generate(input_ids, max_new_tokens=200)
print(flan_t5_large_tokenizer.decode(outputs[0], skip_special_tokens=True))



Wie alt sind Sie?


### Datasets

In [6]:
# Hub metadata lookup, kept disabled:
# api = HfApi()
# coedit_info = api.dataset_info("grammarly/coedit")
# pprint(coedit_info)

# Fetch the CoEdIT dataset and show its splits and columns.
grammarly_dataset = load_dataset("grammarly/coedit")
pprint(grammarly_dataset)

# Iterating the loaded dataset object yields its keys, i.e. the split names.
unique_categories = {split_name for split_name in grammarly_dataset}
pprint(unique_categories)

# Distinct values of the "task" column in the training split.
unique_tasks = {task_name for task_name in grammarly_dataset["train"]["task"]}
pprint(unique_tasks)

DatasetDict({
    train: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 69071
    })
    validation: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 1712
    })
})
{'train', 'validation'}
{'coherence', 'simplification', 'paraphrase', 'neutralize', 'gec', 'clarity'}


In [7]:
def get_samples(dataset, category="validation", task="gec", num_samples=1, seed=42):
    """Return the first `num_samples` rows of one task from a shuffled split.

    Args:
        dataset: Mapping from split name to a dataset object that supports
            ``shuffle(seed=...)``, ``filter(fn)`` and ``select(indices)``.
        category: Key of the split to draw from.
        task: Value the row's ``"task"`` field must equal.
        num_samples: Number of rows to keep after filtering.
        seed: Seed forwarded to ``shuffle``.

    Returns:
        Whatever ``select`` returns for ``range(num_samples)`` on the shuffled,
        filtered split.
    """

    def _has_requested_task(row):
        return row["task"] == task

    # The order matters: shuffle the whole split first, then filter, then truncate.
    shuffled_split = dataset[category].shuffle(seed=seed)
    task_rows = shuffled_split.filter(_has_requested_task)
    return task_rows.select(range(num_samples))

def print_samples(samples) -> None:
    """Print each sample's task, instruction prefix, source text and target text.

    The ``"src"`` field is expected to look like ``"<instruction>: <text>"``; it is
    split on the first ``": "``. A ``"src"`` without that separator is printed with
    an empty instruction instead of raising on tuple unpacking.

    Args:
        samples: Iterable of mappings with ``"task"``, ``"src"`` and ``"tgt"`` keys.
    """
    for item in samples:
        prefix, separator, text = item["src"].partition(": ")
        if not separator:
            # No instruction prefix present: treat the whole field as the source text.
            prefix, text = "", item["src"]
        print(f"[{item['task']}] {prefix}")
        print(f"src: {text}")
        print(f"tgt: {item['tgt']}")


# Preview two validation examples using get_samples' default task ("gec").
print_samples(get_samples(grammarly_dataset, num_samples=2))

# Disabled draft of a per-item CoEdIT inference step (the `return` shows it was meant
# to live inside a function):
# input_ids = coedit_large_tokenizer(item["task"], return_tensors="pt").input_ids.to(device)
# outputs = coedit_large_model.generate(input_ids, max_length=256)
# corrected = coedit_large_tokenizer.decode(outputs[0], skip_special_tokens=True)
# return {"processed": corrected}


[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
[gec] Fix grammaticality in this sentence
src: Dear friends, I hope you should correctly but I can gives you some opinion, I guess that is a good idea if you go to a small schools, under you can met a lot on people and there are more closed friend of course you cannot like that opcion if you like the biggest once, so in that ways you can go from the other school.
tgt: Dear friend, I hope you choose correctly but I can give you my opinion. I guess that it's a good idea if you go to a small school, because you can meet a lot of people and make more close friends of course you won't like that option if you like the bigger one, so in that case you should go to the other school.


### Metrics

#### Rouge metric

In [8]:
# Load the ROUGE metric; `rouge_metric` is reused by the model-comparison cell later on.
rouge_metric = evaluate.load("rouge")

# 100 "gec" validation examples; show the first one for orientation.
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

# Scores the raw `src` column against `tgt`, with no model involved.
# NOTE(review): `src` still contains the instruction prefix that print_samples splits off,
# so the prefix text is part of every "prediction" scored here.
score = rouge_metric.compute(
    predictions=samples['src'], references=samples['tgt']
)
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'rouge1': 0.713919085240824,
 'rouge2': 0.5166989378718392,
 'rougeL': 0.7006572327427825,
 'rougeLsum': 0.7002075122548184}


#### _GLUE metric_

In [9]:
# Load the GLUE metric in its "stsb" configuration.
glue_metric = evaluate.load("glue", "stsb")

samples = get_samples(grammarly_dataset, task="gec", num_samples=2)
pprint(object=samples)
print_samples([samples[0]])

# Tokenise the first source (instruction prefix included) and the first target with the
# CoEdIT tokenizer, then show the resulting id tensors.
src_input_ids = coedit_large_tokenizer(samples["src"][0], return_tensors="pt", padding=True).input_ids
tgt_input_ids = coedit_large_tokenizer(samples["tgt"][0], return_tensors="pt", padding=True).input_ids
pprint(src_input_ids[0])
pprint(tgt_input_ids[0])

# Both compute() attempts are disabled, so this cell produces no GLUE score.
# score = glue_metric.compute(predictions=src_input_ids[0], references=tgt_input_ids[0])
# score = glue_metric.compute(predictions=samples["src"], references=samples["tgt"])
# pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 2
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
tensor([14269, 19519,    16,    48,  7142,    10,   493,  6195,   388,    55,
            1])
tensor([ 493, 6195,    6,  388,   55,    1])


#### SacreBLEU metric

In [10]:
# Load SacreBLEU; the `sacreblue_metric` spelling is kept because a later cell uses this name.
sacreblue_metric = evaluate.load("sacrebleu")

samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

# Scores the raw `src` column against `tgt`, with no model involved.
# NOTE(review): `src` includes the instruction prefix, which therefore counts towards the result.
score = sacreblue_metric.compute(predictions=samples["src"], references=samples["tgt"])
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'bp': 1.0,
 'counts': [3886, 2743, 1965, 1419],
 'precisions': [70.79613773000547,
                50.899981443681575,
                37.152580828133864,
                27.346309500867218],
 'ref_len': 5090,
 'score': 43.74251258938969,
 'sys_len': 5489,
 'totals': [5489, 5389, 5289, 5189]}


#### SARI metric

In [11]:
# Load the SARI metric; `sari_metric` is reused by the model-comparison cell later on.
sari_metric = evaluate.load("sari")

samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

# Add a "tgts" column that wraps each single target in a list, so `references` below
# receives one list of references per example.
new_samples = samples.map(lambda item: {"tgts": [item["tgt"]]})
# This expression is not the last statement of the cell, so its value is not displayed.
new_samples["tgts"][:5]

# Example of the expected argument shapes (disabled):
# sources=["About 95 species are currently accepted.","About 95 species are currently accepted."]
# predictions=["About 95 you now get in.","About 95 you now get in."]
# references=[["About 95 species are currently known.","About 95 species are now accepted.","95 species are now accepted."],["About 95 species are currently known.","About 95 species are now accepted.","95 species are now accepted."]]

# The same `src` column is passed as both sources and predictions, so this scores the
# unedited input (instruction prefix included) against the targets.
score = sari_metric.compute(
  sources=new_samples['src'],
  predictions=new_samples['src'],
  references=new_samples['tgts']
)
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'sari': 52.48853096503606}


#### Exact match (EM) metric

In [12]:
# Load the exact-match metric; `em_metric` is reused by the model-comparison cell later on.
em_metric = evaluate.load("exact_match")

samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

# NOTE(review): predictions and references are the same `tgt` column, so the score is 1.0
# by construction. Presumably a sanity check of the metric call — confirm this is intended,
# since the other metric cells pass `src` as predictions.
score = em_metric.compute(
    predictions=samples['tgt'], references=samples['tgt']
)
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'exact_match': 1.0}


### Datasets

#### IteraTeR
* https://huggingface.co/datasets/wanyu/IteraTeR_v2
* https://huggingface.co/datasets/wanyu/IteraTeR_full_sent

In [13]:
%pip install psutil
import psutil
# print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [23]:
# iterater_dataset = load_dataset("wanyu/IteraTeR_v2") # human in the loop
# Load every split of the sentence-level IteraTeR dataset once.
iterater_dataset = load_dataset("wanyu/IteraTeR_full_sent")
# Reuse the split that is already loaded instead of calling load_dataset a second time.
iterater_validation_dataset = iterater_dataset["validation"]
pprint(iterater_validation_dataset)
# pprint(iterater_validation_dataset[0])


Dataset({
    features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
    num_rows: 19705
})


In [21]:
# Instruction phrasings ("verbs") per task, plus the intent tokens they replace in an
# IteraTeR `before_sent_with_intent` string.
# The task key is spelled "gce"; callers look it up under exactly this name.
verbolizers = {
    "gce": {
        # Intent markers that get swapped for "<verb>:".
        "tokens": ["<fluency>"],
        # Candidate instructions. None may end in punctuation, because the substitution
        # appends ":" directly after the verb.
        "verbs": [
            "Fix grammar",
            "Fix grammar in this sentence",
            "Fix grammar in the sentence",
            "Fix grammar errors",
            "Fix grammatical errors",
            "Fix grammaticality",
            "Fix all grammatical errors",
            "Fix grammatical errors in this sentence",
            "Fix grammar errors in this sentence",
            "Fix grammatical mistakes in this sentence",
            "Fix grammaticality in this sentence",
            "Fix grammaticality of the sentence",
            "Fix disfluencies in the sentence",
            "Make the sentence grammatical",
            "Make the sentence fluent",
            "Fix errors in this text",
            "Update to remove grammar errors",
            "Remove all grammatical errors from this text",
            "Improve the grammar of this text",
            "Improve the grammaticality",
            "Improve the grammaticality of this text",
            "Improve the grammaticality of this sentence",
            "Grammar improvements",
            "Remove grammar mistakes",
            "Remove grammatical mistakes",
            "Fix the grammar mistakes",
            "Fix grammatical mistakes",
        ],
    }
}

In [24]:
def substitute_verbolizer(text, verbolizer, count=[0]):
    """Replace a task's intent tokens in `text` with the next instruction verb.

    Verbs are handed out round-robin: each call uses one verb from
    ``verbolizers[verbolizer]["verbs"]`` and then advances the counter.

    Args:
        text: String containing zero or more of the task's intent tokens.
        verbolizer: Key into the module-level ``verbolizers`` mapping.
        count: One-element list holding the index of the verb to use. The mutable
            default is intentional: it is shared by every call that omits the
            argument, which is what carries the rotation from call to call.
            Pass your own list to get an independent rotation.

    Returns:
        `text` with every occurrence of every token replaced by ``"<verb>:"``.
    """
    entry = verbolizers[verbolizer]
    verbs = entry["verbs"]

    # Wrap the index so a counter advanced against a longer verb list stays valid.
    verb = verbs[count[0] % len(verbs)]

    replaced_text = text
    for token in entry["tokens"]:
        # Chain on the running result; replacing on `text` each time would discard the
        # substitutions made for earlier tokens.
        replaced_text = replaced_text.replace(token, f"{verb}:")

    count[0] = (count[0] + 1) % len(verbs)
    return replaced_text


def get_iterater_samples(label, category="validation", num_samples=1, seed=42, confidence_threshold=0.9):
    """Select IteraTeR rows for one intent label and reshape them for evaluation.

    The split ``iterater_dataset[category]`` is shuffled with `seed`, filtered to rows
    whose ``"labels"`` equals `label` and whose ``"confidence"`` (converted with
    ``float``) is at least `confidence_threshold`, and cut to the first `num_samples`.

    Each kept row is mapped to four columns, and the original seven are removed:
        task:       ``before_sent_with_intent`` passed through
                    ``substitute_verbolizer(..., "gce")``
        source:     ``before_sent``
        reference:  ``after_sent``
        references: ``[after_sent]``
    """
    original_columns = [
        "before_sent_with_intent",
        "before_sent",
        "after_sent",
        "labels",
        "confidence",
        "doc_id",
        "revision_depth",
    ]

    def _is_confident_match(row):
        if row["labels"] != label:
            return False
        return float(row["confidence"]) >= confidence_threshold

    def _to_eval_row(row):
        return {
            "task": substitute_verbolizer(row["before_sent_with_intent"], "gce"),
            "source": row["before_sent"],
            "reference": row["after_sent"],
            "references": [row["after_sent"]],
        }

    shuffled_split = iterater_dataset[category].shuffle(seed=seed)
    selected_rows = shuffled_split.filter(_is_confident_match).select(range(num_samples))
    return selected_rows.map(_to_eval_row, remove_columns=original_columns)


# Draw five "fluency" validation examples and preview the first two generated prompts.
# Note: substitute_verbolizer advances a counter stored in its default argument on every
# call, so re-running this cell continues the verb rotation and can change the wording.
samples = get_iterater_samples(label="fluency", num_samples=5)
pprint(samples)
pprint(samples["task"][:2])

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 5
})
["Fix grammar:  We don't have enough good Open Source games -- it's a waste to "
 'pour all the resources we have into one. :) Wesnoth has dwarves with guns, '
 "World of Warcraft'' has gnomes and goblins with explosives and flying "
 'machines -- where do you, personally, define the limits of the fantasy '
 'genre?',
 'Fix grammar in this sentence:  In 2001, they successfully nominated Bohemian '
 'Hall, still a vibrant community center/beer garden started by Czech '
 'immigrants in Astoria, Queens, and the Casa Amadeo Music Store, the oldest, '
 'continuously occupied Latin music store in New York City,  as census sites '
 'to the National Register of Historic Places.']


In [28]:
%%script echo skipping
# NOTE(review): with the %%script magic on the first line this cell body is presumably not
# executed as Python; the output recorded under the cell would then come from an earlier run.

input_text = 'Fix grammatical errors in this sentence: When I grow up, I start to understand what he said is quite right.'
input_ids = coedit_large_tokenizer(input_text, return_tensors="pt").input_ids.to(device)
outputs = coedit_large_model.generate(input_ids, max_length=256)
edited_text = coedit_large_tokenizer.decode(outputs[0], skip_special_tokens=True)
edited_text

'When I grow up, I will start to understand what he said is quite right.'

In [35]:
# Two "fluency" examples (default confidence threshold) used for the model comparison.
samples = get_iterater_samples(label="fluency", num_samples=2)
pprint(samples)

# `process_samples` starts out as another name for the same object as `samples` (no copy).
process_samples = samples

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 2
})


In [36]:
def coedit_large_model_process(item):
    """Run one example through the CoEdIT model.

    Args:
        item: Mapping whose ``"task"`` entry is the full prompt text.

    Returns:
        ``{"coedit_large_processed": <decoded text of the first generated sequence>}``.
    """
    encoded_prompt = coedit_large_tokenizer(item["task"], return_tensors="pt")
    prompt_ids = encoded_prompt.input_ids.to(device)
    generated = coedit_large_model.generate(prompt_ids, max_length=256)
    # Only the first returned sequence is decoded.
    first_sequence = generated[0]
    return {
        "coedit_large_processed": coedit_large_tokenizer.decode(first_sequence, skip_special_tokens=True)
    }


# Run CoEdIT over every sample in the current process.
# The GPU count is deliberately not used as `num_proc`: it is 0 on a CPU-only host, which
# map() rejects, and values above 1 would fork workers that share the CUDA-resident model.
process_samples = samples.map(coedit_large_model_process)
pprint(process_samples)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Dataset({
    features: ['task', 'source', 'reference', 'references', 'coedit_large_processed'],
    num_rows: 2
})


In [37]:
def flan_t5_large_model_process(item):
    """Run one example through the FLAN-T5 model.

    Args:
        item: Mapping whose ``"task"`` entry is the full prompt text.

    Returns:
        ``{"flan_t5_large_processed": <decoded text of the first generated sequence>}``.
    """
    # The model is loaded with device_map="auto", so send the inputs to the model's own
    # device instead of assuming it matches the notebook-level `device`.
    input_ids = flan_t5_large_tokenizer(item["task"], return_tensors="pt").input_ids.to(
        flan_t5_large_model.device
    )
    outputs = flan_t5_large_model.generate(input_ids, max_length=256)
    processed = flan_t5_large_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"flan_t5_large_processed": processed}


# Add the FLAN-T5 output column next to the CoEdIT one, in the current process.
# The GPU count is deliberately not used as `num_proc`: it is 0 on a CPU-only host, which
# map() rejects, and values above 1 would fork workers that share the CUDA-resident model.
process_samples = process_samples.map(flan_t5_large_model_process)
pprint(process_samples)



Map:   0%|          | 0/2 [00:00<?, ? examples/s]



Dataset({
    features: ['task', 'source', 'reference', 'references', 'coedit_large_processed', 'flan_t5_large_processed'],
    num_rows: 2
})


In [39]:
# Columns holding each model's generated text, in the order their scores are printed.
prediction_columns = ("coedit_large_processed", "flan_t5_large_processed")

# ROUGE, scored against the list-valued `references` column.
for prediction_column in prediction_columns:
    rouge_score = rouge_metric.compute(
        predictions=process_samples[prediction_column], references=process_samples["references"]
    )
    pprint(rouge_score)

# SacreBLEU, scored against the list-valued `references` column.
for prediction_column in prediction_columns:
    sacreblue_score = sacreblue_metric.compute(
        predictions=process_samples[prediction_column], references=process_samples["references"]
    )
    pprint(sacreblue_score)

# SARI additionally receives the unedited `source` sentences.
for prediction_column in prediction_columns:
    sari_score = sari_metric.compute(
        sources=process_samples["source"],
        predictions=process_samples[prediction_column],
        references=process_samples["references"],
    )
    pprint(sari_score)

# Exact match compares against the single-string `reference` column.
for prediction_column in prediction_columns:
    score = em_metric.compute(
        predictions=process_samples[prediction_column], references=process_samples["reference"]
    )
    pprint(score)

{'rouge1': 0.9947368421052631,
 'rouge2': 0.9838709677419355,
 'rougeL': 0.9947368421052631,
 'rougeLsum': 0.9947368421052631}
{'rouge1': 0.9947368421052631,
 'rouge2': 0.9838709677419355,
 'rougeL': 0.9947368421052631,
 'rougeLsum': 0.9947368421052631}
{'bp': 1.0,
 'counts': [112, 107, 102, 97],
 'precisions': [97.3913043478261,
                94.69026548672566,
                91.89189189189189,
                88.9908256880734],
 'ref_len': 113,
 'score': 93.18842817029679,
 'sys_len': 115,
 'totals': [115, 113, 111, 109]}
{'bp': 1.0,
 'counts': [112, 108, 104, 100],
 'precisions': [98.24561403508773,
                96.42857142857143,
                94.54545454545455,
                92.5925925925926],
 'ref_len': 113,
 'score': 95.42978616972013,
 'sys_len': 114,
 'totals': [114, 112, 110, 108]}
{'sari': 61.47430504091642}
{'sari': 74.22333910037895}
{'exact_match': 0.0}
{'exact_match': 0.0}
