In [1]:
%%script echo skipping

%%capture
# Load the dotenv IPython extension and read environment variables with it
# (the login command below expects HUGGING_FACE_TOKEN to be set).
%load_ext dotenv
%dotenv
# Install the model / evaluation libraries and metric back-ends into the kernel's environment.
# NOTE(review): versions are not pinned, and "bleu" is requested on two separate lines.
%pip install transformers evaluate
%pip install nltk absl-py rouge_score
%pip install bleu sacrebleu
%pip install bleu sacremoses
# Log in to the Hugging Face Hub; the token is expanded from the environment, not hardcoded.
!huggingface-cli login --token $HUGGING_FACE_TOKEN


skipping


In [2]:
from datasets import load_dataset
from huggingface_hub import HfApi
from pprint import pprint
from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer
import evaluate
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# pprint() of a str shows its repr, which is why the recorded output is quoted.
pprint(f"Device: {device}")
# torch.cuda.empty_cache()


'Device: cuda'


### Loading coedit model

In [3]:
# Load the tokenizer and the T5-based seq2seq weights of "grammarly/coedit-large".
coedit_large_tokenizer = AutoTokenizer.from_pretrained("grammarly/coedit-large")
# coedit_large_model = T5ForConditionalGeneration.from_pretrained("grammarly/coedit-large", device_map=0)
coedit_large_model = T5ForConditionalGeneration.from_pretrained("grammarly/coedit-large")
# Move the model onto the device chosen in the import cell.
coedit_large_model=coedit_large_model.to(device)

# Report the GPU memory PyTorch has allocated / reserved, converted from bytes to GiB.
# NOTE(review): presumably these torch.cuda queries need `device` to be a CUDA device — confirm before running on CPU.
print(f"Allocated: {torch.cuda.memory_allocated(device)/1024**3:.2f} GB")
print(f"Reserved: {torch.cuda.memory_reserved(device)/1024**3:.2f} GB")

Allocated: 3.06 GB
Reserved: 3.06 GB


In [22]:
# %%script echo skipping

import psutil
from pynvml import nvmlInit

# Smoke test: send one instruction-prefixed prompt through CoEdIT and print the decoded edit.
prompt = "fix grammar: How is are you?"
input_ids = coedit_large_tokenizer(prompt, return_tensors="pt").input_ids.to(device)
outputs = coedit_large_model.generate(input_ids, max_new_tokens=200)
# Only the first (and here only) generated sequence is decoded.
print(coedit_large_tokenizer.decode(outputs[0], skip_special_tokens=True))


def calculate_utilization():
    """Take a snapshot of GPU and host memory usage, in bytes.

    Returns:
        dict with the keys
            "total_memory":   total memory of CUDA device 0 as reported by PyTorch,
            "memory_used":    the `used` field NVML reports for GPU index 0,
            "cuda_allocated": torch.cuda.memory_allocated(device),
            "cuda_reserved":  torch.cuda.memory_reserved(device),
            "ram_usage":      psutil.virtual_memory().used.
    """
    # Import every NVML name used here locally: the cell-level import only
    # brings in nvmlInit, so on a fresh kernel the two device calls below
    # would otherwise raise NameError.
    from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit

    nvmlInit()
    total_memory = torch.cuda.get_device_properties(0).total_memory
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    memory_used = info.used
    cuda_allocated = torch.cuda.memory_allocated(device)
    cuda_reserved = torch.cuda.memory_reserved(device)
    ram_usage = psutil.virtual_memory().used
    return {
        "total_memory": total_memory,
        "memory_used": memory_used,
        "cuda_allocated": cuda_allocated,
        "cuda_reserved": cuda_reserved,
        "ram_usage": ram_usage,
    }


def format_utilization(utilization):
    """Render each memory figure of a utilization snapshot as a fixed-width GiB string.

    Args:
        utilization: mapping holding byte counts under the keys "total_memory",
            "memory_used", "cuda_allocated", "cuda_reserved" and "ram_usage".
            Any other keys are ignored.

    Returns:
        dict with the same five keys; each value is the byte count divided by
        1024**3 and formatted with `15.2f` (right-aligned, 15 characters wide).

    Raises:
        KeyError: if one of the five keys is missing.
    """
    columns = ("total_memory", "memory_used", "cuda_allocated", "cuda_reserved", "ram_usage")
    bytes_per_gib = 1024**3
    # The key is a plain variable inside the replacement field: reusing the
    # enclosing quote character inside an f-string only parses on Python 3.12+.
    return {name: f"{utilization[name] / bytes_per_gib:15.2f}" for name in columns}


def print_utilization_header(utilization):
    """Print the column-title row of the memory utilization table.

    Args:
        utilization: not read by this function.
    """
    titles = ("total_memory", "memory_used", "cuda_allocated", "cuda_reserved", "ram_usage")
    # Every title sits right-aligned in a 15-character cell, delimited by " | ".
    cells = [title.rjust(15) for title in titles]
    print("| " + " | ".join(cells) + " |")


def print_utilization(utilization):
    """Print one table row with the formatted memory figures of `utilization`.

    Args:
        utilization: snapshot mapping that is passed to format_utilization().

    The row has the form "| <total_memory> | <memory_used> | <cuda_allocated> |
    <cuda_reserved> | <ram_usage> |", using the strings format_utilization() returns.
    """
    cells = format_utilization(utilization)
    columns = ("total_memory", "memory_used", "cuda_allocated", "cuda_reserved", "ram_usage")
    # Built with join() rather than an f-string with nested double quotes,
    # which is a SyntaxError on interpreters older than Python 3.12.
    print("| " + " | ".join(cells[name] for name in columns) + " |")


# Empty PyTorch's CUDA cache before taking the measurement.
torch.cuda.empty_cache()
utilization = calculate_utilization()
print_utilization_header(utilization)
print_utilization(utilization)

# Share of the device's total memory that NVML does not currently report as used.
available_memory = utilization["total_memory"] - utilization["memory_used"]
recommended_fraction = available_memory / utilization["total_memory"]
print(f"Recommended fraction: {recommended_fraction:.2f}")

# The computed recommendation is only printed; a fixed 0.95 fraction is applied to device 0.
# torch.cuda.set_per_process_memory_fraction(recommended_fraction, 0)
torch.cuda.set_per_process_memory_fraction(0.95, 0)

How are you?
|    total_memory |     memory_used |  cuda_allocated |   cuda_reserved |       ram_usage |
|           10.00 |            4.58 |            3.07 |            3.08 |           15.91 |
Recommended fraction: 0.54


In [5]:
%%script echo skipping

# Skipped experiment: print the current utilization, then try one large allocation.
utilization = calculate_utilization()
print_utilization_header(utilization)
print_utilization(utilization)

torch.cuda.empty_cache()
# int8 elements are one byte each, so this requests total_memory // 2 bytes;
# the tensor is not bound to a name.
torch.empty(utilization["total_memory"] // 2, dtype=torch.int8, device='cuda')
# print_utilization()

skipping


### Loading flan-t5

In [6]:
%%script echo skipping

# Load FLAN-T5 for comparison with CoEdIT.
# NOTE(review): the variables are named "*_large_*" but the checkpoint loaded is "google/flan-t5-xl" — confirm which size is intended.
flan_t5_large_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl", return_attention_mask=False)
flan_t5_large_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto")
# flan_t5_large_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl")
# flan_t5_large_model = flan_t5_large_model.to(device)

# GPU memory held by PyTorch after loading, converted from bytes to GiB.
print(f"Allocated: {torch.cuda.memory_allocated(device)/1024**3:.2f} GB")
print(f"Reserved: {torch.cuda.memory_reserved(device)/1024**3:.2f} GB")

# Parameter counts: all parameters, then only those with requires_grad set.
total_params = sum(p.numel() for p in flan_t5_large_model.parameters())
print(f"Total Parameters: {total_params}")
total_trainable_params = sum(p.numel() for p in flan_t5_large_model.parameters() if p.requires_grad)
print(f"Trainable Parameters: {total_trainable_params}")
# total_memory_GB = total_params * 4 / (1024**3)
# print(f"Estimated model memory: {total_memory_GB:.2f} GB")
# for param_tensor in flan_t5_large_model.state_dict():
#     print(param_tensor, "\t", flan_t5_large_model.state_dict()[param_tensor].size())


skipping


In [7]:
%%script echo skipping

# Smoke test for FLAN-T5: one translation prompt, decoded and printed.
prompt = "translate English to German: How old are you?"
# NOTE(review): the ids are sent to "cuda" directly rather than to the notebook-wide `device`.
input_ids = flan_t5_large_tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
outputs = flan_t5_large_model.generate(input_ids, max_new_tokens=200)
print(flan_t5_large_tokenizer.decode(outputs[0], skip_special_tokens=True))

skipping


### Datasets

In [8]:
# api = HfApi()
# coedit_info = api.dataset_info("grammarly/coedit")
# pprint(coedit_info)

# Load every split of the CoEdIT dataset and show its structure.
grammarly_dataset = load_dataset("grammarly/coedit")
pprint(grammarly_dataset)

# Iterating the DatasetDict yields its split names, so this is the set of splits.
unique_categories = set(grammarly_dataset)
pprint(unique_categories)

# Distinct values of the "task" column in the training split.
unique_tasks = set(grammarly_dataset["train"]["task"])
pprint(unique_tasks)

DatasetDict({
    train: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 69071
    })
    validation: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 1712
    })
})
{'validation', 'train'}
{'coherence', 'clarity', 'neutralize', 'simplification', 'paraphrase', 'gec'}


In [9]:
def get_samples(dataset, category="validation", task="gec", num_samples=1, seed=42):
    """Return the first `num_samples` rows of one task from a shuffled split.

    Args:
        dataset: mapping from split name to a dataset offering shuffle/filter/select.
        category: name of the split to draw from.
        task: value the row's "task" field must equal.
        num_samples: number of rows to keep after shuffling and filtering.
        seed: seed handed to the split's shuffle().

    Returns:
        The result of select() on the shuffled, task-filtered split.
    """

    def _has_task(row):
        return row["task"] == task

    # Shuffle first, then filter, so the selection order is that of the shuffled split.
    shuffled = dataset[category].shuffle(seed=seed)
    matching = shuffled.filter(_has_task)
    wanted_indices = range(num_samples)
    return matching.select(wanted_indices)

def print_samples(samples) -> None:
    """Print task, instruction prefix, source text and target of each sample.

    Args:
        samples: iterable of mappings with the keys "task", "src" and "tgt".
            "src" is split at its first ": " into the instruction prefix and
            the text to edit.

    A "src" without a ": " separator is printed with an empty prefix and the
    whole string as source text instead of raising ValueError on unpacking.
    """
    for item in samples:
        prefix, separator, text = item["src"].partition(": ")
        if not separator:
            # No instruction prefix present: everything is source text.
            prefix, text = "", prefix
        print(f"[{item['task']}] {prefix}")
        print(f"src: {text}")
        print(f"tgt: {item['tgt']}")


# Show two shuffled validation examples of the default task ("gec").
print_samples(get_samples(grammarly_dataset, num_samples=2))

# Leftover sketch of a per-item inference step (not executed; `item` is not defined at this level).
# input_ids = coedit_large_tokenizer(item["task"], return_tensors="pt").input_ids.to(device)
# outputs = coedit_large_model.generate(input_ids, max_length=256)
# corrected = coedit_large_tokenizer.decode(outputs[0], skip_special_tokens=True)
# return {"processed": corrected}


[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
[gec] Fix grammaticality in this sentence
src: Dear friends, I hope you should correctly but I can gives you some opinion, I guess that is a good idea if you go to a small schools, under you can met a lot on people and there are more closed friend of course you cannot like that opcion if you like the biggest once, so in that ways you can go from the other school.
tgt: Dear friend, I hope you choose correctly but I can give you my opinion. I guess that it's a good idea if you go to a small school, because you can meet a lot of people and make more close friends of course you won't like that option if you like the bigger one, so in that case you should go to the other school.


### Metrics

#### Rouge metric

In [10]:
rouge_metric = evaluate.load("rouge")

# Draw 100 shuffled "gec" validation rows and preview the first one.
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

# Baseline: the unedited sources are scored against the targets (no model output involved).
# NOTE(review): "src" still carries the instruction prefix that print_samples() splits off
# for display — confirm whether it should be stripped before scoring.
score = rouge_metric.compute(
    predictions=samples['src'], references=samples['tgt']
)
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'rouge1': 0.7149005682143055,
 'rouge2': 0.5169776674544456,
 'rougeL': 0.6998778809041015,
 'rougeLsum': 0.7008364594050545}


#### _GLUE metric_

In [11]:
glue_metric = evaluate.load("glue", "stsb")

samples = get_samples(grammarly_dataset, task="gec", num_samples=2)
pprint(object=samples)
print_samples([samples[0]])

# Tokenize the first source and target and show their token ids.
src_input_ids = coedit_large_tokenizer(samples["src"][0], return_tensors="pt", padding=True).input_ids
tgt_input_ids = coedit_large_tokenizer(samples["tgt"][0], return_tensors="pt", padding=True).input_ids
pprint(src_input_ids[0])
pprint(tgt_input_ids[0])

# The metric itself is never computed in this cell; both attempts are commented out.
# score = glue_metric.compute(predictions=src_input_ids[0], references=tgt_input_ids[0])
# score = glue_metric.compute(predictions=samples["src"], references=samples["tgt"])
# pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 2
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
tensor([14269, 19519,    16,    48,  7142,    10,   493,  6195,   388,    55,
            1])
tensor([ 493, 6195,    6,  388,   55,    1])


#### SacreBLEU metric

In [12]:
# NOTE(review): the variable is spelled "sacreblue" while the metric is "sacrebleu";
# the later metrics cell uses the same spelling, so rename both together if at all.
sacreblue_metric = evaluate.load("sacrebleu")

samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

# Baseline: unedited sources (instruction prefix included) scored against the targets.
score = sacreblue_metric.compute(predictions=samples["src"], references=samples["tgt"])
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'bp': 1.0,
 'counts': [3886, 2743, 1965, 1419],
 'precisions': [70.79613773000547,
                50.899981443681575,
                37.152580828133864,
                27.346309500867218],
 'ref_len': 5090,
 'score': 43.74251258938969,
 'sys_len': 5489,
 'totals': [5489, 5389, 5289, 5189]}


#### SARI metric

In [13]:
sari_metric = evaluate.load("sari")

samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

# Wrap each single target in a list, stored in a new "tgts" column, and pass
# that column as `references` below.
new_samples = samples.map(lambda item: {"tgts": [item["tgt"]]})
# Bare expression in the middle of the cell: evaluated, but not displayed.
new_samples["tgts"][:5]

# sources=["About 95 species are currently accepted.","About 95 species are currently accepted."]
# predictions=["About 95 you now get in.","About 95 you now get in."]
# references=[["About 95 species are currently known.","About 95 species are now accepted.","95 species are now accepted."],["About 95 species are currently known.","About 95 species are now accepted.","95 species are now accepted."]]

# Baseline: the source is used both as `sources` and as `predictions`.
score = sari_metric.compute(
  sources=new_samples['src'],
  predictions=new_samples['src'],
  references=new_samples['tgts']
)
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'sari': 52.48853096503606}


#### Exact match (EM) metric

In [14]:
em_metric = evaluate.load("exact_match")

samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

# NOTE(review): predictions and references are both the "tgt" column, so the score is 1.0
# by construction. The other metric cells score "src" against "tgt" — confirm whether this
# is an intentional sanity check.
score = em_metric.compute(
    predictions=samples['tgt'], references=samples['tgt']
)
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'exact_match': 1.0}


### Datasets

#### IteraTeR
* https://huggingface.co/datasets/wanyu/IteraTeR_v2
* https://huggingface.co/datasets/wanyu/IteraTeR_full_sent

In [15]:
# iterater_dataset = load_dataset("wanyu/IteraTeR_v2") # human in the loop
# Load all splits of the sentence-level IteraTeR dataset and show its structure.
iterater_dataset = load_dataset("wanyu/IteraTeR_full_sent")
pprint(iterater_dataset)
# Second load_dataset call for the same dataset, restricted to the validation split.
iterater_validation_dataset = load_dataset("wanyu/IteraTeR_full_sent", split="validation")
pprint(iterater_validation_dataset)
# pprint(iterater_validation_dataset['validation'][0])


DatasetDict({
    train: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
        num_rows: 157579
    })
    validation: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
        num_rows: 19705
    })
    test: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
        num_rows: 19703
    })
})
Dataset({
    features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
    num_rows: 19705
})


In [16]:
# Instruction templates per task. substitute_verbolizer() replaces every entry
# of "tokens" found in a text with one of the "verbs" followed by ":".
# NOTE(review): the key is spelled "gce" (the dataset task is "gec"); it is kept
# because get_iterater_samples() looks it up under this exact name.
verbolizers = {
    "gce": {
        # Intent marker(s) to be replaced in the input text.
        "tokens": ["<fluency>"],
        # Instruction phrasings; none may end in punctuation because ":" is appended.
        "verbs": [
            "Fix grammar",
            "Fix grammar in this sentence",
            "Fix grammar in the sentence",
            "Fix grammar errors",
            "Fix grammatical errors",
            "Fix grammaticality",
            "Fix all grammatical errors",
            "Fix grammatical errors in this sentence",
            "Fix grammar errors in this sentence",
            "Fix grammatical mistakes in this sentence",
            "Fix grammaticality in this sentence",
            "Fix grammaticality of the sentence",
            "Fix disfluencies in the sentence",
            "Make the sentence grammatical",
            "Make the sentence fluent",
            "Fix errors in this text",
            "Update to remove grammar errors",
            "Remove all grammatical errors from this text",
            "Improve the grammar of this text",
            "Improve the grammaticality",
            "Improve the grammaticality of this text",
            # Stray trailing comma removed: it produced the prompt "...sentence,:".
            "Improve the grammaticality of this sentence",
            "Grammar improvements",
            "Remove grammar mistakes",
            "Remove grammatical mistakes",
            "Fix the grammar mistakes",
            "Fix grammatical mistakes",
        ],
    }
}

In [17]:
def substitute_verbolizer(text, verbolizer, count=[0]):
    """Replace the intent token(s) in `text` with the next instruction verb.

    Args:
        text: input string containing one or more of the entry's tokens.
        verbolizer: key into the module-level `verbolizers` mapping.
        count: one-element list holding the index of the verb to use next.
            The mutable default is deliberate: it is shared between calls so
            that successive calls cycle through the verbs round-robin. Pass
            your own list to control or isolate the position.

    Returns:
        `text` with every occurrence of every token replaced by "<verb>:".

    Raises:
        KeyError: if `verbolizer` is not a key of `verbolizers`.
        IndexError: if the entry has no verbs (or `count[0]` is out of range).
    """
    entry = verbolizers[verbolizer]
    verbs = entry["verbs"]
    verb = verbs[count[0]]

    replaced_text = text
    for token in entry["tokens"]:
        # Chain on the running result: replacing on the untouched `text` each
        # time would keep only the substitution of the last token.
        replaced_text = replaced_text.replace(token, f"{verb}:")

    # Advance the shared position, wrapping to the first verb after the last.
    count[0] = (count[0] + 1) % len(verbs)

    return replaced_text


def get_iterater_samples(label, category="validation", num_samples=0, seed=42, confidence_threshold=0.9):
    """Select IteraTeR rows of one intent and reshape them into evaluation records.

    Args:
        label: value the row's "labels" field must equal.
        category: split of the module-level `iterater_dataset` to draw from.
        num_samples: number of rows to keep; 0 keeps every row that passes the
            filter. Requests larger than the number of available rows are
            capped at that number instead of failing in select().
        seed: seed handed to shuffle().
        confidence_threshold: minimum value of float(row["confidence"]).

    Returns:
        Dataset with the columns "task" (prompt built by substitute_verbolizer
        with the "gce" entry), "source", "reference" and "references" (the
        reference wrapped in a one-element list); the original columns are removed.
    """

    def _keep(item):
        return item["labels"] == label and float(item["confidence"]) >= confidence_threshold

    filtered_samples = iterater_dataset[category].shuffle(seed=seed).filter(_keep)
    max_samples = len(filtered_samples)
    # 0 means "all"; anything above the available count is clamped to it.
    selected = max_samples if num_samples == 0 else min(num_samples, max_samples)
    print(f"max_samples: {max_samples}, selected: {selected}, num_samples: {num_samples}")
    samples = filtered_samples.select(range(selected))

    return samples.map(
        lambda item: {
            "task": substitute_verbolizer(item["before_sent_with_intent"], "gce"),
            "source": item["before_sent"],
            "reference": item["after_sent"],
            "references": [item["after_sent"]],
        },
        remove_columns=[
            "before_sent_with_intent",
            "before_sent",
            "after_sent",
            "labels",
            "confidence",
            "doc_id",
            "revision_depth",
        ],
    )


# Preview: five high-confidence "fluency" rows and the first two generated prompts.
samples = get_iterater_samples(label="fluency", num_samples=5)
pprint(samples)
pprint(samples["task"][:2])

max_samples: 5078, selected: 5, num_samples: 5
Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 5
})
["Fix grammar:  We don't have enough good Open Source games -- it's a waste to "
 'pour all the resources we have into one. :) Wesnoth has dwarves with guns, '
 "World of Warcraft'' has gnomes and goblins with explosives and flying "
 'machines -- where do you, personally, define the limits of the fantasy '
 'genre?',
 'Fix grammar in this sentence:  In 2001, they successfully nominated Bohemian '
 'Hall, still a vibrant community center/beer garden started by Czech '
 'immigrants in Astoria, Queens, and the Casa Amadeo Music Store, the oldest, '
 'continuously occupied Latin music store in New York City,  as census sites '
 'to the National Register of Historic Places.']


### GPU processing

In [25]:
%%time

# Evaluation subset: 1000 high-confidence "fluency" rows (drop num_samples to use all of them).
samples = get_iterater_samples(label="fluency", num_samples=1000)
# samples = get_iterater_samples(label="fluency")
pprint(samples)

process_samples = samples


def coedit_large_model_process(batch):
    """Run CoEdIT-large on one batch of prompts and return the decoded edits.

    Args:
        batch: mapping whose "task" entry is the list of prompt strings of the batch.

    Returns:
        {"coedit_large_processed": [...]} with one decoded string per prompt.
        The column name matches what the metrics cell reads and mirrors the
        "flan_t5_large_processed" column of the FLAN-T5 cell.

    Side effect: prints one memory-utilization row per batch.
    """
    encoded = coedit_large_tokenizer(batch["task"], padding=True, return_tensors="pt").to(device)
    # The batch is padded, so the attention mask is passed explicitly instead
    # of leaving generate() to infer which positions are padding.
    outputs = coedit_large_model.generate(
        input_ids=encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=256,
    )
    processed = coedit_large_tokenizer.batch_decode(outputs, skip_special_tokens=True)

    utilization = calculate_utilization()
    print_utilization(utilization)

    return {"coedit_large_processed": processed}

# Baseline utilization row before any batch is processed.
torch.cuda.empty_cache()
utilization = calculate_utilization()
print_utilization_header(utilization)
print_utilization(utilization)

# process_samples = samples.map(coedit_large_model_process, num_proc=torch.cuda.device_count())
process_samples = samples.map(coedit_large_model_process, num_proc=1, batched=True, batch_size=20)
pprint(process_samples)
pprint(process_samples["coedit_large_processed"][:2])

max_samples: 5078, selected: 1000, num_samples: 1000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 1000
})
|    total_memory |     memory_used |  cuda_allocated |   cuda_reserved |       ram_usage |
|           10.00 |            4.56 |            3.07 |            3.08 |           15.82 |


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

|           10.00 |            6.13 |            3.07 |            4.65 |           15.81 |
|           10.00 |            7.24 |            3.07 |            5.80 |           15.80 |
|           10.00 |            7.24 |            3.07 |            5.80 |           15.80 |
|           10.00 |            8.91 |            3.07 |            7.47 |           15.81 |
|           10.00 |            8.91 |            3.07 |            7.47 |           15.82 |
|           10.00 |            8.91 |            3.07 |            7.47 |           15.81 |
|           10.00 |            8.91 |            3.07 |            7.47 |           15.81 |
|           10.00 |            8.91 |            3.07 |            7.47 |           15.81 |
|           10.00 |            8.91 |            3.07 |            7.47 |           15.81 |
|           10.00 |            9.22 |            3.07 |            7.77 |           15.80 |
|           10.00 |            9.20 |            3.07 |            7.77 |       

### GPU processing

In [37]:
%%script echo skipping

def flan_t5_large_model_process(item):
    """Generate a FLAN-T5 output for a single example.

    Args:
        item: mapping whose "task" entry is the prompt string.

    Returns:
        {"flan_t5_large_processed": <decoded text of the first generated sequence>}.
    """
    encoding = flan_t5_large_tokenizer(item["task"], return_tensors="pt")
    prompt_ids = encoding.input_ids.to(device)
    generated = flan_t5_large_model.generate(prompt_ids, max_length=256)
    # One prompt in, so only the first generated sequence is decoded.
    first_sequence = generated[0]
    text = flan_t5_large_tokenizer.decode(first_sequence, skip_special_tokens=True)
    return {"flan_t5_large_processed": text}


# Add the FLAN-T5 column on top of the existing process_samples, one example per call;
# the number of worker processes is set to the number of visible CUDA devices.
process_samples = process_samples.map(flan_t5_large_model_process, num_proc=torch.cuda.device_count())
pprint(process_samples)



Map:   0%|          | 0/2 [00:00<?, ? examples/s]



Dataset({
    features: ['task', 'source', 'reference', 'references', 'coedit_large_processed', 'flan_t5_large_processed'],
    num_rows: 2
})


In [1]:
%%time

# Requires `process_samples` from the GPU-processing cell with a "coedit_large_processed"
# column; the recorded NameError shows this cell was run before that cell in the kernel.
# Bare expression in the middle of the cell: evaluated, but not displayed.
process_samples[:2]

# ROUGE of the CoEdIT outputs against the one-element reference lists.
rouge_score = rouge_metric.compute(
    predictions=process_samples['coedit_large_processed'], references=process_samples['references']
)
pprint(rouge_score)
# rouge_score = rouge_metric.compute(
#     predictions=process_samples['flan_t5_large_processed'], references=process_samples['references']
# )
# pprint(rouge_score)

# SacreBLEU of the CoEdIT outputs against the same reference lists.
sacreblue_score = sacreblue_metric.compute(predictions=process_samples['coedit_large_processed'], references=process_samples['references'])
pprint(sacreblue_score)
# sacreblue_score = sacreblue_metric.compute(predictions=process_samples['flan_t5_large_processed'], references=process_samples['references'])
# pprint(sacreblue_score)

# SARI additionally takes the unedited source sentences.
sari_score = sari_metric.compute(
  sources=process_samples['source'],
  predictions=process_samples['coedit_large_processed'],
  references=process_samples['references']
)
pprint(sari_score)
# sari_score = sari_metric.compute(
#   sources=process_samples['source'],
#   predictions=process_samples['flan_t5_large_processed'],
#   references=process_samples['references']
# )
# pprint(sari_score)

# Exact match is computed against the single-string "reference" column, not the list column.
score = em_metric.compute(
    predictions=process_samples['coedit_large_processed'], references=process_samples['reference']
)
pprint(score)
# score = em_metric.compute(
#     predictions=process_samples['flan_t5_large_processed'], references=process_samples['reference']
# )
# pprint(score)

NameError: name 'process_samples' is not defined