In [90]:
import os

import torch

from transformers import pipeline
from tqdm import tqdm

In [91]:
PROMPTS = {
    "en": "Edit the text for spelling and grammar mistakes. " \
               + "Do not paraphrase the text. Correct just evident mistakes. Do not explain anything. " \
               + "Do not change numbers to letters. Return only the corrected text. " \
               + "Example. Text: The modern techonlogy is better now than what it used to be. " \
               + "Correct text: The modern technology is better now than it used to be. " \
               + "Text: __TEXT_PLACEHOLDER__ Correct text: ",
    "de": "Bearbeiten Sie den Text auf Rechtschreib- und Grammatikfehler. " \
               + "Paraphrasieren Sie den Text nicht. Korrigieren Sie nur offensichtliche Fehler. Erklären Sie nichts. " \
               + "Ändern Sie keine Zahlen in Buchstaben. Geben Sie nur den korrigierten Text zurück. " \
               + "Beispiel. Text: Die moderne Technik ist heute besser als früher. " \
               + "Richtiger Text: Die moderne Technik ist heute besser als früher. " \
               + "Text: __TEXT_PLACEHOLDER__ Richtiger Text: "
}

In [92]:
class LanguageModelGec:

    def __init__(self, p, lang):
        self.text_generation_pipeline = p
        self.prompt_text = PROMPTS[lang]

    def get_conversation_template(self, prompt):
        return [
            {
                "role": "user",
                "content": prompt,
            }
        ]

    def get_prompt(self, text):
        return self.prompt_text.replace("__TEXT_PLACEHOLDER__", text)

    def __call__(self, text, params, debug=False):
        prompt = self.get_prompt(text)
        messages = self.get_conversation_template(prompt)
        inputs = self.text_generation_pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        outputs = self.text_generation_pipeline(inputs, **params)
        if debug:
            print(messages)
            print()
            print(inputs)
            print()
            print(outputs)
            print()
        return outputs[0]["generated_text"].lstrip()

In [93]:
class LanguageModelGecFactory:

    def __init__(self):
        self.model_cache = {}

    def get_pipeline(self, model_name, torch_dtype, device):
        if model_name in self.model_cache:
            return self.model_cache[model_name]
        else:
            p = pipeline(
                "text-generation", 
                model=model_name,
                torch_dtype=torch_dtype,
                device=device,
            )
            self.model_cache[model_name] = p
            return p

    def create(self, model_name, lang, torch_dtype=torch.bfloat16, device='cuda'):
        p = self.get_pipeline(model_name, torch_dtype, device)
        return LanguageModelGec(p, lang)
        

In [6]:
gec_factory = LanguageModelGecFactory()

In [94]:
model_name = "Unbabel/TowerInstruct-7B-v0.2"

In [95]:
gec_factory.create(model_name, 'en')

<__main__.LanguageModelGec at 0x7984363036d0>

In [96]:
gec_factory.model_cache

{'Unbabel/TowerInstruct-7B-v0.2': <transformers.pipelines.text_generation.TextGenerationPipeline at 0x7987a7e0d970>}

### Parameters

In [97]:
params = {
    "do_sample": True,
    "repetition_penalty": 1.18,
    "max_new_tokens": 500,
    "top_k": 40,
    "top_p": 0.1,
    "temperature": 0.1,
    "return_full_text": False,
}

### Sandbox

In [12]:
gec = gec_factory.create(model_name, 'en')

In [13]:
messages = gec.get_conversation_template(gec.get_prompt("It became scencial to do my homework."))
messages

[{'role': 'user',
  'content': 'Edit the text for spelling and grammar mistakes. Do not paraphrase the text. Correct just evident mistakes. Do not explain anything. Do not change numbers to letters. Return only the corrected text. Example. Text: The modern techonlogy is better now than what it used to be. Correct text: The modern technology is better now than it used to be. Text: It became scencial to do my homework. Correct text: '}]

In [14]:
inputs = gec.text_generation_pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs

'<|im_start|>user\nEdit the text for spelling and grammar mistakes. Do not paraphrase the text. Correct just evident mistakes. Do not explain anything. Do not change numbers to letters. Return only the corrected text. Example. Text: The modern techonlogy is better now than what it used to be. Correct text: The modern technology is better now than it used to be. Text: It became scencial to do my homework. Correct text: <|im_end|>\n<|im_start|>assistant\n'

In [15]:
outputs = gec.text_generation_pipeline(inputs, **params)

In [16]:
outputs

[{'generated_text': ' It has become essential to do my homework.'}]

In [17]:
gec("It became scencial to do my homework.", params)

'It has become essential to do my homework.'

### Experiments set up

In [98]:
DATA_PATH = "geceval/data/raw/"

In [99]:
def read_text(idx, lang):
    base_path = DATA_PATH + lang + "/"
    file_name = f"correct_{idx}.txt"
    if os.path.exists(base_path + file_name):
        with open(base_path + file_name) as f:
            return file_name, f.read()
    else:
        file_name = "in" + file_name
        with open(base_path + file_name) as f:
            return file_name, f.read()

In [100]:
def prepare_single_example(idx, lang):
    file_name, file_content = read_text(idx, lang)
    return {
        "file_name": file_name,
        "original_text": file_content,
    }

In [101]:
def get_examples(idxes, lang):
    examples = []
    for i in idxes:
        examples.append(prepare_single_example(i, lang))
    return examples

In [102]:
def process(examples, gec, params):
    for example in tqdm(examples):
        if "corrected_text" not in example:
            text = example["original_text"]
            example["corrected_text"] = gec(text, params)

In [103]:
def save_examples(examples, lang):
    base_path = DATA_PATH + lang + "/"
    for example in tqdm(examples):
        file_name = example["file_name"]
        inference_file_name = example["file_name"].replace(".txt", "") + "_inference.txt"
        original_text = example["original_text"]
        corrected_text = example["corrected_text"]

        if not os.path.exists(base_path + file_name):
            with open(base_path + file_name, 'w') as f:
                f.write(original_text)
        if not os.path.exists(base_path + inference_file_name):
            with open(base_path + inference_file_name, 'w') as f:
                f.write(corrected_text)

In [104]:
# import random

# random.seed(23)
# indexes = set.union({random.randrange(0, 500) for _ in range(100)}, {random.randrange(1000, 2000) for _ in range(100)})

In [105]:
# len(indexes)

### experiment (en)

In [106]:
gec_en = gec_factory.create(model_name, 'en')

In [107]:
gec_en.prompt_text

'Edit the text for spelling and grammar mistakes. Do not paraphrase the text. Correct just evident mistakes. Do not explain anything. Do not change numbers to letters. Return only the corrected text. Example. Text: The modern techonlogy is better now than what it used to be. Correct text: The modern technology is better now than it used to be. Text: __TEXT_PLACEHOLDER__ Correct text: '

In [108]:
examples_en = get_examples(list(range(2191)), 'en')

In [109]:
examples_en[:5]

[{'file_name': 'correct_0.txt', 'original_text': '13th June 2000'},
 {'file_name': 'correct_1.txt', 'original_text': 'Dear Ms Helen Ryan'},
 {'file_name': 'correct_2.txt', 'original_text': 'Competition Organiser'},
 {'file_name': 'correct_3.txt',
  'original_text': 'I am therefore writing to give you my further information.'},
 {'file_name': 'correct_4.txt',
  'original_text': 'First of all, I am a student and would like to travel in July.'}]

In [80]:
process(examples_en, gec_en, params)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 189/189 [02:00<00:00,  1.56it/s]


In [81]:
examples_en[:5]

[{'file_name': 'incorrect_1541.txt',
  'original_text': 'I though she was a really good friend but I made a mistake.',
  'corrected_text': 'I thought she was a really good friend but I made a mistake.'},
 {'file_name': 'correct_7.txt',
  'original_text': 'Therefore I would like to choose basketball and tennis.',
  'corrected_text': 'Therefore, I would like to choose basketball and tennis.'},
 {'file_name': 'correct_8.txt',
  'original_text': 'I would be most grateful if you could give me further advicable information.',
  'corrected_text': 'I would be most grateful if you could provide me with more advice.'},
 {'file_name': 'correct_10.txt',
  'original_text': 'Yours sincerely,',
  'corrected_text': 'Sincerely,'},
 {'file_name': 'incorrect_1034.txt',
  'original_text': 'In my opinion it would be nice if you create an exhibition place where the guests can try to play different instruments.',
  'corrected_text': 'In my opinion, it would be nice if you created a showcase area where guests

In [82]:
save_examples(examples_en, 'en')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 189/189 [00:00<00:00, 869.93it/s]


### Experiment (de)

In [83]:
gec_de = gec_factory.create(model_name, 'de')

In [84]:
gec_de.prompt_text

'Bearbeiten Sie den Text auf Rechtschreib- und Grammatikfehler. Paraphrasieren Sie den Text nicht. Korrigieren Sie nur offensichtliche Fehler. Erklären Sie nichts. Ändern Sie keine Zahlen in Buchstaben. Geben Sie nur den korrigierten Text zurück. Beispiel. Text: Die moderne Technik ist heute besser als früher. Richtiger Text: Die moderne Technik ist heute besser als früher. Text: __TEXT_PLACEHOLDER__ Richtiger Text: '

In [85]:
examples_de = get_examples(indexes, 'de')

In [86]:
examples_de[:5]

[{'file_name': 'incorrect_1541.txt',
  'original_text': 'Viele Männer sind gegen Feminismus, weil sie Frauen ohne die Kleidung mogen, und diese Meinung ist schwer zu verändern!'},
 {'file_name': 'correct_7.txt',
  'original_text': 'Wenn man darauf nicht vorbereitet ist, kann man nicht richtig arbeiten.'},
 {'file_name': 'correct_8.txt',
  'original_text': 'Zum Beispiel kommt ein deutscher Professor nach Stellenbosch und kann nicht Afrikaans oder Englisch sprechen.'},
 {'file_name': 'correct_10.txt',
  'original_text': 'Aber darauf bin ich nicht vorbereitet.'},
 {'file_name': 'incorrect_1034.txt',
  'original_text': 'In vielen Länder der EU kommt das Ausbildungssystem oft zum Vorgrund der politischen und sozialischen Debatten.'}]

In [87]:
process(examples_de, gec_de, params)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 189/189 [02:51<00:00,  1.10it/s]


In [88]:
examples_de[:5]

[{'file_name': 'incorrect_1541.txt',
  'original_text': 'Viele Männer sind gegen Feminismus, weil sie Frauen ohne die Kleidung mogen, und diese Meinung ist schwer zu verändern!',
  'corrected_text': 'Viele Männer sind gegen Feminismus, weil sie Frauen ohne das Kleidungsstück mögen, und diese Meinung ist schwer zu ändern!'},
 {'file_name': 'correct_7.txt',
  'original_text': 'Wenn man darauf nicht vorbereitet ist, kann man nicht richtig arbeiten.',
  'corrected_text': 'Wenn man sich nicht darauf vorbereitet hat, kann man nicht richtig arbeiten.'},
 {'file_name': 'correct_8.txt',
  'original_text': 'Zum Beispiel kommt ein deutscher Professor nach Stellenbosch und kann nicht Afrikaans oder Englisch sprechen.',
  'corrected_text': 'Zum Beispiel kommt ein deutscher Professor nach Stellenbosch und kann weder Afrikaans noch Englisch sprechen.'},
 {'file_name': 'correct_10.txt',
  'original_text': 'Aber darauf bin ich nicht vorbereitet.',
  'corrected_text': 'Aber darauf warte ich nicht.'},
 {

In [89]:
save_examples(examples_de, 'de')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 189/189 [00:00<00:00, 11960.76it/s]
