In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import json
import os
import nltk.data

In [2]:
nltk.download('punkt_tab')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/zlovoblachko/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [23]:
json_thingie = []

In [24]:
directory = "/home/zlovoblachko/GD_correction_diploma/data/rawfiles"
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith('.txt') and filename.split(".")[0][-1] == '1':
        with open(os.path.join(directory, filename), 'r') as f:
            text = f.read()
        with open (os.path.join(directory, filename.split(".")[0] + ".ann"), "r") as ann_file:
            ann_data = ann_file.readlines()
    text_code = filename.split(".")[0]
    tags = []
    for line in ann_data:
        if line.startswith("T"):
            try:
                line = line.split("\t")
                ann_code = line[0]
                error_info = line[1].split(" ")
                print
                native_tag = error_info[0]
                if len(native_tag) == 3:
                    raise ValueError
                span_start = error_info[1]
                span_end = error_info[2]
                error_span = line[2][:-1]
                for further_line in ann_data:
                    if further_line.startswith("A"):
                        if ann_code in further_line:
                            correction = ""
                            first_level_tag = "U"
                            break
                    if further_line.startswith("#") and "lemma" not in further_line:
                        if ann_code in further_line:
                            correction = further_line.split("\t")[2].strip()
                            if error_span in correction:
                                first_level_tag = "M"
                            else:
                                first_level_tag = "R"
                            break
                tags.append({'error_span': error_span, 
                             'correction': correction, 
                             'span_start': span_start, 
                             'span_end': span_end, 
                             'native_tag': native_tag, 
                             'first_level_tag': first_level_tag})
            except:
                pass
    json_thingie.append({"text": text, "tags": tags})
            

In [25]:
unique_json_thingie = {each['text']: each for each in json_thingie}.values()
len(unique_json_thingie)

9383

In [26]:
output_filename = "dataset.jsonl"
with open(output_filename, "w", encoding="utf-8") as f:
    for entry in unique_json_thingie:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")
print(f"Generated {output_filename} with {len(unique_json_thingie)} entries.")

Generated dataset.jsonl with 9383 entries.


In [5]:
jsonl_file = "dataset.jsonl"

In [6]:
with open(jsonl_file, "r", encoding="utf-8") as f:
    training_data = [json.loads(line) for line in f]

In [7]:
training_data[0]

{'text': 'The chart illustrates the number in percents of overweight children in Canada throughout a 20-years period from 1985 to 2005, while the table demonstrates the percentage of children doing sport exercises regulary over the period from 1990 to 2005.\nOverall, it can be seen that despite the fact that the number of boys and girls performing exercises has grown considerably by the end of the period, percent of overweight children has increased too.\nAccording to the graph, boys are more likely to have extra weight in period of 2000-2005, a quater of them had problems with weight in 2005. Girls were going ahead of boys in 1985-1990, then they maintained the same level in 1995, but then the number of outweight boys went up more rapidly.\nThe table allows to see that interest in physical activity has grown by more than 25% both within boys and girls by 2005.',
 'tags': [{'error_span': 'percents',
   'correction': 'percent',
   'span_start': '36',
   'span_end': '44',
   'native_tag'

In [8]:
training_data[0]['text'][int(training_data[0]['tags'][1]['span_start']):int(training_data[0]['tags'][1]['span_end'])]

'20-years'

## SpaCy

In [1]:
import spacy
from spacy.tokens import DocBin
from spacy.tokens import SpanGroup
import random
import json
from collections import Counter
import matplotlib.pyplot as plt
import spacy_transformers
from spacy import displacy

In [9]:
nlp = spacy.blank("en")
output_spacy_file = "dataset.spacy"

In [18]:
doc_bin = DocBin()

In [11]:
counter = 0
for entry in training_data:
    if entry['tags']:
        for mistake in entry['tags']:
             if mistake['first_level_tag']:
                 counter+=1
print(counter)

224532


In [19]:
counter = 0
for entry in training_data:
    try:
        text = entry["text"]
        annotations = entry["tags"]
        doc = nlp(text)
        spans = []
        for ann in annotations:
            start, end = int(ann["span_start"]), int(ann["span_end"])
            label = ann["first_level_tag"]
            span = doc.char_span(start, end, label=label)
            spans.append(span)
        group = SpanGroup(doc, name="sc", spans=spans)
        doc.spans["sc"] = group
        doc_bin.add(doc)
        counter+=len(annotations)
    except:
        pass
doc_bin.to_disk(output_spacy_file)

In [20]:
counter

221364

In [36]:
train_ratio = 0.7
dev_ratio = 0.2
test_ratio = 0.1

In [37]:
total_docs = len(docs)
train_end = int(total_docs * train_ratio)
dev_end = train_end + int(total_docs * dev_ratio)

In [38]:
train_docs = docs[:train_end]
dev_docs = docs[train_end:dev_end]
test_docs = docs[dev_end:]

In [39]:
output_files = {
    "train.spacy": train_docs,
    "dev.spacy": dev_docs,
    "test.spacy": test_docs
}

In [40]:
for filename, doc_subset in output_files.items():
    doc_bin_subset = DocBin()
    for doc in doc_subset:
        doc_bin_subset.add(doc)
    doc_bin_subset.to_disk(filename)

In [41]:
def plot_labels(data: dict):
    labels = []
    sizes = []
    for x, y in data.items():
        labels.append(x)
        sizes.append(y)
    plt.pie(sizes, labels=labels)
    plt.show()

In [42]:
for filename in output_files.keys():
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_disk(filename)
    docs = list(doc_bin.get_docs(nlp.vocab))
    total_docs = len(docs)
    total_tokens = sum(len(doc) for doc in docs)
    total_spans = sum(len(doc.spans.get("sc", [])) for doc in docs)
    first_level_counter = Counter()
    second_level_counter = Counter()
    for doc in docs:
        for span in doc.spans.get("sc", []):
            label = span.label_
            first_level_counter[label] += 1
    print(f"{filename}:")
    print(f"- Total Documents: {total_docs}")
    print(f"- Total Tokens: {total_tokens}")
    print(f"- Avg Tokens per Doc: {total_tokens / total_docs:.2f}")
    print(f"- Total Spans: {total_spans}")
    print(f"- Avg Spans per Doc: {total_spans / total_docs:.2f}")
    print(f"- Tag frequency: {first_level_counter.most_common()}")
    print("-" * 50)

train.spacy:
- Total Documents: 31742
- Total Tokens: 766308
- Avg Tokens per Doc: 24.14
- Total Spans: 62518
- Avg Spans per Doc: 1.97
- Tag frequency: [('R', 43418), ('M', 15250), ('U', 3850)]
--------------------------------------------------
dev.spacy:
- Total Documents: 9069
- Total Tokens: 218357
- Avg Tokens per Doc: 24.08
- Total Spans: 17832
- Avg Spans per Doc: 1.97
- Tag frequency: [('R', 12374), ('M', 4393), ('U', 1065)]
--------------------------------------------------
test.spacy:
- Total Documents: 4536
- Total Tokens: 109640
- Avg Tokens per Doc: 24.17
- Total Spans: 8842
- Avg Spans per Doc: 1.95
- Tag frequency: [('R', 6068), ('M', 2190), ('U', 584)]
--------------------------------------------------


In [30]:
!python -m spacy init config /home/zlovoblachko/diploma/spacy_training/config.cfg --pipeline transformer,spancat --gpu --force

[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: spancat
- Optimize for: efficiency
- Hardware: GPU
- Transformer: roberta-base
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/home/zlovoblachko/diploma/spacy_training/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [43]:
!python -m spacy debug data /home/zlovoblachko/diploma/spacy_training/config.cfg

[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: en
Training pipeline: transformer, spancat
6643 training docs
1898 evaluation docs
[38;5;3m⚠ 3 training examples also in evaluation data[0m
[1m
[38;5;4mℹ 1371274 total word(s) in the data (17507 unique)[0m
[38;5;4mℹ No word vectors present in the package[0m
[1m

Spans Key   Labels         
---------   ---------------
sc          {'U', 'R', 'M'}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable T