In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import json

In [2]:
def get_text_list(file_path):
    with open(file_path, 'rb') as f:
        texts = pd.read_csv(f, sep='\t', encoding='utf-16 LE')
    filtered_texts = texts.loc[texts['text_graph_desc'] == True, ['text_name', 'text_year', 'text_id']].copy()
    filtered_texts['year'] = filtered_texts['text_year']
    if 2021 in filtered_texts['year'].unique():
        print(file_path)
    return filtered_texts.reset_index(drop=True)

In [3]:
paths = ['/home/zlovoblachko/local_realec/Exam2014/Task 1/Exam2014_text_table.tsv',
         '/home/zlovoblachko/local_realec/Exam2015/Task 1/Exam2015_text_table.tsv',
         '/home/zlovoblachko/local_realec/Exam2016/Task 1/Exam2016_text_table.tsv',
         '/home/zlovoblachko/local_realec/Exam2017/Task 1/Exam2017_text_table.tsv',
         '/home/zlovoblachko/local_realec/Exam2019/Task 1/Exam2019_text_table.tsv',
         '/home/zlovoblachko/local_realec/Exam2020/Task 1/Exam2020_text_table.tsv',
         '/home/zlovoblachko/local_realec/Exam2020/Task 2/Exam2020_text_table.tsv',
         '/home/zlovoblachko/local_realec/Exam2020/Task 3/Exam2020_text_table.tsv',
         '/home/zlovoblachko/local_realec/Exam2020/Task 4/Exam2020_text_table.tsv',
         '/home/zlovoblachko/local_realec/Exam2020/Task 5/Exam2020_text_table.tsv',
         '/home/zlovoblachko/local_realec/Exam2020/Task 6/Exam2020_text_table.tsv',
         '/home/zlovoblachko/local_realec/Exam2020/Task 7/Exam2020_text_table.tsv',
         '/home/zlovoblachko/local_realec/Exam2020/Task 8/Exam2020_text_table.tsv']

relevant_texts_nums = pd.DataFrame()

for path in paths:
    relevant_texts_nums = pd.concat([relevant_texts_nums, get_text_list(path)])

/home/zlovoblachko/local_realec/Exam2020/Task 1/Exam2020_text_table.tsv


In [4]:
relevant_texts_nums

Unnamed: 0,text_name,text_year,text_id,year
0,2014_AAl_10_1.txt,2014,1,2014
1,2014_AAl_11_1.txt,2014,3,2014
2,2014_AAl_12_1.txt,2014,5,2014
3,2014_AAl_13_1.txt,2014,7,2014
4,2014_AAl_14_1.txt,2014,9,2014
...,...,...,...,...
224,2020_MLa_6404_1.txt,2020,447,2020
225,2020_MLa_6405_1.txt,2020,449,2020
226,2020_MLa_6406_1.txt,2020,451,2020
227,2020_MLa_6407_1.txt,2020,453,2020


In [5]:
path_mistakes = "/home/zlovoblachko/diploma/Labelled_dataset.tsv"

In [6]:
with open(path_mistakes, "rb") as f:
    mistakes_df = pd.read_csv(f, sep='\t')

In [7]:
mistakes_df['year'] = mistakes_df['year'].apply(lambda row: row.split('_')[0][4:8])

In [8]:
mistakes_df['year'] = mistakes_df['year'].astype(int)

In [9]:
mistakes_df.head(10)

Unnamed: 0,level_0,index,mistake_id,text_id,sentence_id,mistake_type,error_span,correction,span_start,span_end,year,bigger_code,first_level_tag,second_level_tag
0,4,7,8,1,2,Absence_explanation,century,XXth century,200,207,2014,Discourse,M,MULTIWORD
1,6,10,11,1,4,Absence_explanation,a population,the Japanese population,436,448,2014,Discourse,R,MULTIWORD
2,7,11,12,1,4,Tense_choice,would be,is going to be,449,457,2014,Grammar,R,VERB
3,8,12,13,1,4,Punctuation,",",-,482,483,2014,Punctuation,R,PUNCT
4,10,16,17,1,5,Absence_explanation,century,XXth century,583,590,2014,Discourse,M,MULTIWORD
5,12,19,20,1,5,Tense_choice,would be,is expected to be,654,662,2014,Grammar,R,VERB
6,13,20,21,1,5,Tense_choice,would be,are going to be,696,704,2014,Grammar,R,VERB
7,14,21,22,1,6,Word_choice,Sweden chart,The graph for Sweden,720,732,2014,Vocabulary,M,MULTIWORD
8,15,22,23,1,6,Ref_device,the US one,that of the USA,752,762,2014,Discourse,M,WORD
9,17,24,25,1,7,Formational_affixes,tendention,tendency,868,878,2014,Vocabulary,R,FORM


In [10]:
len(mistakes_df)

45076

In [11]:
mistakes_df['span_start'] = mistakes_df['span_start'].astype(int)

In [12]:
relevant_texts_nums[relevant_texts_nums['year'] == 2021]

Unnamed: 0,text_name,text_year,text_id,year
909,2021_MLa_100_1.txt,2021,1820,2021
910,2021_MLa_101_1.txt,2021,1822,2021
911,2021_MLa_102_1.txt,2021,1824,2021
912,2021_MLa_10_1.txt,2021,1826,2021
913,2021_MLa_11_1.txt,2021,1828,2021
...,...,...,...,...
1006,2021_MLa_96_1.txt,2021,2014,2021
1007,2021_MLa_97_1.txt,2021,2016,2021
1008,2021_MLa_98_1.txt,2021,2018,2021
1009,2021_MLa_99_1.txt,2021,2020,2021


In [13]:
training_data = []
mistake_counter = 0
ann_len = 0
for year in relevant_texts_nums['year'].unique():
    print(year)
    year_text_df = relevant_texts_nums[relevant_texts_nums['year'] == year]
    year_mistakes_df = mistakes_df[mistakes_df['year'] == year]
    for _, text_row in year_text_df.iterrows():
        text_mistakes_df = year_mistakes_df[year_mistakes_df['text_id'] == text_row['text_id']]
        text_mistakes_df = text_mistakes_df.sort_values('span_start')
        try:
            with open('/home/zlovoblachko/diploma/data/rawfiles/' + text_row['text_name']) as f:
                text = f.read()
        except FileNotFoundError:
            ## print(text_row['text_name'])
            pass
        annotations = []
        for _, mistake_row in text_mistakes_df.iterrows():
                annotations.append({
                    "start": mistake_row['span_start'],
                    "end": mistake_row['span_end'],
                    "error_span": text[mistake_row['span_start']:mistake_row['span_end']],
                    "first_level_tag": str(mistake_row["first_level_tag"]),
                    "second_level_tag": str(mistake_row["second_level_tag"]),
                    "correction": mistake_row["correction"]
                })
                mistake_counter += 1
        training_data.append({"text": text, "annotations": annotations})
        ann_len += len(annotations)

2014
2015
2016
2017
2019
2020
2021


In [14]:
print(mistake_counter)
print(ann_len)

100939
100939


In [15]:
training_data[0]

{'text': 'Given diagram visualize the proportion of population aged 65 and over in Japan, Sweden and the United States from 1940 to 2040. \nIn all of the countries, the proportion was growing rapidly during the century, with Japan being an exception, where it was stable from 1940 to 1980, decreasing slightly from 5% to about 3% in 1960. \nAfter a stable period, we see a huge incline that would occur in Japan from 2020 to 2040. About a third of a population would be aged 65 and over in 2040, compare that to below 5% part in 2000. \nUSA, however, had bigger part of old people throughout the century, having 10% in 1960 and even 15% in 1980, but the overall part would be below Japanese in 2040, whew 25% would be 65 and older. \nSweden chart is almost equal to the US one, only big discrepancy is 20% in 2020 in Sweden versus below 15% in the US. \nOverall, we can see a strong tendention that population is getting older at a whole with time in these countries.',
 'annotations': [{'start': 0,


In [16]:
entry = training_data[-5675]
print(entry['text'])
for anns in entry['annotations']:
    print(anns['start'], anns['end'])
    print(entry['text'][anns['start']:anns['end']])
print('---' * 50)

At the first glance we can notice that men are more active users of Facebook than women. Only in receiving updates percentage of men is 30 percent and it equals to percentage of women. As we can see women use Facebook on the mobile phones more often than men but not as often as on desktop. However women use facebook on their leptops an computers rarely than men. The number of the quantity of men using Facebook on desktops devided by the same quantity of women is about 6/7. This number is definately less than the number of the quantity of women using Facebook on their phones devided by the same quantity of men. That means that women like to use Facebook mobile version while they have been doing other buisenesses.
Men more often sharing video on about 15 percent. Also men watch funny videos more often.
So i think that such diagrams demonstrate us that men use Facebook previously for joy. Women use Facebook for an entertainment rarely but more often use it for a work.
3 19
the first glanc

In [17]:
output_filename = "Textwise_training_data.jsonl"

with open(output_filename, "w", encoding="utf-8") as f:
    for entry in training_data:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"Generated {output_filename} with {len(training_data)} entries.")

Generated Textwise_training_data.jsonl with 9490 entries.


## Предложения

In [18]:
import nltk.data
nltk.download('punkt_tab')

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/zlovoblachko/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [22]:
sent_training_data = []
mistake_counter_sent = 0
ann_len = 0
for year in relevant_texts_nums['year'].unique():
    year_text_df = relevant_texts_nums[relevant_texts_nums['year'] == year]
    year_mistakes_df = mistakes_df[mistakes_df['year'] == year]
    for _, text_row in year_text_df.iterrows():
        text_mistakes_df = year_mistakes_df[year_mistakes_df['text_id'] == text_row['text_id']]
        text_mistakes_df = text_mistakes_df.sort_values('span_start')
        try:
            with open('/home/zlovoblachko/diploma/data/rawfiles/' + text_row['text_name']) as f:
                text = f.read()
                sentence_list = tokenizer.tokenize(text)
        except FileNotFoundError:
            print(text_row['text_name'])
        current_symcount_before = 0
        current_symcount_after = 0
        for sentence in sentence_list:
            current_symcount_after += len(sentence)
            annotations = []
            for _, mistake_row in text_mistakes_df.iterrows():
                    if current_symcount_before <= mistake_row['span_start'] <= current_symcount_after:
                        annotations.append({
                            "start": mistake_row['span_start'] - current_symcount_before,
                            "end": mistake_row['span_end'] - current_symcount_before,
                            "error_span": text[mistake_row['span_start'] - current_symcount_before:mistake_row['span_end'] - current_symcount_before],
                            "first_level_tag": str(mistake_row["first_level_tag"]),
                            "second_level_tag": str(mistake_row["second_level_tag"]),
                            "correction": mistake_row["correction"]
                        })
                        mistake_counter_sent += 1
            current_symcount_before += len(sentence)
            if len(annotations) > 0:
                sent_training_data.append({"text": sentence, "annotations": annotations})

2021_MLa_100_1.txt
2021_MLa_101_1.txt
2021_MLa_102_1.txt
2021_MLa_10_1.txt
2021_MLa_11_1.txt
2021_MLa_12_1.txt
2021_MLa_13_1.txt
2021_MLa_14_1.txt
2021_MLa_15_1.txt
2021_MLa_16_1.txt
2021_MLa_17_1.txt
2021_MLa_18_1.txt
2021_MLa_19_1.txt
2021_MLa_1_1.txt
2021_MLa_20_1.txt
2021_MLa_21_1.txt
2021_MLa_22_1.txt
2021_MLa_23_1.txt
2021_MLa_24_1.txt
2021_MLa_25_1.txt
2021_MLa_26_1.txt
2021_MLa_27_1.txt
2021_MLa_28_1.txt
2021_MLa_29_1.txt
2021_MLa_2_1.txt
2021_MLa_30_1.txt
2021_MLa_31_1.txt
2021_MLa_32_1.txt
2021_MLa_33_1.txt
2021_MLa_34_1.txt
2021_MLa_35_1.txt
2021_MLa_36_1.txt
2021_MLa_37_1.txt
2021_MLa_38_1.txt
2021_MLa_39_1.txt
2021_MLa_3_1.txt
2021_MLa_40_1.txt
2021_MLa_41_1.txt
2021_MLa_42_1.txt
2021_MLa_43_1.txt
2021_MLa_44_1.txt
2021_MLa_45_1.txt
2021_MLa_46_1.txt
2021_MLa_47_1.txt
2021_MLa_48_1.txt
2021_MLa_49_1.txt
2021_MLa_4_1.txt
2021_MLa_50_1.txt
2021_MLa_51_1.txt
2021_MLa_52_1.txt
2021_MLa_53_1.txt
2021_MLa_54_1.txt
2021_MLa_55_1.txt
2021_MLa_56_1.txt
2021_MLa_57_1.txt
2021_MLa_58

In [23]:
mistake_counter_sent

96397

In [25]:
sent_training_data[1001]

{'text': "Tokyo's underground system is standing out as the one that transports the greatest amount of passengers per year (1927 millions).",
 'annotations': [{'start': 124,
   'end': 132,
   'error_span': 'le that ',
   'first_level_tag': 'R',
   'second_level_tag': 'NUM',
   'correction': 'million'}]}

In [26]:
output_filename = "Sentencewise_training_data.jsonl"

with open(output_filename, "w", encoding="utf-8") as f:
    for entry in sent_training_data:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"Generated {output_filename} with {len(sent_training_data)} entries.")

Generated Sentencewise_training_data.jsonl with 45347 entries.


## SpaCy

In [27]:
import spacy
from spacy.tokens import DocBin
import random
import json
from collections import Counter
import matplotlib.pyplot as plt
import spacy_transformers

In [29]:
nlp = spacy.blank("en")
jsonl_file = "Sentencewise_training_data.jsonl"
output_spacy_file = "Sentencewise_training_data.spacy"

In [30]:
with open(jsonl_file, "r", encoding="utf-8") as f:
    training_data = [json.loads(line) for line in f]

doc_bin = DocBin()

In [31]:
counter = 0
for entry in training_data:
    if entry['annotations']:
        for mistake in entry['annotations']:
             if mistake['first_level_tag']:
                 counter+=1
print(counter)

96397


In [32]:
counter = 0
for entry in training_data:
    text = entry["text"]
    annotations = entry["annotations"]
    doc = nlp.make_doc(text)
    spans = []
    for ann in annotations:
        start, end = ann["start"], ann["end"]
        label = ann["first_level_tag"]
        span = doc.char_span(start, end, label=label, alignment_mode="expand")
        if span:
            spans.append(span)
            counter += 1
    doc.spans["sc"] = spans
    doc_bin.add(doc)
doc_bin.to_disk(output_spacy_file)

In [33]:
counter

89192

In [34]:
nlp = spacy.blank("en")
doc_bin = DocBin().from_disk(output_spacy_file)
docs = list(doc_bin.get_docs(nlp.vocab))

In [35]:
random.seed(42)
random.shuffle(docs)

In [36]:
train_ratio = 0.7
dev_ratio = 0.2
test_ratio = 0.1

In [37]:
total_docs = len(docs)
train_end = int(total_docs * train_ratio)
dev_end = train_end + int(total_docs * dev_ratio)

In [38]:
train_docs = docs[:train_end]
dev_docs = docs[train_end:dev_end]
test_docs = docs[dev_end:]

In [39]:
output_files = {
    "train.spacy": train_docs,
    "dev.spacy": dev_docs,
    "test.spacy": test_docs
}

In [40]:
for filename, doc_subset in output_files.items():
    doc_bin_subset = DocBin()
    for doc in doc_subset:
        doc_bin_subset.add(doc)
    doc_bin_subset.to_disk(filename)

In [41]:
def plot_labels(data: dict):
    labels = []
    sizes = []
    for x, y in data.items():
        labels.append(x)
        sizes.append(y)
    plt.pie(sizes, labels=labels)
    plt.show()

In [42]:
for filename in output_files.keys():
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_disk(filename)
    docs = list(doc_bin.get_docs(nlp.vocab))
    total_docs = len(docs)
    total_tokens = sum(len(doc) for doc in docs)
    total_spans = sum(len(doc.spans.get("sc", [])) for doc in docs)
    first_level_counter = Counter()
    second_level_counter = Counter()
    for doc in docs:
        for span in doc.spans.get("sc", []):
            label = span.label_
            first_level_counter[label] += 1
    print(f"{filename}:")
    print(f"- Total Documents: {total_docs}")
    print(f"- Total Tokens: {total_tokens}")
    print(f"- Avg Tokens per Doc: {total_tokens / total_docs:.2f}")
    print(f"- Total Spans: {total_spans}")
    print(f"- Avg Spans per Doc: {total_spans / total_docs:.2f}")
    print(f"- Tag frequency: {first_level_counter.most_common()}")
    print("-" * 50)

train.spacy:
- Total Documents: 31742
- Total Tokens: 766308
- Avg Tokens per Doc: 24.14
- Total Spans: 62518
- Avg Spans per Doc: 1.97
- Tag frequency: [('R', 43418), ('M', 15250), ('U', 3850)]
--------------------------------------------------
dev.spacy:
- Total Documents: 9069
- Total Tokens: 218357
- Avg Tokens per Doc: 24.08
- Total Spans: 17832
- Avg Spans per Doc: 1.97
- Tag frequency: [('R', 12374), ('M', 4393), ('U', 1065)]
--------------------------------------------------
test.spacy:
- Total Documents: 4536
- Total Tokens: 109640
- Avg Tokens per Doc: 24.17
- Total Spans: 8842
- Avg Spans per Doc: 1.95
- Tag frequency: [('R', 6068), ('M', 2190), ('U', 584)]
--------------------------------------------------


In [30]:
!python -m spacy init config /home/zlovoblachko/diploma/spacy_training/config.cfg --pipeline transformer,spancat --gpu --force

[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: spancat
- Optimize for: efficiency
- Hardware: GPU
- Transformer: roberta-base
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/home/zlovoblachko/diploma/spacy_training/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [43]:
!python -m spacy debug data /home/zlovoblachko/diploma/spacy_training/config.cfg

[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: en
Training pipeline: transformer, spancat
6643 training docs
1898 evaluation docs
[38;5;3m⚠ 3 training examples also in evaluation data[0m
[1m
[38;5;4mℹ 1371274 total word(s) in the data (17507 unique)[0m
[38;5;4mℹ No word vectors present in the package[0m
[1m

Spans Key   Labels         
---------   ---------------
sc          {'U', 'R', 'M'}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable T