This notebook builds training data, i.e. ("error → correction", tag) pairs, from the raw annotated corpus data.

In [2]:
import re
import os
from tqdm.auto import tqdm
from nltk.tokenize import sent_tokenize

In [23]:
# Error tags used in the corpus, one tag per line in corpus_tags.txt.
# The context manager closes the file right after reading; empty lines
# (e.g. the one produced by a trailing newline) are not treated as tags.
with open('corpus_tags.txt', encoding='utf-8') as tags_file:
    corpus_tags = {tag for tag in tags_file.read().split('\n') if tag}
len(corpus_tags), corpus_tags

(46,
 {'Absence_comp_sent',
  'Absence_explanation',
  'Adj_as_collective',
  'Adverbs',
  'Agreement_errors',
  'Articles',
  'Capitalisation',
  'Category_confusion',
  'Coherence',
  'Comparative_constr',
  'Comparison_degree',
  'Compound_word',
  'Confusion_of_structures',
  'Conjunctions',
  'Countable_uncountable',
  'Determiners',
  'Formational_affixes',
  'Inappropriate_register',
  'Infinitive_constr',
  'Lack_par_constr',
  'Linking_device',
  'Modals',
  'Negation',
  'Noun_inf',
  'Noun_number',
  'Numerals',
  'Participial_constr',
  'Possessive',
  'Prepositional_adjective',
  'Prepositional_adv',
  'Prepositional_noun',
  'Prepositions',
  'Pronouns',
  'Punctuation',
  'Quantifiers',
  'Redundant_comp',
  'Ref_device',
  'Relative_clause',
  'Spelling',
  'Tense_choice',
  'Tense_form',
  'Verb_pattern',
  'Voice',
  'Word_order',
  'lex_item_choice',
  'lex_part_choice'})

In [24]:
def get_errors_from_ann(ann_file):
    '''Retrieves errors from .ann file

    Every line that starts with '#' (and does not contain 'lemma =') is
    treated as a note about another annotation. The id of that annotation
    is the second token of the note's second tab-separated field
    (e.g. 'T1' in 'AnnotatorNotes T1').

    A pair is kept when the referenced line has exactly three tab-separated
    fields and the note either has three fields, contains 'Delete', or has
    two fields with the second one starting with 'AnnotatorNotes'.
    Everything else is printed for manual inspection.

    ann_file: path to the .ann file (read as UTF-8).
    Returns a list of (error_fields, note_fields) tuples, each element
    being the list of tab-separated fields of the corresponding line.
    '''

    with open(ann_file, 'r', encoding='utf-8') as ann_handle:
        annotations = ann_handle.readlines()

    # Index lines by their exact id (first tab-separated field). A prefix
    # test would confuse 'T1' with 'T10', 'T11', ...; the first occurrence
    # of an id wins.
    lines_by_id = {}
    for line in annotations:
        lines_by_id.setdefault(line.split('\t', 1)[0], line)

    errors = []
    for ann in annotations:
        if not ann.startswith('#') or 'lemma =' in ann:
            continue

        ann_fields = ann.strip().split('\t')

        # Id of the annotation this note refers to; None for malformed notes
        raw_fields = ann.split('\t')
        reference = raw_fields[1].split() if len(raw_fields) > 1 else []
        error_id = reference[1] if len(reference) > 1 else None

        error_line = lines_by_id.get(error_id)
        error_fields = None if error_line is None else error_line.strip().split('\t')

        has_correction = len(ann_fields) == 3
        is_deletion = 'Delete' in ann
        is_bare_note = (len(ann_fields) == 2 and
                        ann_fields[1].startswith('AnnotatorNotes'))

        if (error_fields is not None and len(error_fields) == 3 and
                (has_correction or is_deletion or is_bare_note)):
            errors.append((error_fields, ann_fields))
        else:
            # error_fields is None when the referenced annotation is missing
            print(ann_file, '\n', ann_fields, '\n', error_fields)

    return errors

In [27]:
def get_corrections_per_file(errors):
    '''Creates training data from the errors of one file

    errors: iterable of (error_fields, note_fields) pairs.
    Returns a list of ("<error text> → <correction>", tag) tuples for the
    errors whose tag is listed in corpus_tags.
    '''

    pairs = []
    for error_fields, note_fields in errors:
        # The tag is the first token of the second field, e.g. 'Articles'
        tag = error_fields[1].split()[0]
        if tag not in corpus_tags:
            continue

        # A note without a third field carries no correction: the text is
        # deleted, which is encoded as "_"
        correction = note_fields[2] if len(note_fields) == 3 else "_"
        pairs.append((f"{error_fields[2]} → {correction}", tag))

    return pairs

In [28]:
def sentences_for_training(path_to_training_files):
    '''Creates training items from every annotated text in a folder

    For each '.txt' file in the folder that has an '.ann' file with the
    same base name, the errors are read from the '.ann' file and turned
    into ("error → correction", tag) pairs.

    path_to_training_files: folder holding the paired .txt/.ann files.
    Returns the list of pairs collected from all files of the folder.
    '''

    files = set(os.listdir(path_to_training_files))
    sentences = []
    # Sorted so that the order of the training data is the same on every
    # run (iteration order of a set of strings is not stable between runs)
    txt_files = sorted(name for name in files if name.endswith('.txt'))
    for file in tqdm(txt_files):
        ann_name = os.path.splitext(file)[0] + '.ann'
        if ann_name not in files:
            continue

        # os.path.join keeps the path valid on every platform, not only
        # on Windows
        ann_file = os.path.join(path_to_training_files, ann_name)
        errors = get_errors_from_ann(ann_file)
        sentences += get_corrections_per_file(errors)

    return sentences

In [3]:
# Folders with the corpus data (paired .txt/.ann files).
# Placeholder: replace with the real folder paths before running.
paths = ['path/to/folders/with/corpus/data']

# Collect the training items from every folder into one list
training = []
for path_to_training_files in tqdm(paths):
    training.extend(sentences_for_training(path_to_training_files))

0it [00:00, ?it/s]

In [8]:
# Number of training items collected from all folders
len(training)

113850

In [10]:
# Preview of the first 20 training items
training[:20]

[('cigarets → cigarettes', 'Spelling'),
 ('alcohool → alcohol', 'Spelling'),
 ('alcohools → alcohols', 'Spelling'),
 ('cardio-vascular → cardiovascular', 'Spelling'),
 ('wich → which', 'Spelling'),
 ('Nowadays → Nowadays,', 'Punctuation'),
 ('On the other hand → On the other hand,', 'Punctuation'),
 ('an increase → the increase', 'Articles'),
 ('an foundation → the foundation', 'Articles'),
 ('no → not a', 'Negation'),
 ('systeme → system', 'Spelling'),
 ('stades → stadiums', 'Spelling'),
 ('o → or', 'Conjunctions'),
 ("it's → it is", 'Inappropriate_register'),
 ('firstly → , firstly,', 'Punctuation'),
 ('for instance → , for instance .', 'Punctuation'),
 ('population → the population', 'Articles'),
 ('Moreover → Moreover,', 'Punctuation'),
 ('tools → the tools', 'Articles'),
 ('Apart from possibilities of doing sports → Apart from possibilities of doing sports,',
  'Punctuation')]

In [11]:
# Size of the training list before any tags are filtered out
len(training)

113850

Filter out tags with too few instances in the corpus.

In [13]:
from collections import Counter

# How many training items carry each tag (the tag is the second element)
tags = Counter([item[1] for item in training])

In [14]:
# Number of distinct tags and their frequencies, most frequent first
len(tags), tags.most_common()

(46,
 [('Spelling', 25568),
  ('Articles', 14470),
  ('lex_item_choice', 12201),
  ('Punctuation', 8537),
  ('Tense_choice', 5981),
  ('Prepositions', 4844),
  ('Agreement_errors', 4334),
  ('Noun_number', 4281),
  ('Absence_comp_sent', 3705),
  ('Category_confusion', 3021),
  ('Absence_explanation', 2395),
  ('Word_order', 2224),
  ('lex_part_choice', 2176),
  ('Capitalisation', 1609),
  ('Formational_affixes', 1561),
  ('Inappropriate_register', 1521),
  ('Numerals', 1447),
  ('Ref_device', 1422),
  ('Verb_pattern', 1377),
  ('Tense_form', 1135),
  ('Determiners', 1089),
  ('Voice', 946),
  ('Relative_clause', 936),
  ('Linking_device', 827),
  ('Comparison_degree', 651),
  ('Redundant_comp', 518),
  ('Confusion_of_structures', 515),
  ('Pronouns', 513),
  ('Conjunctions', 472),
  ('Possessive', 448),
  ('Modals', 424),
  ('Prepositional_noun', 379),
  ('Countable_uncountable', 312),
  ('Coherence', 289),
  ('Infinitive_constr', 265),
  ('Comparative_constr', 258),
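  ('Participial_constr', 258),
  ('Compound_word', 243),
  ('Lack_par_constr', 203),
  ('Negation', 173),
  ('Quantifiers', 125),
  ('Prepositional_adjective', 91),
  ('Adverbs', 40),
  ('Adj_as_collective', 28),
  ('Noun_inf', 26),
  ('Prepositional_adv', 12)])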

In [15]:
# Tags that occur fewer than 500 times in the training items
least_common_tags = [name for name in tags if tags[name] < 500]
least_common_tags

['Negation',
 'Conjunctions',
 'Prepositional_adjective',
 'Possessive',
 'Compound_word',
 'Comparative_constr',
 'Countable_uncountable',
 'Adverbs',
 'Coherence',
 'Quantifiers',
 'Participial_constr',
 'Infinitive_constr',
 'Modals',
 'Lack_par_constr',
 'Prepositional_noun',
 'Prepositional_adv',
 'Noun_inf',
 'Adj_as_collective']

Filter out tags already classified by other tools.

In [16]:
# Tags that are excluded from the training data in the cells below
finished_tags = [
    'Spelling',
    'Articles',
    'Punctuation',
    'Tense_choice',
    'Word_order',
    'Capitalisation',
    'Determiners',
    'Pronouns',
    'Conjunctions',
    'Possessive',
    'Noun_number',
]

In [17]:
# Build the exclusion set once: concatenating the two lists inside the
# comprehension would rebuild and linearly scan them for every item
excluded_tags = set(least_common_tags) | set(finished_tags)

# Items that are going to be dropped, counted per tag
left_out = [t for t in training if t[1] in excluded_tags]
lo = Counter(i[1] for i in left_out)
lo.most_common()

[('Spelling', 25568),
 ('Articles', 14470),
 ('Punctuation', 8537),
 ('Tense_choice', 5981),
 ('Noun_number', 4281),
 ('Word_order', 2224),
 ('Capitalisation', 1609),
 ('Determiners', 1089),
 ('Pronouns', 513),
 ('Conjunctions', 472),
 ('Possessive', 448),
 ('Modals', 424),
 ('Prepositional_noun', 379),
 ('Countable_uncountable', 312),
 ('Coherence', 289),
 ('Infinitive_constr', 265),
 ('Comparative_constr', 258),
 ('Participial_constr', 258),
 ('Compound_word', 243),
 ('Lack_par_constr', 203),
 ('Negation', 173),
 ('Quantifiers', 125),
 ('Prepositional_adjective', 91),
 ('Adverbs', 40),
 ('Adj_as_collective', 28),
 ('Noun_inf', 26),
 ('Prepositional_adv', 12)]

In [18]:
# Build the exclusion set once: concatenating the two lists inside the
# comprehension would rebuild and linearly scan them for every item
excluded_tags = set(least_common_tags) | set(finished_tags)

# Keep only the items whose tag is neither rare nor already handled
training = [t for t in training if t[1] not in excluded_tags]
len(training)

45532

In [19]:
# Tag frequencies of the items that remain in the training list
tags = Counter([item[1] for item in training])
tags.most_common()

[('lex_item_choice', 12201),
 ('Prepositions', 4844),
 ('Agreement_errors', 4334),
 ('Absence_comp_sent', 3705),
 ('Category_confusion', 3021),
 ('Absence_explanation', 2395),
 ('lex_part_choice', 2176),
 ('Formational_affixes', 1561),
 ('Inappropriate_register', 1521),
 ('Numerals', 1447),
 ('Ref_device', 1422),
 ('Verb_pattern', 1377),
 ('Tense_form', 1135),
 ('Voice', 946),
 ('Relative_clause', 936),
 ('Linking_device', 827),
 ('Comparison_degree', 651),
 ('Redundant_comp', 518),
 ('Confusion_of_structures', 515)]

In [21]:
import pickle

# The context manager flushes and closes the file as soon as the data is
# written instead of leaving that to garbage collection
with open('small_train_eo.pickle', 'wb') as out_file:
    pickle.dump(training, out_file)