The purpose of this notebook is to extract all erroneous bi- and tri-grams from academic texts. The resulting file is a text file of in-context erroneous n-grams, separated by new lines.

Ex: `В работе я <error> расследовал вопрос </error>`

Collocations are selected and saved to a .txt file in the format `pos\tn-gram\tcontext\n`, where `pos` is the part-of-speech (PoS) tag, `n-gram` is the extracted collocation, and `context` is the sentence from which the collocation was extracted.

## Extract all sentences into a hash table of {lemmatized: original}

In [5]:
import io
import re

from pathlib import Path
from zipfile import ZipFile

from src.html_preprocessor import HtmlPreprocessor
from src.tokenizer import Tokenizer

# Zip archive holding the texts to process (one archive member per text)
file_path = "./data/evaluation_texts.zip"

tokenizer = Tokenizer(Tokenizer.Method.TREETAGGER)

# Prefixes whose hyphen has to survive de-hyphenation (e.g. 'из-за', 'по-русски').
# Their hyphen is doubled first, so the generic pattern below strips only one of the two.
exclude_hyphenated = re.compile(r"(из|по|ак|что|какого|какой|каких|какая|какое|какие|каким|какому|какими)-", re.MULTILINE | re.IGNORECASE)
# A Cyrillic letter immediately followed by a hyphen
hyphenated = re.compile(r"([А-яЁё])-", re.MULTILINE)

# This sometimes occurs after preprocessing a sentence with quotes, ex: Letters "a", "b" and "c" exist. -> Letters , and exist.
# Raw strings keep the regex escapes out of Python's own string-escape handling,
# and the '[' inside the character class is escaped explicitly.
skeleton_punct = re.compile(r" [\"#$%&'()*+,\-/:;<=>@\[\]^_`{|}~ʹ…〈〉«»—„“]", re.MULTILINE)
space_before_final_period = re.compile(r" \.$", re.MULTILINE)

# Hashmap, mapping lemmatized example sentences to the associated original sentences
examples = {}

with ZipFile(file_path, 'r') as zipped:
    for file_name in zipped.namelist():
        with zipped.open(file_name) as file:
            preprocessor = HtmlPreprocessor("", file_name)
            preprocessor.phase = HtmlPreprocessor.Phase.LISTS_OF_PAGES_AND_PARAGRAPHS
            preprocessor.text = [[]]  # Pretend there is only 1 page

            for line in io.TextIOWrapper(file, 'utf-8'):
                line = line.strip()

                # Skip empty lines, lines starting with a numeric character, and
                # lines starting with "Таблица " or "Фотография "
                if not line or line[0].isnumeric() or \
                        line.startswith(("Таблица ", "Фотография ")):
                    continue

                # De-hyphenate the text
                # Avoid de-hyphenating words like 'из-за' or 'по-русски'
                line = exclude_hyphenated.sub(r'\1--', line)
                line = hyphenated.sub(r'\1', line)
                preprocessor.text[0].append(line)

            preprocessor.substitute_end_of_sentence_punctuation_with_period()
            preprocessor.remove_quotations()
            preprocessor.remove_empty_paragraphs()
            preprocessor.remove_intext_references()
            preprocessor.remove_oov_tokens()
            preprocessor.replace_numbers()

            preprocessor.break_paragraphs_into_sentences()
            preprocessor.remove_unwanted_sentences()

            # Remove title and journal
            preprocessor.text[0] = preprocessor.text[0][2:]

            # Lemmatize with treetagger
            for paragraph in preprocessor.text[0]:
                # Clean the sentences in place so the tokenizer sees the cleaned text
                for j, sentence in enumerate(paragraph):
                    # Remove skeleton lists
                    sentence = skeleton_punct.sub("", sentence)

                    # Fix instances where there is a space before the final period
                    paragraph[j] = space_before_final_period.sub(".", sentence)

                for original, lemmatized in zip(paragraph, tokenizer.tokenize(paragraph)):
                    # Key each original sentence by its lower-cased lemmatized form
                    examples[lemmatized.lower()] = original

# Every lemmatized sentence as a list of its space-separated tokens
data = [sent.split(" ") for sent in examples]

  re.IGNORECASE | re.VERBOSE)
  re.VERBOSE | re.IGNORECASE)
  UrlMatch_re = re.compile(UrlMatch_expression, re.VERBOSE | re.IGNORECASE)
  EmailMatch_re = re.compile(EmailMatch_expression, re.VERBOSE | re.IGNORECASE)


## Identify relevant bi- and tri-grams

There are two methods for selecting collocations for extraction:
    1. Gensim's Phrases library
    2. Part-of-Speech filtering

The former involves the usage of gensim's Phrases library. Connector words are extracted and passed through this library so as to not impact the calculation of what is determined to be a 'phrase'. These connector words are conjunctions, particles, prepositions, and adverbs.

The latter extracts all n-grams that match a PoS template. The `min_freq` and `max_freq` variables are the lower and upper bounds for the acceptable frequency. The following PoS templates are used:

- v_n: verb + noun
- n_n: noun + noun
- a_n: adjective + noun
- v_v: verb,inf + verb
- v_s_n: verb + preposition + noun
- n_s_n: noun + preposition + noun

In both methods, only those selected collocations which are not attested in the cybercat database are written to file.

In [None]:
# Configuration variables

# File that the extracted n-grams are appended to
save_location = "./data/extract_wrong_colloc_pos_filter_min2_max3_all.txt"

# Connection settings passed to CollocationAttestor:
#   domain - "sys" or "cybercat"
#   host   - "localhost" or an IP address
domain, host = "sys", "localhost"
user, pwd = "root", "enter_password_here"

### Gensim method

In [8]:
# Collect the connector words handed to gensim: every token of every
# lemmatized sentence whose tag suffix is _c, _s, _p or _q.
RUSSIAN_CONNECTOR_WORDS = {
    token
    for sentence in examples
    for token in sentence.split(" ")
    if token.endswith(("_c", "_s", "_p", "_q"))
}
print(RUSSIAN_CONNECTOR_WORDS)

{'что-то_p', 'куда_p', 'возле_s', 'там_p', 'хотя_q', 'к_s', 'так_p', 'фон_q', 'касательно_s', 'и_c', 'где_p', 'то_p', 'никогда_p', 'кто_p', 'же_c', 'спустя_s', 'откуда_p', 'всегда_p', 'это_p', 'оный_p', 'пока_c', 'под_s', 'что-нибудь_p', 'который_p', 'всетаки_q', 'прежде_s', 'без_s', 'многие_p', 'num_arab-го_q', 'со_s', 'над_s', 'ли_q', 'против_s', 'её_s', 'они_p', 'средь_s', 'об_s', 'оттуда_p', 'каковой_p', 'как-то_p', 'путем_s', 'туда_p', 'самый_p', 'сейчас_p', 'экс_q', 'аль_q', 'ни_q', 'до_s', 'будто_q', 'якобы_q', 'кроме_s', 'де_q', 'вместо_s', 'ваш_p', 'да_c', 'однако_c', 'вроде_s', 'по-другому_p', 'почему_p', 'поскольку_c', 'только_q', 'пусть_c', 'здесь_p', 'его_p', 'перед_s', 'я_p', 'через_s', 'о_s', 'много_p', 'просто_q', 'другой_p', 'то_q', 'при_s', 'нигде_p', 'б_q', 'всякий_p', 'посредством_s', 'либо_c', 'посреди_s', 'напротив_s', 'включая_s', 'сюда_p', 'но_c', 'ввиду_s', 'то_c', 'внутри_s', 'по_s', 'из-за_s', 'у_s', 'никто_p', 'один_p', 'тогда_p', 'иной_p', 'не_q', 'как_p', 

In [9]:
# Use gensim's phrases library
from gensim.models import Phrases

# Options shared by both phrase models: detected phrases are joined with a
# space, and the connector words collected above are passed through.
# Disabled overrides kept for reference:
#   bigrams:  min_count=1, threshold=5
#   trigrams: min_count=1
phrase_options = {
    "delimiter": ' ',
    "connector_words": RUSSIAN_CONNECTOR_WORDS,
}

# First pass over the raw token lists
bigrams = Phrases(data, **phrase_options)
# Second pass over the output of the bigram model
trigrams = Phrases(bigrams[data], **phrase_options)

In [11]:
import codecs
import re

from src.collocation_attestor import CollocationAttestor

# Write every unattested 2- and 3-word phrase found by the gensim models,
# one "pos<TAB>n-gram<TAB>context" line per phrase.
# NOTE: the file is opened in append mode, so each run of this cell adds
# another header line to an existing file.
with CollocationAttestor(domain=domain, host=host, user=user, password=pwd) as attestor:
    with codecs.open(save_location, 'a', 'utf-8') as out_file:
        out_file.write("pos\tn-gram\tcontext\n")
        for sent in data:
            original = examples[" ".join(sent)].split()

            # Apply each phrase model once per sentence instead of once per use
            bigram_tokens = bigrams[sent]
            trigram_tokens = trigrams[bigram_tokens]

            # All bigrams of the sentence are written before its trigrams
            for phrased, size in ((bigram_tokens, 2), (trigram_tokens, 3)):
                # Number of words covered by the tokens before the current one,
                # i.e. the word index at which the current token starts. Tracking
                # it while iterating keeps repeated phrases apart, which a
                # list.index() lookup cannot do (it always finds the first one).
                offset = 0
                for phrase in phrased:
                    idx = offset
                    words = phrase.split()
                    offset += len(words)

                    # Keep only phrases of the requested size without numbers
                    if phrase.count(' ') != size - 1 or "num_arab" in phrase:
                        continue

                    # Split on the last underscore only, so a lemma that itself
                    # contains an underscore is not cut short
                    tagged = [word.rsplit("_", 1) for word in words]

                    # Check that the phrase is not attested
                    if len(attestor.attest_collocations([[parts[0]] for parts in tagged])) != 0:
                        continue

                    # Build the in-context erroneous n-gram sentence
                    colloc = " ".join([parts[0] for parts in tagged])
                    pos = " ".join([parts[1] for parts in tagged])
                    context = " ".join(original[:idx]) + " <error> " + \
                        " ".join(original[idx:idx + size]) + " </error> " + \
                        " ".join(original[idx + size:])

                    out_file.write(pos + "\t" + colloc + "\t" + context + "\n")

### PoS filter method

In [11]:
import codecs
import re
import math

from src.collocation_attestor import CollocationAttestor

# Use PoS filter
# Each template is an underscore-separated sequence of PoS tags.
filters = [
    "v_n",
    "n_n",
    "a_n",
    "v_v",
    "v_s_n",  # V+Prep+N
    "n_s_n",  # N+Prep+N
]

# Only collocations collected at least min_freq and at most max_freq times
# are written to file.
min_freq = 2
max_freq = 3

# A token that starts with a Cyrillic letter followed by a period.
# NOTE(review): the variable name suggests this targets initials in names - confirm.
name = re.compile(r"[А-яЁё]\.")

# Maps "<template>&<lemmatized n-gram>" to the list of its marked-up contexts
collocations = {}

with CollocationAttestor(domain=domain, host=host, user=user, password=pwd) as attestor:
    for sent in data:
        # Split on the last underscore only: a lemma may itself contain an
        # underscore (e.g. 'num_arab-го_q'), and every token must contribute
        # exactly one lemma and one tag for the positions below to line up.
        lemmas, pos_tags = zip(*[word.rsplit("_", 1) for word in sent])
        original = examples[" ".join(sent)].split()

        # Find all matches for each filter
        for f in filters:
            template = tuple(f.split("_"))
            length = len(template)

            # Slide a window over the tag sequence; comparing whole tags keeps
            # the token position exact whatever the length of a tag is.
            for first in range(len(pos_tags) - length + 1):
                if pos_tags[first:first + length] != template:
                    continue

                lemma = lemmas[first:first + length]
                lemma_str = " ".join(lemma)
                pos_lemma = f + "&" + lemma_str

                # Check that lemma isn't attested (numbers and empty n-grams are
                # rejected first, so the attestor is not queried for them)
                if "num_arab" in lemma_str or len(lemma_str.strip()) == 0 or \
                        len(attestor.attest_collocations([[lem] for lem in lemma])) != 0:
                    continue

                # Adjust idx if there are names: shift by the number of name
                # tokens among the first `first` words of the original sentence
                start = first + sum(1 for word in original[:first] if name.match(word))

                # Check that there are no names in the ngram. Only this n-gram is
                # skipped; the remaining matches in the sentence are still examined.
                if any(name.match(word) for word in original[start:start + length]):
                    continue

                # Add the example to the collocations dictionary. One context more
                # than max_freq is kept so that over-frequent collocations can be
                # recognised and dropped when writing.
                if pos_lemma not in collocations:
                    collocations[pos_lemma] = []
                elif len(collocations[pos_lemma]) > max_freq:
                    continue

                # Build the in-context erroneous n-gram sentence
                context = " ".join(original[:start]) + " <error> " + \
                    " ".join(original[start:start + length]) + " </error> " + \
                    " ".join(original[start + length:])
                collocations[pos_lemma].append(context)

# Write all the collocations whose frequency lies within [min_freq, max_freq]
# NOTE: the file is opened in append mode, so each run of this cell adds
# another header line to an existing file.
with codecs.open(save_location, 'a', 'utf-8') as out_file:
    out_file.write("pos\tn-gram\tcontext\n")
    for pos_lemma, contexts in collocations.items():
        if len(contexts) < min_freq or len(contexts) > max_freq:
            continue

        # The template never contains '&', so split on the first one only
        pos, lemma = pos_lemma.split("&", 1)
        for context in contexts:
            out_file.write(pos + "\t" + lemma + "\t" + context + "\n")