<a href="https://colab.research.google.com/github/katkorre/elerrant/blob/main/elerrant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Clone the ERRANT repository into the current working directory.
! git clone https://github.com/chrisjbryant/errant.git

In [2]:
%%capture
#demo given that there is an orig.txt, cor.txt in the directory.
# orig text 
! gdown --id 157UeUcZXrRMEIKr6B9JKWk9hbMVOMMEa
# cor text 
! gdown --id 1Y7x7eAlymUWbJiLSXv3IX1Z7uH1TYrNK

In [3]:
%%capture
#download GREEK hunspell dict
! gdown --id 1zPW8BtFY96STbDOUpGnnq6EcHJq8P3uP

In [4]:
import tarfile
# Unpack the downloaded Greek Hunspell archive. The context manager closes
# the archive even when extraction raises.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this on an archive from a source you trust.
with tarfile.open('Greek_hunspell.tar.bz2') as my_tar:
    my_tar.extractall('/content') # specify which folder to extract to

In [5]:
# make directory for elerrant resources and copy hunspell dict in
!mkdir errant/errant/el 
!mkdir errant/errant/el/resources
!cp /content/20110903/el_GR.dic errant/errant/el/resources/el_GR.txt

In [6]:
%%capture
! pip install greek-stemmer
! python -m spacy download el

In [7]:
%%capture
# Download the Greek classifier. It is used in place of the English one
# (/content/errant/errant/en/classifier.py) so that Greek sentences can be annotated.
! gdown --id 1glBOnTRSQeGLAXPjpIeSsxhG5qZkWLx3

# move to the right location
!mv gr_classifier.py errant/errant/el/classifier.py
# copy the English merger (en/merger.py) into the el package
!cp errant/errant/en/merger.py errant/errant/el/

Downloading...
From: https://drive.google.com/uc?id=1glBOnTRSQeGLAXPjpIeSsxhG5qZkWLx3
To: /content/gr_classifier.py
  0% 0.00/10.7k [00:00<?, ?B/s]100% 10.7k/10.7k [00:00<00:00, 16.5MB/s]


In [8]:
!cp -r errant/errant ./_errant
!rm -r errant
!mv _errant errant

In [12]:
%%writefile errant/__init__.py

from importlib import import_module
import spacy
from errant.annotator import Annotator


# ERRANT version
__version__ = '2.2.3'

# Load an ERRANT Annotator object for a given language
def load(lang, nlp=None):
    """Build an ERRANT ``Annotator`` for the requested language.

    Args:
        lang: Language code; only "en" and "el" are accepted.
        nlp: Optional spaCy pipeline to reuse. When it is falsy, a pipeline
            is loaded with ``spacy.load(lang, disable=["ner"])``.

    Returns:
        An ``Annotator`` built from the language code, the spaCy pipeline and
        the ``errant.<lang>.merger`` / ``errant.<lang>.classifier`` modules.

    Raises:
        Exception: If ``lang`` is not a supported language code.
    """
    # Languages whose classifier module expects an ``nlp`` attribute.
    needs_spacy = {"en", "el"}

    # Reject anything that is not a supported language.
    if lang not in {"en", "el"}:
        raise Exception("%s is an unsupported or unknown language" % lang)

    # Only load spaCy when the caller did not hand in a pipeline.
    if not nlp:
        nlp = spacy.load(lang, disable=["ner"])

    # Language-specific edit merger and edit classifier.
    merger = import_module("errant.%s.merger" % lang)
    classifier = import_module("errant.%s.classifier" % lang)

    # Both the English and the Greek classifier read the pipeline from a
    # module-level ``nlp`` attribute.
    if lang in needs_spacy:
        classifier.nlp = nlp

    return Annotator(lang, nlp, merger, classifier)

Overwriting errant/__init__.py


In [13]:
!pip install python-Levenshtein
from importlib import import_module
import spacy
from errant import Annotator

# Load an ERRANT Annotator object for a given language
def load(lang, nlp=None):
    """Create an ERRANT ``Annotator`` for ``lang`` inside the notebook namespace.

    Args:
        lang: Language code, either "en" or "el".
        nlp: Optional spaCy pipeline; when falsy one is created via
            ``spacy.load(lang, disable=["ner"])``.

    Returns:
        ``Annotator(lang, nlp, merger, classifier)`` where merger and
        classifier are the imported ``errant.<lang>`` modules.

    Raises:
        Exception: If ``lang`` is neither "en" nor "el".
    """
    languages = {"en", "el"}
    if lang not in languages:
        raise Exception("%s is an unsupported or unknown language" % lang)

    # Reuse the caller's pipeline when given, otherwise load one (NER disabled).
    pipeline = nlp or spacy.load(lang, disable=["ner"])

    # Import the merger first, then the classifier, for the chosen language.
    merger, classifier = (
        import_module(template % lang)
        for template in ("errant.%s.merger", "errant.%s.classifier")
    )

    # The English and Greek classifiers both need access to spaCy.
    if lang in languages:
        classifier.nlp = pipeline

    return Annotator(lang, pipeline, merger, classifier)



In [14]:
%%writefile errant/el/classifier.py
import re
from pathlib import Path
import Levenshtein
#use greek stemmer https://pypi.org/project/greek-stemmer/
from greek_stemmer import GreekStemmer
import spacy
import spacy.symbols as POS

# Load Greek Hunspell word list
def load_word_list(path):
    """Read a text file into a set of its lines.

    Args:
        path: Location of a text file; each line is treated as one entry.

    Returns:
        A set holding every line of the file with leading and trailing
        whitespace removed.
    """
    with open(path) as handle:
        entries = {line.strip() for line in handle}
    return entries



# Classifier resources
# Directory that contains this module. Resources are resolved relative to it,
# so the classifier does not depend on the working directory or on a
# hard-coded absolute path.
base_dir = Path(__file__).resolve().parent
# Spacy (assigned by errant.load after this module is imported)
nlp = None
# Greek Stemmer
stemmer = GreekStemmer()
# Greek Word list, read from <this package>/resources/el_GR.txt
spell = load_word_list(base_dir / "resources" / "el_GR.txt")

# Rare POS tags that make uninformative error categories
rare_pos = {"INTJ", "NUM", "SYM", "X"}
# Open class coarse Spacy POS tags
open_pos1 = {POS.ADJ, POS.ADV, POS.NOUN, POS.VERB}
# Open class coarse Spacy POS tags (strings)
open_pos2 = {"ADJ", "ADV", "NOUN", "VERB"}
# POS tags with inflectional morphology
inflected_tags = {"ADJ", "ADV", "AUX", "DET", "PRON", "PROPN", "NOUN", "VERB"}
# Some dep labels that map to pos tags.
dep_map = {"ac": "ADP", "svp": "ADP", "punct": "PUNCT", "CCONJ": "CONJ" }
# Accented vowels (lowercase only)
accents=['ά','έ', 'ή', 'ί', 'ό', 'ύ', 'ώ']
# Simplified categories: tags on the left are reported as the tag on the right
simple_cats={'CCONJ':'CONJ', 'SCONJ':'CONJ', 'ADP':'PREP' }


# Input: An Edit object
# Output: The same Edit object with an updated error type
def classify(edit):
    """Assign an error type to an edit and return the same edit object.

    Args:
        edit: Object exposing ``o_toks``/``c_toks`` (original and corrected
            tokens) and ``o_str``/``c_str`` (their strings). Its ``type``
            attribute is set by this function.

    Returns:
        The edit that was passed in, with ``type`` updated. The type is
        "UNK", an "M:"/"U:"/"R:" prefixed category, or one of the accent and
        final-nu labels "M:ACC", "U:ACC", "M:FN", "U:FN".
    """
    orig_side, cor_side = edit.o_toks, edit.c_toks

    # Nothing to nothing: detected but not corrected.
    if not orig_side and not cor_side:
        edit.type = "UNK"
        return edit

    # Missing: tokens exist only on the corrected side.
    if not orig_side:
        edit.type = "M:" + simplify(get_one_sided_type(cor_side))
        return edit

    # Unnecessary: tokens exist only on the original side.
    if not cor_side:
        edit.type = "U:" + simplify(get_one_sided_type(orig_side))
        return edit

    # Same to same: detected but not corrected.
    if edit.o_str == edit.c_str:
        edit.type = "UNK"
        return edit

    # Both sides end in the same token and at least one side has more than
    # one token: classify the edit as if that last token were absent.
    if orig_side[-1].lower == cor_side[-1].lower and \
            (len(orig_side) > 1 or len(cor_side) > 1):
        saved_orig = orig_side[:]
        saved_cor = cor_side[:]
        edit.o_toks = orig_side[:-1]
        edit.c_toks = cor_side[:-1]
        edit = classify(edit)
        # Put the untruncated token sequences back.
        edit.o_toks = saved_orig
        edit.c_toks = saved_cor
        return edit

    # Accent special cases take priority over the generic replacement.
    accent_kind = accent(edit.o_toks, edit.c_toks)
    accent_labels = {"miss_acc": "M:ACC", "unn_acc": "U:ACC"}
    if accent_kind in accent_labels:
        edit.type = accent_labels[accent_kind]
        return edit

    # Final-nu special cases.
    final_n_kind = final_n(edit.o_toks, edit.c_toks)
    final_n_labels = {"unn_fn": "U:FN", "miss_fn": "M:FN"}
    if final_n_kind in final_n_labels:
        edit.type = final_n_labels[final_n_kind]
        return edit

    # Plain replacement.
    edit.type = "R:" + simplify(get_two_sided_type(edit.o_toks, edit.c_toks))
    return edit



# Input: Spacy tokens
# Output: A list of pos and dep tag strings
def get_edit_info(toks):
    """Collect the ``tag_`` and ``dep_`` values of every token.

    Args:
        toks: Iterable of token objects exposing ``tag_`` and ``dep_``.

    Returns:
        A ``(pos, dep)`` pair of lists, in token order: ``pos`` holds each
        token's ``tag_`` and ``dep`` holds each token's ``dep_``.
    """
    # Single pass over the tokens so one-shot iterables are handled too.
    pairs = [(tok.tag_, tok.dep_) for tok in toks]
    pos = [tag for tag, _ in pairs]
    dep = [label for _, label in pairs]
    return pos, dep

# Input: Spacy tokens
# Output: An error type string based on input tokens from orig or cor
# When one side of the edit is null, we can only use the other side
def get_one_sided_type(toks):
    """Derive an error category from the tokens on one side of an edit.

    Used when the other side of the edit is empty, so only these tokens are
    available.

    Args:
        toks: Non-empty sequence of spaCy tokens from orig or cor.

    Returns:
        "VERB:FORM", "VERB:TENSE", a POS tag string, a tag mapped from a
        dependency label via ``dep_map``, or "OTHER".
    """
    # A lone subjunctive particle "να" is treated as part of a verb form.
    if len(toks) == 1 and toks[0].lower_ == "να" and toks[0].pos == POS.PART:
        return "VERB:FORM"

    # POS tags and dependency labels of all tokens.
    pos_tags, dep_labels = get_edit_info(toks)
    head = toks[0]

    # Future tense particle "θα".
    if head.lower_ == "θα" and head.pos == POS.PART:
        return "VERB:TENSE"
    # Auxiliary verbs, e.g. "έχω, είχα".
    if head.pos == POS.VERB and set(dep_labels) <= {"aux", "auxpass", "obj", "advmod"}:
        return "VERB:TENSE"
    # All tokens share one POS tag that is not a rare, uninformative one.
    if len(set(pos_tags)) == 1 and pos_tags[0] not in rare_pos:
        return pos_tags[0]
    # All tokens share one dependency label that maps to a POS-like tag.
    if len(set(dep_labels)) == 1 and dep_labels[0] in dep_map:
        return dep_map[dep_labels[0]]
    # Tricky cases.
    return "OTHER"


# Input 1: Spacy orig tokens
# Input 2: Spacy cor tokens
# Output: An error type string based on orig AND cor
def get_two_sided_type(o_toks, c_toks):
    """Classify a replacement edit using both the original and corrected tokens.

    The checks run in this order: orthography, exact reordering, the rules
    for one-to-one replacements (spelling/inflection, morphology, general),
    and finally the rules for multi-token replacements.

    Args:
        o_toks: Non-empty sequence of spaCy tokens from the original text.
        c_toks: Non-empty sequence of spaCy tokens from the corrected text.

    Returns:
        An error category string: "ORTH", "WO", "SPELL", "MORPH",
        "<POS>:FORM", "VERB:FORM", "VERB:TENSE", "VERB", a POS tag, a tag
        mapped from a dependency label via ``dep_map``, or "OTHER".
    """
    # Extract pos tags and parse info from the toks as lists
    o_pos, o_dep = get_edit_info(o_toks)
    c_pos, c_dep = get_edit_info(c_toks)

    # Orthography: both sides are equal once lowercased and joined.
    if only_orth_change(o_toks, c_toks):
        return "ORTH"
    # Word Order; only matches exact reordering.
    if exact_reordering(o_toks, c_toks):
        return "WO"

    # 1:1 replacements.
    # This guard is required: without it the rules below sit inside the
    # `return "WO"` branch above and can never run.
    if len(o_toks) == len(c_toks) == 1:
        o_tok = o_toks[0]
        c_tok = c_toks[0]
        same_lemma = o_tok.lemma == c_tok.lemma

        # 2. SPELLING AND INFLECTION
        # Only check alphabetical strings on the original side.
        # Spelling errors take precedence over POS errors; this rule is ordered.
        if o_tok.text.isalpha():
            # Check the Greek word list for both the original and lower case.
            if o_tok.text not in spell and o_tok.lower_ not in spell:
                if same_lemma:
                    # Inflection
                    if o_pos == c_pos and \
                            o_pos[0] in {"NOUN", "ADJ", "ADV", "PRON", "VERB"}:
                        return o_pos[0]+":FORM"
                    # Unknown morphology; i.e. we cannot be more specific.
                    return "MORPH"
                # Use string similarity to detect true spelling errors.
                char_ratio = Levenshtein.ratio(o_tok.text, c_tok.text)
                # Ratio > 0.5 means both sides share at least half the same chars.
                # WARNING: THIS IS AN APPROXIMATION.
                if char_ratio > 0.5:
                    return "SPELL"
                # If ratio is <= 0.5, the error is more complex.
                # If POS is the same, this takes precedence over spelling.
                if o_pos == c_pos and o_pos[0] not in rare_pos:
                    return o_pos[0]
                # NOTE(review): an accent-replacement ("ACC") check used to
                # live here, but it required char_ratio > 0.9 inside the
                # char_ratio <= 0.5 branch and referenced an undefined name
                # (`conts`), so it could never return "ACC". It has been
                # removed; decide where accent replacements should be
                # detected.
                return "OTHER"

        # 3. MORPHOLOGY
        # Only ADJ, ADV, NOUN and VERB can have inflectional changes.
        if same_lemma and \
                o_pos[0] in open_pos2 and \
                c_pos[0] in open_pos2:
            # Same POS on both sides
            if o_pos == c_pos:
                # Adjective form; e.g. comparatives
                if o_pos[0] == "ADJ":
                    return "ADJ:FORM"
                # Noun number
                if o_pos[0] == "NOUN":
                    return "NOUN:FORM"
                # Verbs: every same-lemma verb pair is a form error. The
                # earlier call to the undefined `preceded_by_aux` led to the
                # same result on both of its outcomes, so it is not needed.
                if o_pos[0] == "VERB":
                    return "VERB:FORM"

        # 4. GENERAL
        # Auxiliaries with different lemmas
        if o_dep[0].startswith("aux") and c_dep[0].startswith("aux"):
            return "VERB:TENSE"
        # POS-based tags. Some of these are context sensitive misspellings.
        if o_pos == c_pos and o_pos[0] not in rare_pos:
            return o_pos[0]
        # Some dep labels map to POS-based tags.
        if o_dep == c_dep and o_dep[0] in dep_map:
            return dep_map[o_dep[0]]
        return "OTHER"

    # Multi-token replacements (uncommon)
    # All auxiliaries
    if set(o_dep+c_dep).issubset({"aux", "auxpass"}):
        return "VERB:TENSE"
    # All same POS
    if len(set(o_pos+c_pos)) == 1:
        # Final verbs with the same lemma are tense
        if o_pos[0] == "VERB" and \
                o_toks[-1].lemma == c_toks[-1].lemma:
            return "VERB:TENSE"
        # POS-based tags.
        elif o_pos[0] not in rare_pos:
            return o_pos[0]
    # All same special dep labels.
    if len(set(o_dep+c_dep)) == 1 and \
            o_dep[0] in dep_map:
        return dep_map[o_dep[0]]
    # Verbs with particles e.g., μην κάνεις
    if set(o_pos+c_pos) == {"PART", "VERB"}:
        if o_toks[-1].lemma == c_toks[-1].lemma:
            return "VERB:FORM"
        # In case particle needs to go
        return "VERB"
    # Tricky cases.
    return "OTHER"


def only_orth_change(o_toks, c_toks):
    """Tell whether the two sides differ only in case and/or token boundaries.

    Args:
        o_toks: Iterable of original tokens exposing ``lower_``.
        c_toks: Iterable of corrected tokens exposing ``lower_``.

    Returns:
        True when the lowercased token texts of each side, concatenated
        without a separator, are equal; False otherwise.
    """
    def squash(toks):
        return "".join(tok.lower_ for tok in toks)

    return squash(o_toks) == squash(c_toks)

    


# Input 1: Spacy orig tokens
# Input 2: Spacy cor tokens
# Output: Boolean; the tokens are exactly the same but in a different order
def exact_reordering(o_toks, c_toks):
    """Tell whether both sides hold the same lowercased tokens.

    Args:
        o_toks: Iterable of original tokens exposing ``lower_``.
        c_toks: Iterable of corrected tokens exposing ``lower_``.

    Returns:
        True when the sorted lowercased token texts of both sides are equal,
        False otherwise.
    """
    # Sorted lists rather than sets, so repeated tokens are still counted.
    o_forms = [tok.lower_ for tok in o_toks]
    c_forms = [tok.lower_ for tok in c_toks]
    o_forms.sort()
    c_forms.sort()
    return o_forms == c_forms



def accent(o_toks, c_toks):
    """Compare the accented vowels of the original and corrected text.

    Both arguments are converted with ``str()`` and scanned for the
    characters listed in the module-level ``accents`` (lowercase accented
    vowels only).

    The comparison is based on how many accented vowels each side contains,
    not on how many *distinct* accented vowels it contains. Counting distinct
    vowels reported a missing accent whenever the correction held two or more
    different accented vowels, even if the original held just as many.

    Args:
        o_toks: Original side of the edit (anything ``str()`` turns into text).
        c_toks: Corrected side of the edit (anything ``str()`` turns into text).

    Returns:
        "unn_acc" when the original has more accented vowels than the
        correction, "miss_acc" when it has fewer, "repl_acc" when both have
        the same non-zero number but at different character positions, and
        None otherwise.
    """
    o_text = str(o_toks)
    c_text = str(c_toks)
    # Character positions of every accented vowel on each side.
    o_marks = [idx for idx, char in enumerate(o_text) if char in accents]
    c_marks = [idx for idx, char in enumerate(c_text) if char in accents]

    if len(o_marks) > len(c_marks):
        return "unn_acc"
    if len(o_marks) < len(c_marks):
        return "miss_acc"
    # Same number of accents: only a change of position is reported.
    if o_marks and o_marks != c_marks:
        return "repl_acc"
    return None

    
def final_n(o_toks, c_toks):
    """Detect an edit that only adds or removes a final "ν".

    Both arguments are converted with ``str()`` before comparison, so empty
    input on both sides is handled and simply yields None.

    Args:
        o_toks: Original side of the edit (anything ``str()`` turns into text).
        c_toks: Corrected side of the edit (anything ``str()`` turns into text).

    Returns:
        "miss_fn" when the correction equals the original plus a trailing
        "ν", "unn_fn" when the original equals the correction plus a trailing
        "ν", and None otherwise.
    """
    o_text = str(o_toks)
    c_text = str(c_toks)

    if c_text == o_text + 'ν':
        return "miss_fn"
    if o_text == c_text + 'ν':
        return "unn_fn"
    return None

def simplify(cat):
    """Map a category to its simplified name when one is defined.

    Args:
        cat: Category string.

    Returns:
        The value stored for ``cat`` in ``simple_cats`` when it is a key
        there, otherwise ``cat`` unchanged.
    """
    return simple_cats.get(cat, cat)

Overwriting errant/el/classifier.py


In [17]:
import errant
import argparse
from contextlib import ExitStack
import errant

def parallel(out, orig, cor, tok = False, merge ='rules', lev = False, lang = "en"):
    """Annotate parallel original/corrected files and write an M2 file.

    The input files are read line by line in lockstep. For every non-empty
    original line an "S ..." line is written, followed by one edit line per
    edit (or a noop edit when a correction equals the original) for each
    corrected file, followed by a blank line.

    Args:
        out: Path of the M2 file to write (opened with mode "w").
        orig: Path of the file with the original sentences, one per line.
        cor: List of paths of corrected files; a single path given as a
            string is also accepted.
        tok: Forwarded as the second argument of ``annotator.parse``.
        merge: Forwarded as the fourth argument of ``annotator.annotate``.
        lev: Forwarded as the third argument of ``annotator.annotate``.
        lang: Language code passed to ``errant.load``. Defaults to "en",
            which is what this function always used before; pass "el" to
            use the Greek classifier.

    Returns:
        None. The result is the file written at ``out``.
    """
    # A bare string would otherwise fail in the list concatenation below.
    cor_paths = [cor] if isinstance(cor, str) else list(cor)

    print("Loading resources...")
    # Load Errant
    annotator = errant.load(lang)

    # Every file, including the output, is registered on the ExitStack so it
    # is closed (and the output flushed) even when processing fails.
    # Process an arbitrary number of files line by line simultaneously. Python 3.3+
    # See https://tinyurl.com/y4cj4gth
    with ExitStack() as stack:
        # Open output m2 file
        out_m2 = stack.enter_context(open(out, "w"))

        print("Processing parallel files...")
        in_files = [stack.enter_context(open(path)) for path in [orig] + cor_paths]
        # Process each line of all input files
        for lines in zip(*in_files):
            # Get the original and all the corrected texts
            orig_text = lines[0].strip()
            cor_lines = lines[1:]
            # Skip the line if orig is empty
            if not orig_text:
                continue
            # Parse orig with spacy
            orig_doc = annotator.parse(orig_text, tok)
            # Write orig to the output m2 file
            out_m2.write(" ".join(["S"] + [token.text for token in orig_doc]) + "\n")
            # Loop through the corrected texts
            for cor_id, cor_line in enumerate(cor_lines):
                cor_text = cor_line.strip()
                # If the texts are the same, write a noop edit
                if orig_doc.text.strip() == cor_text:
                    out_m2.write(noop_edit(cor_id) + "\n")
                # Otherwise, do extra processing
                else:
                    # Parse cor with spacy
                    cor_doc = annotator.parse(cor_text, tok)
                    # Align the texts and extract and classify the edits
                    edits = annotator.annotate(orig_doc, cor_doc, lev, merge)
                    # Write each edit to the output m2 file
                    for edit in edits:
                        out_m2.write(edit.to_m2(cor_id) + "\n")
            # Write a newline when we have processed all corrections for each line
            out_m2.write("\n")
          
# Input: A coder id
# Output: A noop edit; i.e. text contains no edits
def noop_edit(id=0):
    """Build the M2 line for a text that contains no edits.

    Args:
        id: Coder id placed in the last field of the line.

    Returns:
        The noop edit string, with fields separated by "|||".
    """
    fields = ["A -1 -1", "noop", "-NONE-", "REQUIRED", "-NONE-", str(id)]
    return "|||".join(fields)

In [None]:
# Annotate orig.txt against one corrected file and write the result to out_m2.
# NOTE(review): the comment in the download cell mentions cor.txt; confirm
# that the corrected file is really named corr.txt.
parallel(out='out_m2', orig='orig.txt', cor=['corr.txt'])