In [3]:
!pip install nltk
!pip install sacremoses
!pip install pyspellchecker
!pip install contractions

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1
Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m4.0 MB/s[0

In [4]:
from spellchecker import SpellChecker
from sacremoses import MosesTruecaser, MosesTokenizer, MosesPunctNormalizer
import contractions
from enum import Enum
import re
import statistics as stats
import math
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
class PreprocessStrategies(Enum):
  """Names of the preprocessing steps a caller can select.

  Each member carries a distinct integer; ``Preprocessor.set_strategy`` uses
  that integer (``member.value``) as its sort key.
  """
  # NOTE(review): in the visible code ``Preprocessor.process`` never reads the
  # selected strategies, so these members currently act only as labels --
  # confirm whether they are meant to switch pipeline steps on and off.
  NORMALIZE_PUNCT = 1
  REMOVE_PUNCT = 2
  STOPWORDS = 3
  CONTRACTIONS = 4
  SPELLCHECK = 5
  TRUECASE = 6

In [None]:
class Preprocessor:
  """Cleans and filters (Amis, English) sentence pairs.

  Usage: call ``load_text`` with the raw records (this trains the truecasers
  and derives the length-difference bounds), obtain index ranges from
  ``partition`` and run ``process`` on each range.
  """

  # Location the truecase models are written to and read from by default.
  DEFAULT_MODEL_DIR = '/content/drive/MyDrive/MTApplication/models'

  def __init__(self, model_dir: str = DEFAULT_MODEL_DIR):
    """Create the helper objects and empty state.

    Args:
      model_dir: directory in which ``indi.truecasemodel`` and
        ``eng.truecasemodel`` are saved and reloaded by ``load_text``.
    """
    self.spell = SpellChecker()
    self.normalizer = MosesPunctNormalizer()
    self.model_dir = model_dir.rstrip('/')
    self.strategies = []
    self.text = []
    # Populated by load_text(); defined here so the attributes always exist.
    self.truecaser_indi = None
    self.truecaser_eng = None
    # (lower, upper) bounds used by __filter; infinite bounds filter nothing.
    self.f_heuristic = (-math.inf, math.inf)

  @staticmethod
  def __length_diff(source, target):
    """Cube root of the absolute character-length difference of two strings."""
    return math.pow(abs(len(source) - len(target)), 1/3)

  def __remove_punctuation(self, text):
    """Remove punctuation that is not enclosed by word characters.

    A non-space, non-word character is deleted when the character before it
    or the character after it is not a word character; punctuation between
    two word characters (e.g. the apostrophe in ``don't``) is kept.

    The lookahead must follow the punctuation character: placed in front of
    it, it inspects the punctuation itself, is always satisfied, and the
    pattern degenerates into "delete every punctuation character".
    """
    return re.sub(r'(?<!\w)[^\s\w]|[^\s\w](?!\w)', '', text)

  def __expand_contractions(self, text):
    """Return ``text`` passed through ``contractions.fix``."""
    return contractions.fix(text)

  def __spellcheck(self, text):
    """Spell-correct the space-separated words of ``text``.

    Words starting with an uppercase character are kept unchanged. Other
    words are replaced by ``self.spell.correction(word)`` unless that returns
    ``None`` or an empty string, in which case the word is kept.
    """
    out = []
    for word in text.split(' '):
      # NOTE(review): words containing a digit are dropped from the output
      # entirely (not merely left uncorrected) -- confirm this is intended.
      if not word or any(ch.isdigit() for ch in word):
        continue
      if word[0].isupper():
        out.append(word)
      else:
        out.append(self.spell.correction(word) or word)

    return " ".join(out)

  def __normalize_punctuation(self, text):
    """Return ``text`` passed through the Moses punctuation normalizer."""
    return self.normalizer.normalize(text)

  def __train_truecasers(self):
    """Train one truecaser per language on ``self.text`` and load both models.

    Element 0 of every pair feeds the ``indi`` model and element 1 the
    ``eng`` model. Each model is saved under ``self.model_dir`` and then
    loaded back from that file into ``self.truecaser_indi`` /
    ``self.truecaser_eng``.
    """
    tokenizer = MosesTokenizer()
    indi_path = f"{self.model_dir}/indi.truecasemodel"
    eng_path = f"{self.model_dir}/eng.truecasemodel"

    tokenized_indi = [tokenizer.tokenize(pair[0]) for pair in self.text]
    tokenized_eng = [tokenizer.tokenize(pair[1]) for pair in self.text]
    MosesTruecaser().train(tokenized_indi, save_to=indi_path)
    MosesTruecaser().train(tokenized_eng, save_to=eng_path)

    self.truecaser_indi = MosesTruecaser(indi_path)
    self.truecaser_eng = MosesTruecaser(eng_path)

  def __truecase(self, source, target):
    """Truecase both sides; each result is joined back with single spaces."""
    return (" ".join(self.truecaser_indi.truecase(source)),
            " ".join(self.truecaser_eng.truecase(target)))

  def __normalize_spaces(self, text):
    """Split lowercase/uppercase junctions and collapse whitespace runs.

    A space is inserted between an ASCII lowercase letter and a directly
    following ASCII uppercase letter; every run of whitespace then becomes a
    single space. Leading/trailing spaces are not stripped.
    """
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    return re.sub(r'\s+', ' ', text)

  def __remove_paranthetical(self, text):
    """Delete every ``(...)`` group, parentheses included."""
    return re.sub(r'\([^)]*\)', '', text)

  def __handle_one_to_many(self, source, target):
    """Reduce a ``;``-separated target to one alternative for one-word sources.

    When ``source`` is a single space-delimited token and ``target`` contains
    ``;``, the shortest non-blank alternative is returned lowercased. In all
    other cases the pair is returned unchanged.
    """
    if ";" not in target or len(source.split(' ')) != 1:
      return (source, target)

    # Ignore blank alternatives (e.g. from a trailing ';'); otherwise the
    # empty string would always win as the "shortest" one.
    alternatives = [alt for alt in target.split(';') if alt.strip()]
    if not alternatives:
      return (source, target)

    return (source, min(alternatives, key=len).lower())

  def __purge_non_english(self, text):
    """Delete every character outside U+0000-U+05C0 and U+2100-U+214F."""
    return re.sub("[^\u0000-\u05C0\u2100-\u214F]+", '', text)

  def __filter(self, source, target):
    """Return ``True`` when the pair should be discarded.

    A pair is discarded when either side has no words, the source contains
    "no record", the target contains "no chinese record" (both
    case-insensitive), either side has more than 150 words, either side has a
    word longer than 20 characters, or the length difference (see
    ``__length_diff``) is non-zero and below the lower bound or above the
    upper bound of ``self.f_heuristic``.
    """
    source_words = [w for w in source.split(' ') if w != ""]
    target_words = [w for w in target.split(' ') if w != ""]

    if not source_words or not target_words:
      return True

    if "no record" in source.lower() or "no chinese record" in target.lower():
      return True

    if len(source_words) > 150 or len(target_words) > 150:
      return True

    if max(map(len, source_words)) > 20 or max(map(len, target_words)) > 20:
      return True

    diff = self.__length_diff(source, target)
    lower, upper = self.f_heuristic
    return (diff != 0 and diff < lower) or diff > upper

  def __dedupe(self, text):
    """Drop repeated sentences, keeping the first occurrence of each.

    Sentences come from ``nltk.sent_tokenize``; the survivors are joined with
    single spaces.
    """
    return " ".join(dict.fromkeys(nltk.sent_tokenize(text)))

  def __replace_seq(self, text: str, old: str, new: str):
    """Return ``text`` with every occurrence of ``old`` replaced by ``new``."""
    return text.replace(old, new)

  def __calc_fertility_heuristic(self):
    """Set ``self.f_heuristic`` to mean +/- 2 standard deviations.

    The statistic is the ``__length_diff`` of every raw pair in
    ``self.text``. With fewer than two pairs a standard deviation does not
    exist, so the bounds are set to (-inf, inf), which disables the
    length-difference check in ``__filter``.
    """
    diffs = [self.__length_diff(pair[0], pair[1]) for pair in self.text]

    if len(diffs) < 2:
      self.f_heuristic = (-math.inf, math.inf)
      return

    mean = stats.mean(diffs)
    spread = stats.stdev(diffs) * 2
    self.f_heuristic = (mean - spread, mean + spread)

  def partition(self):
    """Split the corpus into consecutive ``(start, stop)`` index ranges.

    The range length is ``len(self.text) // 4`` (at least 1). When the corpus
    size is not a multiple of that length, a final shorter range covers the
    remainder, so more than four ranges can be returned. An empty corpus
    yields an empty list.
    """
    size = len(self.text)
    if size == 0:
      return []

    # size // 4 is 0 for fewer than four pairs, which range() rejects.
    step = max(1, size // 4)
    bounds = [(start, start + step) for start in range(0, size, step)]
    bounds[-1] = (bounds[-1][0], size)
    return bounds

  # The annotation is a string so that defining this class does not require
  # PreprocessStrategies to exist yet.
  def set_strategy(self, strategies: "list[PreprocessStrategies]"):
    """Store the distinct ``strategies`` ordered by their ``value``."""
    # sorted() returns the new list; list.sort() returns None.
    self.strategies = sorted(set(strategies), key=lambda s: s.value)

  def load_text(self, text: list[dict]):
    """Load raw records and prepare the models that ``process`` relies on.

    Args:
      text: records providing the keys ``'Amis'`` and ``'English'``.
    """
    self.text = [(item['Amis'], item['English']) for item in text]
    self.__train_truecasers()
    self.__calc_fertility_heuristic()

  def get_text(self):
    """Return the loaded ``(Amis, English)`` pairs."""
    return self.text

  def process(self, partition: tuple[int, int]):
    """Clean the pairs in ``[partition[0], partition[1])`` and drop bad ones.

    Progress is printed on a single, carriage-return-refreshed line.

    NOTE(review): ``self.strategies`` is not consulted; every step below
    always runs.

    Returns:
      The distinct cleaned ``(indigenous, english)`` tuples that pass
      ``__filter``, in order of first appearance.
    """
    # A dict de-duplicates like a set but keeps insertion order, so the
    # returned list is identical from run to run.
    processed = {}

    for i in range(partition[0], partition[1]):
      print(f"\r{i+1}/{len(self.text)}", end='')
      indigenous, english = self.text[i][0], self.text[i][1]

      indigenous = self.__normalize_punctuation(indigenous)
      english = self.__normalize_punctuation(english)
      english = self.__purge_non_english(english)
      english = self.__replace_seq(english, "sth.", "something")
      english = self.__replace_seq(english, "sb.", "somebody")
      english = self.__replace_seq(english, "-", " ")
      english = self.__expand_contractions(english)
      (indigenous, english) = self.__truecase(indigenous, english)
      english = self.__dedupe(english)
      english = self.__remove_paranthetical(english)
      indigenous = self.__remove_paranthetical(indigenous)
      (indigenous, english) = self.__handle_one_to_many(indigenous, english)
      indigenous = self.__remove_punctuation(indigenous)
      english = self.__remove_punctuation(english)
      english = self.__normalize_spaces(english)
      indigenous = self.__normalize_spaces(indigenous)
      english = self.__spellcheck(english)
      # Every 'o' on the source side becomes 'u' -- presumably an
      # orthography normalization; confirm with the data owner.
      indigenous = self.__replace_seq(indigenous, 'o', 'u')

      if self.__filter(indigenous, english):
        continue
      processed[(indigenous, english)] = None

    return list(processed)