# Character Ngram Language Detector

In [12]:
import sys
import re
import io
import os
from collections import Counter

In [2]:
def toWords(text):  # separates punctuation
    token = re.compile(u'[^\s\w-]|[\w-]+', re.UNICODE) #pattern to identify words, punct, and hyphenated words
    tokens = re.findall(token, text)
    return [word.lower() for word in tokens]

In [3]:
# Character Ngrams
import itertools
import math

def getNGrams(text, n):
  text = (" " * (n - 1)) + text + " "
  return [text[i:i+n] for i in range(len(text) - n + 1)]


""" Creates the conditional frequency distribution

    @param words a list of words from the training data
    @param n the length of an n-gram
    @return a mapping of context (unique substring of the first n-1 characters)
    to endings (last character) and their frequencies

  for each sentence in list of sentences:
        get n-grams using above method
        group by first n-1 characters (context)
        for each unique context, create a map of last character counts
          e.g., if you have the ngrams "chair" and "chain":
                the context would be "chai" and the last character counts
                would be "r -> 1" and "n -> 1"
        add the [ context -> last character counts ] mapping to your
        ultimate CFD (also a map)
          e.g., your CFD  would have lots of entries that look like:
                "chai" -> {"r" -> 1, "n -> 1}
                as an key-value pair.
"""
def getConditionalCounts(words, n):
  condCounts = {}
  for word in words:
    ngrams = getNGrams(word, n)
    for gram in ngrams:
      context, lastChar = gram[:n - 1], gram[-1]
      condCounts.setdefault(context, {}).setdefault(lastChar, 0)
      condCounts[context][lastChar] += 1
  return condCounts



class CharNGram:
  def __init__(self, language, conditionalCounts, n, numLetters=26):
    self.language = language
    self.condCounts = conditionalCounts
    self.n = n
    self.numLetters = numLetters
    self._getNormalizedCounts()

  def _getNormalizedCounts(self):
    for ctx, counts in self.condCounts.items():
      for lastChar, count in counts.items():
        self.condCounts[ctx][lastChar] = (count + 1)/float(self.numLetters)

    """
    Using conditional frequency distribution,
    calculate and return p(c | ctx)
    """
  def ngramProb(self, ctx, c):
    return self.condCounts.get(ctx, {}).get(c, 1.0/float(self.numLetters))

    """ Multiply ngram probabilites for each ngram in word """
  def wordProb(self, word):
    prob = 1.0
    for ctx, counts in getConditionalCounts([word], self.n).items():
      for lastChar, count in counts.items():
        prob *= self.ngramProb(ctx, lastChar) * count
    return math.log(prob)


In [4]:
class CodeSwitchedLanguageModel:
  def __init__(self, models):
    self.models = models

  def guess(self, word):
    highestProb = max(model.wordProb(word.lower()) for model in self.models)
    guess = [model for model in self.models
                   if model.wordProb(word.lower()) == highestProb]
    return guess[0].language

  def prob(self, language, word):
    return [model for model in self.models
                  if model.language == language][0].wordProb(word.lower())


In [5]:
class Annotation:
    def __init__(self, words, cslm):
        self.words = words
        self.cslm = cslm
        self.tagSet = [u"Eng", u"Spn"]
        self.lang = []
        self.engProbs = []
        self.spnProbs = []
        self._generateTags()

    def _generateTags(self):
        punct_pattern = re.compile(u'[^\w\s]', re.UNICODE)
        for k, word in enumerate(self.words):

            # annotate punct and move to next token
            if re.match(punct_pattern, word):
                self.lang.append('Punct')
                self.engProbs.append("NA")
                self.spnProbs.append("NA")
                continue

            # annotate numbers and move to next token
            if re.match(u'\d', word):
                self.lang.append('Num')
                self.engProbs.append("NA")
                self.spnProbs.append("NA")
                continue

            # for lexical tokens determine lang tag
            spnProb = self.cslm.prob("Spn", word); self.spnProbs.append(spnProb)
            engProb = self.cslm.prob("Eng", word); self.engProbs.append(engProb)
            
            if .9 < engProb/spnProb < 1.1:
                print(word)
                try:
                    lang = self.lang[k-1] # default to previous language tag
                    self.lang.append(lang + "?")
                except IndexError:
                    lang = self.cslm.guess(word)
                    self.lang.append(lang)
            else:
                lang = self.cslm.guess(word)
                self.lang.append(lang)

In [6]:
class langDetector:
    def __init__(self):
        self.cslm = self._setup()

    def _setup(self):
        n = 4
        engPath = "Data/EnglishTrainingCorpus.txt"
        engData = toWords(io.open(engPath, 'r', encoding='utf8').read())

        spnPath = "Data/SpanishTrainingCorpus.txt"
        spnData = toWords(io.open(spnPath, 'r', encoding='utf8').read())

        enModel = CharNGram('Eng', getConditionalCounts(engData, n), n)
        esModel = CharNGram('Spn', getConditionalCounts(spnData, n), n)

        return CodeSwitchedLanguageModel([enModel, esModel])

    def tag(self, text_list):
        # annotation_lists = []
        hmm = Annotation(text_list, self.cslm)
        return hmm.lang


In [7]:
langDetector = langDetector()

In [8]:
langDetector.tag(["hola", "hello"])

['Spn', 'Eng']

In [9]:
KC = """En Sudáfrica, llegué a creer que la vida era eso, precisely: the otherwordly, ephemeral beauty de los jacarandaes and, in equal measure, the clawing loneliness of having just three people en todo el continente africano."""

In [10]:
KC_tokens = toWords(KC)
KC_tags = langDetector.tag(KC_tokens)
print(len(KC_tokens), len(KC_tags))
for token, tag in zip(KC_tokens, KC_tags):
    print(token,tag)

a
africano
42 42
en Spn
sudáfrica Spn
, Punct
llegué Spn
a Spn?
creer Spn
que Spn
la Spn
vida Spn
era Spn
eso Spn
, Punct
precisely Eng
: Punct
the Eng
otherwordly Eng
, Punct
ephemeral Eng
beauty Eng
de Spn
los Spn
jacarandaes Spn
and Eng
, Punct
in Eng
equal Eng
measure Eng
, Punct
the Eng
clawing Eng
loneliness Eng
of Eng
having Eng
just Eng
three Eng
people Eng
en Spn
todo Spn
el Spn
continente Spn
africano Spn?
. Punct


## Your turn / ¡Te toca!

Test the character ngram model on codeswitched book and interview. 
For each document, what percentage is Spanish? English? Which is more balanced between the two languages?

In [11]:
cs_book = open("Data/CodeswitchedBook.txt").read()
cs_interview = open("Data/Spanish_in_Texas_subset.txt").read()