In [1]:
import nlp2
import ngrams
import re
from collections import defaultdict

In [2]:
def load_charmap(filename):
    vocab = set()
    pro_dict = defaultdict(list)
    with open(filename) as f:
        for line in f:
            chinese_char, pron = line.split()
            pro_dict[pron].append(chinese_char)
            vocab.add(chinese_char)
    return pro_dict, vocab


In [3]:
charmap, vocab = load_charmap('data/charmap')
print (len(charmap['yi']))

484


In [11]:
def chinese_tokenize(text):
    tokens = []
    token = ''
    for c in text:
        # Accumulate letters and numbers
        if re.match(r'[a-zA-Z_0-9]', c):
            token += c
        # For other characters add to token list
        else:
            if token:
                tokens.append(token)
                token = ''

            tokens.append(c.replace(' ', '<space>'))
    # If we have accumulated letters or numbers add to tokens
    if token:
        tokens.append(token)
    return tokens


# Start and end tokens
start = ['<s>']
end = ['</s>']

#for sample in training_data[-20:]:
#    print (sample)

training_data = []
with open('data/train.han') as f:
    for line in f:
        # Tokenize
        sample = chinese_tokenize(line.strip('\n'))
        # Add sample (with start and end tokens)
        training_data.append(start + sample + end)
        

class WittenBellModel():
    def __init__(self, order, training_data):
        
        # Get total number of tokens
        self.N = sum([len(sent) for sent in training_data])

        # Initialize models (up to order)
        self.models = []
        for i in range(1, order+1):
            self.models.append(nlp2.train(ngrams.Model, i, training_data))
            
        self.n_grams = []
        for i in range(1,order):
            cur_grams = self.create_ngrams(i,training_data)
            self.n_grams.append(cur_grams)
        print("Done creating ngrams!")
            
    def create_ngrams(self, n, data):
        n_grams = {}
        n_gram = ()
        
        for sentence in training_data:
            for i in range(max(0,len(sentence)-n-1)):
                n_plus_gram = tuple([sentence[j] for j in range(i,i+n+1)])
                if n_plus_gram[:-1] not in n_grams:
                    n_grams[n_plus_gram[:-1]] = [n_plus_gram]
                else:
                    n_grams[n_plus_gram[:-1]].append(n_plus_gram)
                #print(n_gram)
        for key in n_grams:
            n_grams[key] = set(n_grams[key])
        return n_grams
            
    def prob(self, ctxt, word):
        model = self.models[len(ctxt)]
        # If unigram (no context)
        if not ctxt:
            lambda_u = self.N / (self.N + len(model.vocabulary()))
            # Try to calculate max likelihood
            try:
                max_likelihood = model.freq(ctxt, word)/self.N
            except:
                max_likelihood = 0
            return lambda_u * max_likelihood + (1 - lambda_u) * (1 / len(model.vocabulary()))

        # If bigram or higher
        
        
        #----------------------------------------
        
        
        #-----------------------------------------
        
        #-----------------------------------------
        #v_model = self.models[len(ctxt)-1]
        #for w in v_model.vocabulary():
        #    if v_model.prob(ctxt[:-1],w):
        #        v1 +=1
        #print("V1: ", v1)
        #------------------------------------------
        
        lambda_u = 0.5
        #print("V1: ",v1)
        #print("TOTAL: ", model.total(ctxt))
        try:
            v1 = len(self.n_grams[len(ctxt)-1][ctxt])
            lambda_u = model.total(ctxt) / (model.total(ctxt) + v1)
        except:
            lambda_u = 0
            
        #print(lambda_u)
        # Try to calculate max likelihood
        try:
            max_likelihood = model.freq(ctxt, word) / model.total(ctxt)
        except:
            max_likelihood = 0
             
        # Make recursive call
        return lambda_u * max_likelihood + (1 - lambda_u) * self.prob(ctxt[1:], word)



In [5]:
# Read in test pronounciations
test_x = []
with open('data/test.pin') as f:
    for line in f:
        test_x.append(line.strip('\n').split()) 

# Read in chinese test characters
test_y = []
with open('data/test.han') as f:
    for line in f:
        test_y.append(line.split())

In [13]:
order = 4
model = WittenBellModel(order, training_data)

preds = []
for sent in test_x:
    pred = []
    for i, word in enumerate(sent):
        if len(word) == 1 or word not in charmap:
            pred.append(word)
            continue
        
        # Select best candidate
        best = {'word': '', 'p': 0}
        for cand in charmap[word]:
            ctxt = ()
            if order > 1:
                pred_tokens = start + chinese_tokenize(pred)
                if order-1 > len(pred_tokens):
                    ctxt = tuple(pred_tokens)
                else:
                    ctxt = tuple(pred_tokens[-order+1:])
                
            p = model.prob(ctxt, cand)
            if p > best['p']:
                best = {'word': cand, 'p': p}
        pred.append(best['word'])
        
    preds.append("".join(pred).split('<space>'))

# Calculate accuracy as frequency of correct classified characters
correct = 0
total_c = 0
for pred, golden in zip(preds, test_y):
    for p_w, g_w in zip(pred, golden):
        for p_c, g_c in zip(p_w, g_w):
            if p_c == g_c:
                correct += 1
            total_c += 1
print (correct / total_c)

Done creating ngrams!
0.8179366149696561
