In [1]:
import os
import zipfile
import re
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.util import everygrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tatoebatools import ParallelCorpus
import random
import pinyin

lemmatizer = WordNetLemmatizer()

In [2]:
#The corpus comes as 15 xml files of different genre categories. This takes the zipped folder of files and adds the text of each to one string to be used as one corpus.
corpus = b''
with zipfile.ZipFile('pinyin.zip') as z:
    for filename in z.namelist():
        if not os.path.isdir(filename):
            with z.open(filename) as f:
                corpus += f.read()
corpus[:1000]

b'<?xml version="1.0" encoding="utf-8"?>\r\n<LCMC ver="pinyin"><header><corpusDesc><corpusName>The Lancaster Corpus of Mandarin Chinese </corpusName><creator>Created by the Department of Linguistics, Lancaster University </creator><funding>Funded by the Economic and social Research Council (ESRC), UK </funding><designer>Designed by Anthony McEnery and Zhonghua Xiao </designer><supervision>Supervised by Anthony McEnery </supervision><textcollect>Texts collected by Zhonghua Xiao </textcollect><proofread>Electronic texts proofread and corrected by Zhonghua Xiao and Xin Huang </proofread><POStag>Segmented and POS-tagged by Zhonghua Xiao </POStag><PinyinConv>Converted into Pinyin by NJStar Chinese Word Processor for Windows Version 4.35</PinyinConv><unicodify>Converted into Unicode by Multilingual Corpus Tools (MLCT) developed by Scott Piao and Andrew Wilson </unicodify></corpusDesc><publication><publisher>Department of Linguistics, Lancaster University, LA1 4YT, UK </publisher><availabilit

In [3]:
#make a list of sentences as indicated in the corpus by the sentence tags
xmlSentences = re.findall(b'<s n="\d+"> (.+) </s>', corpus)

sentences = []
for sentence in xmlSentences:
    #remove punctuation and gaps
    sentence = re.sub(b'<c POS="..?">\S+</c>|<gap>omission</gap>', b'', sentence)
    #remove all other tags
    sentence = re.sub(b'<w POS=" ?..? ?">|</. ?>|<s n="....">', b'', sentence)
    #remove other random characters
    sentence = re.sub(b'>', b'', sentence)
    #convert back to character string
    sentence = sentence.decode()
    #strip trailing whitespace
    sentence = sentence.strip()
    #spacing for syllables
    sentence = "".join(sentence.split())
    words = re.findall('.*?[\d+]', sentence)
    sentence = " ".join(words)
    
    sentences.append(sentence)

sentences[:10]

['da4 qiang2 nei4 wai4 bei3 jing1 shi4 jian1 yu4 ji4 shi2 san1',
 'tian2 zhen1 ying3',
 'huo4 xu3 yan2 tong2 yi3 zai4 zhe4 mang2 lu4 zhong1 kai1 shi3 suan4 qing1 le5 wei2 na3 jiu3 shi2 wan4 jin1 liang2 piao4 ta1 ying4 fu4 chu1 de5 ju4 da4 dai4 jia4',
 'ta1 you4 cong2 zhe4 dai4 jia4 zhong1 xue2 xi2 zhao2 zai4 xue2 xiao4 zhong1 cong2 wei4 xue2 guo4 de5 zhi1 shi2 xue2 xi2 zhao2 ren2 sheng1',
 'jiu4 xiao4 li4 ge4 ren2 er2 yan2 ta1 zou3 zhao2 yi1 tiao2 bei1 ju4 shi4 de5 lu4',
 'ruo4 cong2 she4 hui4 de5 jiao3 du4 lai2 kan4 xiao4 li4 de5 pan4 xing2 shi4 jiao4 yu4 de5 shi1 bai4 jia1 ting2 de5 xue2 xiao4 de5',
 'fu4 mu3 qi1 wang4 er2 zi5 cheng2 long2',
 'zhong4 dian3 zhong1 xue2 xue2 sheng1 de5 gui4 guan4 shi3 ta1 yi1 men2 xin1 si1 ba3 zi4 ji3 de5 qian2 tu2 he2 qing1 hua2 bei3 da4 jin3 lian2 yi1 qi3 ta1 chou2 chu2 man3 zhi4 jue2 de5 sheng4 quan4 zai4 wo4',
 'dan4 shi4 gao1 zhong1 bi4 ye4 ti3 jian3 biao3 shang4 de5 se4 mang2 er4 zi4 ba3 ta1 da3 de5 yun1 tou2 zhuan3 xiang4 ta1 bu4 de5 yi3 bao4 le

In [4]:
#word tokenize corpus
tokenized_sentences = [sentence.split() for sentence in sentences]
tokenized_sentences[:1]

[['da4',
  'qiang2',
  'nei4',
  'wai4',
  'bei3',
  'jing1',
  'shi4',
  'jian1',
  'yu4',
  'ji4',
  'shi2',
  'san1']]

In [5]:
#train language model
train, vocabulary = padded_everygram_pipeline(2, tokenized_sentences)
lm = MLE(2)
lm.fit(train, vocabulary)

In [6]:
# #preprocess sample sentence
# sentence = "Wo3 you3dian3 kun4huo4" #means: I'm a little confused
# #all lowercase
# sentence = sentence.lower().split()
# #turn into ngram tuples
# sentence = list(nltk.ngrams(sentence, 2))
# sentence

In [7]:
#get perplexity of sample sentence
# perplexity = lm.perplexity(sentence)
# perplexity

In [8]:
with open('cedict_ts.u8', encoding = 'utf-8') as file:
    text = file.read()
    lines = text.split('\n')
    dict_lines = list(lines)

In [9]:
def parse_line(line):
        if line == '':
            dict_lines.remove(line)
            return 0
        line = line.rstrip('/')
        line = line.rstrip('/')
        if len(line) <= 1:
            return 0
        p = line.split(']')[0]
        e = line.split(']')[1]
        e = re.sub(r'\([^)]*\)', '', e).split('(')[0]
        e2 = e.strip().rstrip('/').lstrip('/').split('/')
        english = [lemmatizer.lemmatize(word) for word in e2]
        pinyin = p.split('[')[1]
        
        pinList.append(pinyin)
        engList.append(english)

In [10]:
pinList = []
engList = []
for line in dict_lines:
    parse_line(line)

In [11]:
z = list(zip(pinList, engList))
z[1200:1300]

[('san1 shi2 liu4 zi4 mu3',
  ['thirty six initial consonants of Song phonetic theory']),
 ('san1 shi2 liu4 ji4',
  ['The Thirty-Six Stratagems, a Chinese essay used to illustrate a series of stratagems used in politics, war, and in civil interaction',
   'all the possible schemes and stratagems']),
 ('san1 shi2 liu4 ji4 , zou3 wei2 shang4 ce4',
  ['of the thirty-six stratagems, the best option is to flee ',
   'the best thing to do is leave']),
 ('san1 shi2 liu4 ji4 , zou3 wei2 shang4 ji4',
  ['see 三十六計，走為上策|三十六计，走为上策[san1 shi2 liu4 ji4 , zou3 wei2 shang4 ce4']),
 ('san1 shi2 nian2 he2 dong1 , san1 shi2 nian2 he2 xi1',
  ['the river flows thirty years to the East, thirty years to the West ',
   'change is the only constant']),
 ('san1 shi2 er2 li4', ['thirty years old and therefore independent']),
 ('san1 qian1 da4 qian1 shi4 jie4', ['cosmos']),
 ('San1 yuan2', ['Sanyuan County in Xianyang 咸陽|咸阳[Xian2 yang2']),
 ('san1 yuan2 ze2', ['the Three Principles']),
 ('San1 yuan2 Xian4', ['San

In [12]:
#     #grade word order
#     #all lowercase
#     chi = chi.lower()
#     #add sentence boundaries
#     #test = "</s> {} </s>".format(test)
#     #split into list of words
#     chi = chi.split()
#     #turn into ngram tuples
#     chi = list(nltk.ngrams(chi, n))
    
#     print(f"perplexity = {lm.entropy(chi)}")

In [13]:
pairs = []

In [14]:
for sentence, translation in ParallelCorpus("cmn", "eng"):
        #characters to pinyin
        p = pinyin.get(sentence.text, format = 'numerical')
        pinyinString = " ".join(re.findall('.*?[\d+]', p))
        #chinese english pairs
        pairs.append((pinyinString, translation.text))       

In [15]:
pair = random.choice(pairs)
pair

('wo3 wan2 quan2 bu4 ren4 shi4 na4 ge4 nv3 ren2',
 "I absolutely don't know that woman.")

In [16]:
s = pair[0]
#s = 'jin1 nian2 wu3 yue4'

#remove punctuation
s = re.sub('\W', "", s)
words = re.findall('.*?[\d+]', s)
s = " ".join(words)

total = 0
everygram = list(list(padded_everygram_pipeline(2, [s.split()])[0])[0])
for gram in everygram:
    if len(gram) == 1:
        pass
#         print(gram[0])
#         print(lm.counts[gram[0]])
#         print()
    else:
        print(gram)
        print(lm.counts[[gram[0]]][gram[1]])
        print()
        total += lm.counts[[gram[0]]][gram[1]]
        average = total/len(s.split())
print(average)

('<s>', 'wo3')
1717

('wo3', 'wan2')
10

('wan2', 'quan2')
283

('quan2', 'bu4')
216

('bu4', 'ren4')
39

('ren4', 'shi4')
17

('shi4', 'na4')
1

('na4', 'ge4')
3

('ge4', 'nv3')
0

('nv3', 'ren2')
0

('ren2', '</s>')
544

283.0


In [26]:
def grade(eng, chi):
    #grade content words
    translation = nltk.word_tokenize(chi)
    english = eng.lower()
    english = nltk.word_tokenize(english)

    lemmaTranslation = [lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos = 'v'), pos = 'n') for word in english]
    
    for i in range(len(pinList)):
        if pinList[i] == 'wan2 quan2':
            print(engList[i])

In [27]:
grade(pair[1], 'wo3 wan2 quan2 bu4 ren4 shi4 na4 ge4 nv3 ren2')

['complete', 'whole', 'totally', 'entirely']
