In [1]:
# Statistical Machine Translation System
# English to Cantonese

# IBM Model 1 for Word Translation Task
# Word Alignment based on Relative Positions
# Bi-gram Language Modelling with Laplace Smoothing and Backoff

In [2]:
!pip install nltk
!pip install -q tensorflow



In [3]:
import pickle
import tensorflow as tf

## Input Data

In [4]:
import pandas as pd
import glob
import os

all_csv = glob.glob(os.getcwd() + "/train.csv")  

df_test = pd.read_csv(os.getcwd() + "/test.csv", sep='\t', encoding='utf-8')  

df_from_each_file = (pd.read_csv(f, sep='\t', encoding='utf-8') for f in all_csv)
df = pd.concat(df_from_each_file)

# Check for null
df[df['yue'].isnull()]
df = df.dropna()
df_test = df_test.dropna()

YueChar = True
# Delete spaces between n-gram in Cantonese
# Perform Character based tokenization in Cantonese
if YueChar:
    df['yue'] = df['yue'].str.replace(r' ', '')
    df_test['yue'] = df_test['yue'].str.replace(r' ', '')

df.head()

Unnamed: 0,yue,eng
0,唔準掂同食醃菜唔準坐梳化或者屋企人嘅床上每次經期完咗要洗床單就算床單無汚糟到他們認為我唔純潔...,I was not allowed to touch or eat pickles I wa...
1,佢地扮怪面嚇人,They were making scary faces
2,唔會搞到我哋變得邪惡女人型變成自由黨人咩,turn us into godless sissy liberals
3,呢個模式可以清晰令您瞭解佢哋,So it spells those out in very clean terms
4,幾多萬億掌聲,How many trillions Applause


In [5]:
import regex as re
def spliteKeyWord(str):
    regex = r"[\u4e00-\ufaff]|[0-9]+|[a-zA-Z]+\'*[a-z]*"
    matches = re.findall(regex, str, re.UNICODE)
    return ' '.join(matches)

if YueChar:
    df['yue'] = df['yue'].apply(lambda x: spliteKeyWord(x))
    df_test['yue'] = df_test['yue'].apply(lambda x: spliteKeyWord(x))
df_test.head()

Unnamed: 0,yue,eng
0,我 相 信 主,I believe the almighty
1,好 耐 以 嚟 有 發 展 紊 亂 嘅 小 朋 友,For too long now children with developmental d...
2,一 般 會 遇 到 兩 種 反 應,I have two kinds of reactions
3,但 再 諗 下 嗰 位 官 員 未 必 係 唯 一 睇 小 女 性 嘅 人 呢 種 偏 見 ...,But think about this The IMF official is hardl...
4,佢 將 呢 個 病 毒 傳 畀 BB,She passes that virus on to baby


In [6]:
MAX_LENGTH = 40
inp_lang = 'eng'
tar_lang = 'yue'

def df_filter(df):
    return df[
        df['yue'].apply(lambda x: len(x.split())<MAX_LENGTH) &
        df['eng'].apply(lambda x: len(x.split())<MAX_LENGTH) 
    ]
df = df_filter(df)
df_test = df_filter(df_test)

inp = [i.split() for i in df[inp_lang].to_list()]
tar = [i.split() for i in df[tar_lang].to_list()]
inp_test = [i.split() for i in df_test[inp_lang].to_list()]
tar_test = [i.split() for i in df_test[tar_lang].to_list()]
inp[:3], tar[:3]

([['They', 'were', 'making', 'scary', 'faces'],
  ['turn', 'us', 'into', 'godless', 'sissy', 'liberals'],
  ['So', 'it', 'spells', 'those', 'out', 'in', 'very', 'clean', 'terms']],
 [['佢', '地', '扮', '怪', '面', '嚇', '人'],
  ['唔',
   '會',
   '搞',
   '到',
   '我',
   '哋',
   '變',
   '得',
   '邪',
   '惡',
   '女',
   '人',
   '型',
   '變',
   '成',
   '自',
   '由',
   '黨',
   '人',
   '咩'],
  ['呢', '個', '模', '式', '可', '以', '清', '晰', '令', '您', '瞭', '解', '佢', '哋']])

In [7]:
train_size = len(inp)
test_size = len(inp_test)
train_size, test_size

(6851, 1696)

### Vocabulary

In [8]:
inp_words = {}
tar_words = {}

for sentence in inp:
    for word in sentence:
        if word in inp_words:
            inp_words[word] += 1
        else:
            inp_words[word] = 1
            
for sentence in tar:
    for word in sentence:
        if word in tar_words:
            tar_words[word] += 1
        else:
            tar_words[word] = 1
                    
inp_vocab = len(inp_words)
tar_vocab = len(tar_words)
print("Number of Unique Words:")
print(inp_lang, ':', str(inp_vocab))
print(tar_lang, ':', str(tar_vocab))

Number of Unique Words:
eng : 6219
yue : 2705


# Training and checkpointing

In [9]:
# creating the 't'
t = {}
# usage: t[('inp_word', 'tar_word')] = probability of inp_Word given tar_word
uniform = 1 / (inp_vocab * tar_vocab)

hyperparameter

In [None]:
max_iters = 128
word_factor_max = 4

fine_tune = 1
has_converged = False

In [10]:
run_id = f"SMT-4_EngYue"
log_dir = os.path.join(os.path.join(os.getcwd(), 'log'), run_id)
%load_ext tensorboard
%tensorboard --logdir {log_dir}

ERROR: Failed to launch TensorBoard (exited with 2).
Contents of stderr:
2021-04-16 05:32:01.937073: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-04-16 05:32:01.937209: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
usage: tensorboard [-h] [--helpfull] [--logdir PATH] [--logdir_spec PATH_SPEC]
                   [--host ADDR] [--bind_all] [--port PORT]
                   [--purge_orphaned_data BOOL] [--db URI] [--db_import]
                   [--inspect] [--version_tb] [--tag TAG] [--event_file PATH]
                   [--path_prefix PATH] [--window_title TEXT]
                   [--max_reload_threads COUNT] [--reload_interval SECONDS]
                   [--reload_task TYPE] [--reload_multifile BOOL]
                   [--reload_multifile_inactive_secs SECONDS]
                   [--generic_data

In [12]:
import time
n_iters = 0

while n_iters < max_iters and has_converged == False:
    start = time.time()
    has_converged = True
    max_change = -1

    n_iters += 1
    count = {}
    total = {}
    for index in range(train_size):
        s_total = {}
        for inp_word in inp[index]:
            s_total[inp_word] = 0
            for tar_word in tar[index]:
                if (inp_word, tar_word) not in t:
                    t[(inp_word, tar_word)] = uniform
                s_total[inp_word] += t[(inp_word, tar_word)]

        for inp_word in inp[index]:
            for tar_word in tar[index]:
                if (inp_word, tar_word) not in count:
                    count[(inp_word, tar_word)] = 0
                count[(inp_word, tar_word)] += (t[(inp_word, tar_word)] / s_total[inp_word])

                if tar_word not in total:
                    total[tar_word] = 0
                total[tar_word] += (t[(inp_word, tar_word)] / s_total[inp_word])

    # estimating the probabilities

    if fine_tune == 0:
      updated = {}
      # train for all valid word pairs s.t count(inp_word, tar_word) > 0
      for index in range(train_size):
          for tar_word in tar[index]:
              for inp_word in inp[index]:
                  if (inp_word, tar_word) in updated:
                      continue
                  updated[(inp_word, tar_word)] = 1
                  if abs(t[(inp_word, tar_word)] - count[(inp_word, tar_word)] / total[tar_word]) > 0.01:
                      has_converged = False
                      max_change = max(max_change, abs(t[(inp_word, tar_word)] - count[(inp_word, tar_word)] / total[tar_word]))
                  t[(inp_word, tar_word)] = count[(inp_word, tar_word)] / total[tar_word]

    elif fine_tune == 1:
      # train it only for 1000 most frequent words in English and Cantonese
      n_tar_words = 0
      updates = 0

      for tar_word_tuples in sorted(tar_words.items(), key = lambda k:(k[1], k[0]), reverse = True):
          tar_word = tar_word_tuples[0]
          n_tar_words += 1
          if n_tar_words > len(tar_words)/word_factor_max:
              break
          n_inp_words = 0
          for inp_word_tuples in sorted(inp_words.items(), key = lambda k:(k[1], k[0]), reverse = True):
              inp_word = inp_word_tuples[0]
              n_inp_words += 1
              if n_inp_words > len(inp_words)/word_factor_max:
                  break
              if (inp_word, tar_word) not in count or tar_word not in total:
                  continue
                  # assume in this case: t[(inp_word, tar_word)] = uniform
              else:
                  if abs(t[(inp_word, tar_word)] - count[(inp_word, tar_word)] / total[tar_word]) > 0.005:
                      has_converged = False
                      max_change = max(max_change, abs(t[(inp_word, tar_word)] - count[(inp_word, tar_word)] / total[tar_word]))
                  t[(inp_word, tar_word)] = count[(inp_word, tar_word)] / total[tar_word]
                
    summary_writer = tf.summary.create_file_writer(log_dir)
    with summary_writer.as_default():
        tf.summary.scalar("Change", max_change, step=n_iters)

    print("Iteration " + str(n_iters) + " Completed, Maximum Change: " + str(max_change) + '\nTime: {} secs'.format(time.time() - start))


Iteration 1 Completed, Maximum Change: 0.10061728323390656
Time: 14.505329608917236 secs
Iteration 2 Completed, Maximum Change: 0.059777312117207104
Time: 14.042763948440552 secs
Iteration 3 Completed, Maximum Change: 0.038890445614923885
Time: 11.707644939422607 secs
Iteration 4 Completed, Maximum Change: 0.031511048716855894
Time: 14.11092209815979 secs
Iteration 5 Completed, Maximum Change: 0.026656764333911676
Time: 13.092541456222534 secs
Iteration 6 Completed, Maximum Change: 0.023829517510665296
Time: 11.382084608078003 secs
Iteration 7 Completed, Maximum Change: 0.02870980011454298
Time: 16.10074019432068 secs
Iteration 8 Completed, Maximum Change: 0.028258624414520178
Time: 13.923482656478882 secs
Iteration 9 Completed, Maximum Change: 0.02338470113218183
Time: 12.678022623062134 secs
Iteration 10 Completed, Maximum Change: 0.017378590410586287
Time: 11.37423062324524 secs
Iteration 11 Completed, Maximum Change: 0.012408867866056161
Time: 12.32548189163208 secs
Iteration 12 Co

In [13]:
# displaying the most confident translation pairs
limit = 40
for element in sorted(t.items(), key = lambda k:(k[1], k[0]), reverse = True):
  print(element)
  limit -= 1
  if limit <= 0:
    break

(('Applause', '掌'), 0.9815791241298171)
(('Guitar', '吉'), 0.9688366279043082)
(('Laughter', '笑'), 0.9393978572985704)
(('or', '或'), 0.817416290652338)
(('fell', '跌'), 0.7248889905258226)
(('cancer', '癌'), 0.7102642291999152)
(('play', '玩'), 0.6940455900997853)
(('and', '和'), 0.6539258955800603)
(('I', '我'), 0.6445171903608035)
(('because', '因'), 0.634531054646612)
(('you', '你'), 0.6324926858713258)
(('and', '及'), 0.6134175614713324)
(('like', '似'), 0.6084316639120902)
(('serious', '嚴'), 0.6048784165103847)
(('information', '息'), 0.6043693153370403)
(('Applause', '鼓'), 0.5975765159157658)
(('apartment', '寓'), 0.5907856128049263)
(('Thank', '謝'), 0.5896039851766257)
(('Bible', '聖'), 0.5879872359032474)
(('green', '綠'), 0.5877173906474982)
(('love', '愛'), 0.5634707525522883)
(('revolution', '革'), 0.5556046725470167)
(('carousel', '迴'), 0.5516193436696922)
(('know', '知'), 0.549194185436135)
(('Jesus', '穌'), 0.5424091988607987)
(('EEG', 'EEG'), 0.53769647313886)
(('two', '兩'), 0.53642172265

In [14]:
# saving the translation model
file = open("translation_model.pkl","wb")
pickle.dump(t, file)
file.close()

In [15]:
# using the model trained until convergence
# to use a saved model
model_name = "translation_model.pkl"
pickle_in = open(model_name,"rb")
t = pickle.load(pickle_in)

In [16]:
I = {}
for index in range(train_size):
    for inp_id in range(len(inp[index])):
        length = len(inp[index])
        if length not in I:
            I[length] = {} # maps the positional difference to a tuple: (sum of t's, count)
        for tar_id in range(len(tar[index])):
            if (tar_id - inp_id) not in I[length]:
                I[length][(tar_id - inp_id)] = [t[(inp[index][inp_id], tar[index][tar_id])], 1]
            else:
                I[length][(tar_id - inp_id)][0] += t[(inp[index][inp_id], tar[index][tar_id])]
                I[length][(tar_id - inp_id)][1] += 1

In [17]:
# viewing the available sentence lengths encountered during training
sentence_lengths = []
for key in I.keys():
    if key not in sentence_lengths:
        sentence_lengths.append(key)
sentence_lengths.sort()
print(sentence_lengths)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 37, 38]


In [18]:
# computing the alignment probabilities
# p[I][tar_id - inp_id] = p(i | i', I)

p = {}
for key in I.keys():
    p[key] = {}
    sum_val = 0
    for diff in I[key].keys():
        p[key][diff] = I[key][diff][0] / I[key][diff][1]
        sum_val += p[key][diff]
    for diff in p[key].keys():
        p[key][diff] /= sum_val

In [19]:
for index in range(train_size):
    length_inp = len(inp[index])
    length_tar = len(tar[index])
    if length_tar - length_inp > 10 and length_inp == 1:
        print("Length of English Sentence:", str(length_inp))
        print("Length of Cantonese Sentence:", str(length_tar))
        
# there exists an English sentence with one token s.t the Cantonese translation contains 19 tokens

Length of English Sentence: 1
Length of Cantonese Sentence: 14
Length of English Sentence: 1
Length of Cantonese Sentence: 13


In [20]:
# computing initial transitions
init = {}
for length in p:
    max_prob = -1
    max_jump = 0
    for key in p[length].keys():
        if p[length][key] > max_prob:
            max_prob = p[length][key]
            max_jump = key
    init[length] = max_jump

In [21]:
# computing the transition probabilities for Cantonese
bigrams = {}
unigrams = {}

# training on the train_set
def model(dataset_size, dataset_name):
    global bigrams
    global unigrams
    for index in range(dataset_size):
        token_A = ''
        for tar_token in tar[index]:
            if tar_token not in unigrams:
                unigrams[tar_token] = 1
            else:
                unigrams[tar_token] += 1
            
            token_B = tar_token
            if (token_A, token_B) not in bigrams:
                bigrams[(token_A, token_B)] = 1
            else:
                bigrams[(token_A, token_B)] += 1
            token_A = token_B

model(train_size, 'tar_train')

bigram_count = len(bigrams)
unigram_count = len(unigrams)
print("Number of Unique Bigrams:", bigram_count)
print("Number of Unique Unigrams:", unigram_count)

Number of Unique Bigrams: 27343
Number of Unique Unigrams: 2705


In [22]:
from itertools import permutations

computed_sentences = []
total_BLEU = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 7: 0}
null_BLEU_count = 0

sorted_t = sorted(t.items(), key = lambda k:(k[1], k[0]), reverse = True)

def find_translation(inp_token):
    for element in sorted_t:
        if element[0][0].lower() == inp_token:
            return element[0][1]
    return ""

def get_prob(seq):
    # bigram language model with laplace smoothing and backoff
    if len(seq) < 2 or len(seq) > 10:
        return 1
    score = 0
    token_A = ''
    for tar_token in seq:
        token_B = tar_token
        if (token_A, token_B) not in bigrams:
            if token_B not in unigrams:
                continue
            else:
                score += unigrams[token_B] / unigram_count
        else:
            base_token_count = 0
            if token_A in unigrams:
                base_token_count = unigrams[token_A]
            score += (bigrams[(token_A, token_B)] + 1) / (base_token_count + unigram_count)
        token_A = token_B
    return score

In [23]:
TotalTime = time.time()
start = time.time()

ori = []
ref = []
can = []
num_test = 0

for index in range(test_size):

    translated_words = []
    for inp_token in inp_test[index]:
        translation = find_translation(inp_token)
        if translation != "":
            translated_words.append(translation)
    perm = permutations(translated_words)
    best_seq = translated_words
    best_prob = -1

    if len(inp_test[index]) < 10 and len(inp_test[index]) > 2:
        for seq in perm:            
            prob = get_prob(seq)
            if prob > best_prob:
                best_prob = prob
                best_seq = seq
            
    ori.append(inp_test[index])
    ref.append(tar_test[index])
    can.append(best_seq)
    num_test += 1
    
    if num_test % 50 == 0:
        print(f'Progress: {num_test} / {test_size} in {time.time()-start} s')
        start = time.time()
        
print(f'\nToTal: {num_test} / {test_size} in {time.time()-TotalTime} s')

Progress: 50 / 1696 in 7.060065507888794 s
Progress: 100 / 1696 in 6.184078693389893 s
Progress: 150 / 1696 in 9.875119686126709 s
Progress: 200 / 1696 in 8.030096769332886 s
Progress: 250 / 1696 in 5.2630932331085205 s
Progress: 300 / 1696 in 6.4435882568359375 s
Progress: 350 / 1696 in 9.843735218048096 s
Progress: 400 / 1696 in 5.851065635681152 s
Progress: 450 / 1696 in 7.193043231964111 s
Progress: 500 / 1696 in 7.778099298477173 s
Progress: 550 / 1696 in 6.250096321105957 s
Progress: 600 / 1696 in 6.352885007858276 s
Progress: 650 / 1696 in 7.525693416595459 s
Progress: 700 / 1696 in 6.998638391494751 s
Progress: 750 / 1696 in 7.3835060596466064 s
Progress: 800 / 1696 in 9.561376571655273 s
Progress: 850 / 1696 in 8.418327569961548 s
Progress: 900 / 1696 in 7.2670738697052 s
Progress: 950 / 1696 in 8.114732265472412 s
Progress: 1000 / 1696 in 7.295686721801758 s
Progress: 1050 / 1696 in 7.541095495223999 s
Progress: 1100 / 1696 in 4.974968671798706 s
Progress: 1150 / 1696 in 7.36

In [29]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction

import random
ran = random.randint(1, len(ref))
print(ori[ran])
print(ref[ran])
print(can[ran])

smoothie = SmoothingFunction().method4
print('BLEU:', sentence_bleu(ref[ran], can[ran], smoothing_function=smoothie)*100)

['In', 'time', 'the', 'doctor', 's', 'potent', 'potion', 'and', 'uncanny', 'instincts', 'became', 'known', 'throughout', 'the', 'land', 'He', 'grew', 'rich', 'and', 'famous', 'casting', 'off', 'the', 'hardships', 'of', 'his', 'early', 'life']
['而', '契', '仔', '只', '可', '以', '就', '手', '旁', '觀', '憑', '住', '呢', '樽', '藥', '水', '同', '神', '奇', '嘅', '直', '覺', '契', '仔', '好', '快', '就', '成', '為', '遠', '近', '聞', '名', '嘅', '神', '醫']
['次', '界', '醫', '墨', '和', '豆', '深', '麥', '界', '土', '長', '逐', '和', '過', '熄', '界', '標', '蠟', '右', '命']
BLEU: 14.0817820723198


In [30]:
# Sentence-based and average score
score = 0
# for i in range(len(ref)):
#     r = ref[i]
#     c = can[i]
#     score += sentence_bleu(r, c, smoothing_function=smoothie)*100
# print('BLEU-s:', score/len(ref))

# Corpus based, summing all nominator and denominator before division
# r = [[r.split()] for r in ref]
# c = [c.split() for c in can]
score = corpus_bleu(ref, can, smoothing_function=smoothie)*100
print('BLEU-c:', score)

BLEU-c: 0.1733453072847488
