<a href="https://colab.research.google.com/github/hey0wing/Machine-Translation-models-for-Cantonese-English-pair/blob/main/FYP_Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import glob
!pip install --upgrade pycantonese
import pycantonese as pc
import re
import unicodedata
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [29]:
# Mount Google Drive into the Colab runtime; the corpus files are read from
# a folder underneath this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [90]:
# Defining variables

# Data split: fraction of the sentence pairs used for training.
train_size_ratio = 0.8

# Tokenizer settings: vocabulary cap and the placeholder for unseen words.
vocab_size = 10000
oov_tok = "<OOV>"

# pad_sequences settings: pad and truncate at the end of each sequence.
padding_type = 'post'
trunc_type = 'post'

# Width of the Embedding layer's output vectors.
embedding_dim = 16

In [91]:
# read corpus data
# Every *_double.txt file holds the parallel corpus as alternating non-empty
# lines: a Cantonese sentence followed by its English translation.
corpus = []
# glob returns matches in arbitrary order; sorting keeps the corpus order (and
# therefore the train/test split below) identical from run to run.
list_of_files = sorted(glob.glob('/content/drive/My Drive/FYP corpus/*_double.txt'))
for file_name in list_of_files:
  # `with` closes the file even if reading fails; the encoding is explicit so
  # the Cantonese text does not depend on the platform default.
  with open(file_name, 'r', encoding='utf-8') as f:
    pair_lines = [x for x in f.read().strip().splitlines() if x]
  # An odd number of lines would shift every later sentence by one and pair
  # Cantonese sentences with the wrong English ones, so fail loudly instead.
  if len(pair_lines) % 2 != 0:
    raise ValueError(
        "Expected alternating Cantonese/English lines but found an odd number "
        "of non-empty lines (" + str(len(pair_lines)) + ") in " + file_name)
  corpus.extend(pair_lines)
corpus_size = len(corpus) // 2
print("Corpus size: " + str(corpus_size))
training_size = int(corpus_size*train_size_ratio)

# separate Cantonese and English data
# even positions are Cantonese, odd positions are the matching English lines
corpus_yue = corpus[::2]
corpus_eng = corpus[1::2]
print(corpus_yue[0:9])
print(corpus_eng[0:9])

Corpus size: 322
['打還打嗎', '係要摷亂個頭', '咁梗係啦', '唔係幫手做咩呀', '咁就可以講個笑話啦', '我驚你今日冇咁順利', '你驚我返嚟搞破壞？', '唔會啩咁秘密', '我唔係講你daddy呀']
['Fighting is okay.', "But don't mess up my hair.", 'Of course.', "Or else, I wouldn't ask for your help.", 'Then, can I be honest?', 'I have a bad feeling about this.', 'Will Dad stop the wedding?', 'How will he find out?', 'Forget about your dad.']


In [61]:
def unicode_to_ascii(s):
  """Return `s` with combining marks removed.

  The string is NFD-normalised so accented characters split into a base
  character plus combining marks, then every character in Unicode category
  'Mn' is dropped. Despite the name, other non-ASCII characters are kept
  unchanged.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)


def preprocess_sentence(w):
  """Normalise one sentence into lowercase, space-separated tokens.

  The sentence is lowercased, stripped and passed through unicode_to_ascii.
  Three substitutions then run in order, and the result is stripped again.
  Example: "But don't mess up my hair." => "but don t mess up my hair ."
  """
  substitutions = (
      # put a space on both sides of each punctuation mark
      # eg: "he is a boy." => "he is a boy ."
      # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
      (r"([?.!,¿])", r" \1 "),
      # collapse runs of spaces and double quotes into a single space
      (r'[" "]+', " "),
      # replace everything except letters and ? . ! , ¿ with a space
      (r"[^a-zA-Z?.!,¿]+", " "),
  )

  text = unicode_to_ascii(w.lower().strip())
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)
  return text.strip()

In [63]:
# Cantonese: split every sentence into a list of words with pycantonese
preprocess_yue = [pc.segment(yue_line) for yue_line in corpus_yue]
print(preprocess_yue[0:10])

# English: normalise every sentence with preprocess_sentence (result stays a string)
preprocess_eng = [preprocess_sentence(eng_line) for eng_line in corpus_eng]
print(preprocess_eng[0:10])

[['打', '還', '打', '嗎'], ['係要', '摷', '亂', '個頭'], ['咁', '梗係啦'], ['唔係', '幫手', '做咩', '呀'], ['咁', '就可以', '講', '個', '笑話', '啦'], ['我', '驚', '你', '今日', '冇', '咁', '順利'], ['你', '驚', '我', '返嚟', '搞', '破壞', '？'], ['唔會啩', '咁', '秘密'], ['我', '唔係', '講', '你', 'daddy', '呀'], ['我', '講', '新郎哥', '個', '小姐', '放心', '啦']]
['fighting is okay .', 'but don t mess up my hair .', 'of course .', 'or else , i wouldn t ask for your help .', 'then , can i be honest ?', 'i have a bad feeling about this .', 'will dad stop the wedding ?', 'how will he find out ?', 'forget about your dad .', 'where s your groom ?']


In [65]:
# Separating training and testing set
# Both languages are cut at the same index so sentence pairs stay aligned:
# the first `training_size` pairs are for training, the remainder for testing.
training_sentences_yue, testing_sentences_yue = (
    preprocess_yue[:training_size],
    preprocess_yue[training_size:],
)
training_sentences_eng, testing_sentences_eng = (
    preprocess_eng[:training_size],
    preprocess_eng[training_size:],
)

In [94]:
# tokenizing and padding for Cantonese
# The tokenizer is fitted on the training sentences only; the testing
# sentences are converted with that same fitted vocabulary.
tokenizer_yue = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer_yue.fit_on_texts(training_sentences_yue)
word_index_yue = tokenizer_yue.word_index

training_sequences_yue = tokenizer_yue.texts_to_sequences(training_sentences_yue)
testing_sequences_yue = tokenizer_yue.texts_to_sequences(testing_sentences_yue)
# length of the longest training sequence
max_length_yue = len(max(training_sequences_yue, key=len))

# Pad to this language's own maximum length. The previous code passed
# `max_length`, which is not assigned in any cell of this notebook.
training_padded_yue = pad_sequences(training_sequences_yue, maxlen=max_length_yue, padding=padding_type, truncating=trunc_type)
testing_padded_yue = pad_sequences(testing_sequences_yue, maxlen=max_length_yue, padding=padding_type, truncating=trunc_type)

print("Padding size for Cantonese: " + str(max_length_yue))
print(training_padded_yue)

# tokenizing and padding for English
tokenizer_eng = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer_eng.fit_on_texts(training_sentences_eng)
word_index_eng = tokenizer_eng.word_index

training_sequences_eng = tokenizer_eng.texts_to_sequences(training_sentences_eng)
testing_sequences_eng = tokenizer_eng.texts_to_sequences(testing_sentences_eng)
# length of the longest training sequence
max_length_eng = len(max(training_sequences_eng, key=len))

training_padded_eng = pad_sequences(training_sequences_eng, maxlen=max_length_eng, padding=padding_type, truncating=trunc_type)
testing_padded_eng = pad_sequences(testing_sequences_eng, maxlen=max_length_eng, padding=padding_type, truncating=trunc_type)

print("Padding size for English: " + str(max_length_eng))
print(training_padded_eng)

Padding size for Cantonese: 27
[[ 47  79  47 ...   0   0   0]
 [229 230 231 ...   0   0   0]
 [  7 118   0 ...   0   0   0]
 ...
 [ 35 224  26 ...   0   0   0]
 [  2 792   2 ...   0   0   0]
 [796   8   3 ...   0   0   0]]
Padding size for English: 24
[[107  13 242 ...   0   0   0]
 [ 28  38   5 ...   0   0   0]
 [ 24 153   0 ...   0   0   0]
 ...
 [ 50   6 144 ...   0   0   0]
 [ 46  39   2 ...   0   0   0]
 [ 10 165   5 ...   0   0   0]]


In [None]:
# Need this block to get it to work with TensorFlow 2.x
# Converts the padded inputs and the labels to NumPy arrays.
# NOTE(review): training_padded, training_labels, testing_padded and
# testing_labels are not assigned in any cell visible in this notebook; the
# tokenizing cell creates training_padded_yue / training_padded_eng (and the
# testing_* equivalents) instead. Unless they are defined elsewhere, this cell
# raises NameError on a fresh kernel. Confirm which arrays were intended.
import numpy as np
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [None]:
# Build the network layer by layer: embedding -> average over the sequence
# axis -> 24-unit ReLU layer -> single sigmoid output.
# NOTE(review): `max_length` is not assigned in any cell visible in this
# notebook (only max_length_yue / max_length_eng are) - confirm which was meant.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of `model`.
model.summary()


In [None]:
# Train for a fixed number of epochs, evaluating on the testing arrays after
# each one; the returned History object is kept for plotting.
num_epochs = 30
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  """Plot one metric and its validation counterpart against the epoch index.

  history: object whose `.history` mapping holds a series under `string`
      and another under 'val_' + string.
  string: metric name, e.g. "accuracy" or "loss"; also used as the y label.
  """
  series_names = [string, 'val_' + string]
  for series_name in series_names:
    plt.plot(history.history[series_name])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend(series_names)
  plt.show()
  
# One figure per tracked metric: training curve plus validation curve.
for metric_name in ("accuracy", "loss"):
  plot_graphs(history, metric_name)

In [None]:
# Invert the word -> index mapping so indices can be turned back into words.
# NOTE(review): `word_index` is not assigned in any cell visible in this
# notebook (only word_index_yue / word_index_eng are) - confirm which was meant.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_sentence(text):
    """Turn a sequence of word indices back into a space-separated string.

    Each index is looked up in reverse_word_index; indices without an entry
    are rendered as '?'.
    """
    words = []
    for index in text:
        words.append(reverse_word_index.get(index, '?'))
    return ' '.join(words)

# Show a decoded padded sequence next to a raw sentence and a label.
# NOTE(review): training_padded, training_sentences and labels are not
# assigned in any cell visible in this notebook, so this raises NameError on a
# fresh kernel unless they come from elsewhere. The decoded row is index 0
# while the sentence and label printed are index 2 - confirm that is intended.
print(decode_sentence(training_padded[0]))
print(training_sentences[2])
print(labels[2])

In [None]:
# Take the first layer of `model` (the Embedding layer in the model
# definition) and keep the first array returned by get_weights().
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)


In [None]:
import io

# Write the learned embeddings to two parallel TSV files: meta.tsv gets one
# word per line and vecs.tsv gets that word's tab-separated vector on the
# same line number.
# `with` closes both files even if a lookup or write fails part-way through.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  # index 0 is skipped, as in the padded sequences where 0 is the padding value
  for word_num in range(1, vocab_size):
    # The vocabulary can hold fewer than vocab_size words, in which case the
    # higher indices have no word; skip them instead of raising KeyError.
    word = reverse_word_index.get(word_num)
    if word is None:
      continue
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [None]:
# Offer the exported TSV files for download when running inside Colab.
try:
  from google.colab import files
except ImportError:
  # google.colab is unavailable, so there is nothing to download from.
  pass
else:
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)

In [None]:
# Run the model on two hand-written English sentences.
# NOTE(review): `tokenizer` and `max_length` are not assigned in any cell
# visible in this notebook (tokenizer_yue / tokenizer_eng and
# max_length_yue / max_length_eng are) - confirm which ones were meant.
sentence = [
    "granny starting to fear spiders in the garden might be real",
    "game of thrones season finale showing this sunday night",
]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
print(model.predict(padded))