In [1]:
import random
import nltk
from nltk.util import ngrams
from nltk.lm import Vocabulary
from nltk.lm.models import Laplace
from nltk.tokenize import word_tokenize
import numpy as np

# Read the text corpus
# The whole file is loaded into memory as one UTF-8 string; the path has no
# directory part, so it is resolved against the current working directory.
with open('khmer_food.txt', 'r', encoding='utf-8') as file:
    corpus = file.read()

# Split the corpus into sentences
# NOTE(review): sent_tokenize presumably needs the NLTK "punkt" tokenizer data
# to be installed locally -- confirm on a fresh environment.
sentences = nltk.sent_tokenize(corpus)

In [2]:
# Display the full list of sentences produced by the tokenizer
sentences

['Bai domram is a rice dish served with multiple side dishes meant to be eaten with it.',
 'It is prepared by allowing the cooked rice to cool overnight and absorb the morning dew.',
 'This process infuses the rice with a subtle essence.',
 'To enhance the aroma, jasmine flowers are added to the dish in the morning.',
 'During the dry season, bai domram is appreciated for its cooling effect.',
 'Amok is a popular Cambodian dish that is typically made with fish.',
 'The fish is marinated in a mixture of spices and coconut milk, then steamed in banana leaves.',
 'The result is a fragrant, flavorful dish that can be enjoyed with rice or noodles.',
 'While fish is the traditional ingredient for amok, chicken and vegetarian versions have become increasingly popular.',
 'No matter what type of amok you try, it is sure to be a delicious experience.',
 'Nataing is a dip made with minced pork, coconut cream, and peanuts.',
 'It is traditionally enjoyed alongside crispy rice cakes.',
 'A variati

In [3]:
# Split the corpus into training, validation, and testing sets
# Sentences are taken in file order (no shuffling): the first 70% go to
# training, the next 10% to validation and whatever remains to testing.
# int() truncates, so any rounding remainder ends up in the test split.
train_size = int(0.7 * len(sentences))
val_size = int(0.1 * len(sentences))
train_set = sentences[:train_size]
val_set = sentences[train_size:train_size+val_size]
test_set = sentences[train_size+val_size:]


In [35]:
# Display the held-out test sentences (the tail of the corpus)
test_set

['It is traditionally enjoyed alongside crispy rice cakes.',
 'A variation of nataing favored by the Khmer royalty uses chicken meat that has been finely strained before cooking.',
 'An even more extravagant version incorporates lobster as the main ingredient.',
 'Muk mee is a Khmer-style salad made of fried rice vermicelli, from which a wide array of toppings is added.']

In [4]:
from collections import defaultdict
def load_vocab(dataset):
    '''
    Tokenize every sentence on whitespace and count how often each token occurs.

    Parameters:
    dataset (iterable of strings): one sentence per item

    Returns:
    data (list of lists): each inner list holds the whitespace-separated tokens of one sentence
    vocab (defaultdict): {word: no of times it appears in the text};
                         reading a word that was never seen yields 0 (and inserts that key)
    '''
    vocab = defaultdict(int)
    # str.split() with no argument splits on any run of whitespace, so
    # punctuation stays attached to the neighbouring word (e.g. 'it.').
    data = [line.split() for line in dataset]
    for tokens in data:
        for token in tokens:
            vocab[token] += 1
    return data, vocab

In [5]:
print("load training set..")
print("\n")
train_data,vocab = load_vocab(train_set)
print(train_data[0])
print("\n")
# Sanity check: frequency of the token 'is' in the training split.
# The label now names the word that is actually looked up.
print("is :",vocab['is'])
print("load validation set")
# Only the tokenized sentences are needed here; the validation word counts are discarded.
valid_data, _ = load_vocab(val_set)

load training set..


['Bai', 'domram', 'is', 'a', 'rice', 'dish', 'served', 'with', 'multiple', 'side', 'dishes', 'meant', 'to', 'be', 'eaten', 'with', 'it.']


how : 9
load validation set


In [6]:
def remove_rare_words(data, vocab, mincount = 1):
    '''
    Replace out-of-vocabulary and infrequent words with the '<unk>' token.

    Parameters:
    data (list of lists): each list is a sentence of the text
    vocab (dictionary): {word: no of times it appears in the text}
    mincount (int): frequency threshold; a word is kept only when its count
                    is STRICTLY GREATER than mincount, so with the default of 1
                    every word seen once (or never) becomes '<unk>'

    Returns:
    data_with_unk (list of lists): a new list of sentences with rare words
                                   replaced by '<unk>'; the input is not modified
    '''
    def is_frequent(word):
        # Membership is tested first so a defaultdict vocab is never grown by the lookup.
        return word in vocab and vocab[word] > mincount

    return [
        [word if is_frequent(word) else '<unk>' for word in sentence]
        for sentence in data
    ]

In [7]:
print("remove rare words")
# Both splits are filtered against the TRAINING vocabulary, so validation
# words never seen in training are mapped to '<unk>' as well.
train_data = remove_rare_words(train_data, vocab, mincount = 1)
valid_data = remove_rare_words(valid_data, vocab, mincount = 1)

remove rare words


In [8]:
# Inspect the first training sentence after rare-word replacement
train_data[0]

['<unk>',
 'domram',
 'is',
 'a',
 'rice',
 'dish',
 '<unk>',
 'with',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 'to',
 'be',
 '<unk>',
 'with',
 '<unk>']

In [9]:
def build_ngram(data, n):
    '''
    Estimate conditional word probabilities for every context length below n.

    Parameters:
    data (list of lists): each list is a sentence of the text
    n (int): size of the n-gram; contexts of length 0 .. n-1 are all stored
             in the same table, the empty tuple () being the unigram context

    Returns:
    proba (dictionary of dictionary)
    {
        context (tuple of words): {word: probability of this word given context}
    }
    Both levels are defaultdicts, so indexing a missing entry yields an empty
    distribution / 0.0 (and inserts that key).
    '''
    counts = defaultdict(lambda: defaultdict(lambda: 0.0))

    for sentence in data:
        tokens = tuple(sentence)  # tuple slices are hashable and can be used as dict keys
        for pos, word in enumerate(tokens):
            # A context cannot reach back before the start of the sentence,
            # so its length is capped by the number of preceding tokens.
            for length in range(min(n, pos + 1)):
                counts[tokens[pos - length:pos]][word] += 1

    proba = defaultdict(lambda: defaultdict(lambda: 0.0))
    # Normalize per context: the probabilities of all words observed after
    # one given context sum to 1.
    for context, followers in counts.items():
        total = sum(followers.values())
        for word, count in followers.items():
            proba[context][word] = count / total

    return proba

In [10]:
# RUN TO BUILD NGRAM MODEL

# n is the n-gram order: the model stores contexts of 0 up to n-1 preceding words.
n = 4
print("build ngram model with n = ", n)
# Fit on the training split only (after rare-word replacement).
model = build_ngram(train_data, n)

build ngram model with n =  4


In [11]:
def get_prob(model, context, w, backoff=0.4):
    '''
    Score a word given its context with a recursive back-off scheme.

    Parameters:
    model (dictionary of dictionary)
    {
        context: {word:probability of this word given context}
    }
    context (tuple or list of strings): the words preceding w
    w (string): the word whose probability given the context is needed
    backoff (float): factor applied each time the context is shortened by
                     dropping its oldest word (default 0.4)

    Returns:
    prob (float): model[context][w] when that entry exists; otherwise backoff
                  times the score obtained with the shortened context.
                  0.0 when w is unknown even with the empty context.
    '''
    # Model keys are tuples; a list is unhashable and would make the lookup raise.
    if isinstance(context, list):
        context = tuple(context)

    # Membership tests (rather than indexing) never grow a defaultdict model.
    if context in model and w in model[context]:
        return model[context][w]

    # Base case: the empty context cannot be shortened any further
    # (()[1:] is still ()), so without this return the recursion never ends.
    if not context:
        return 0.0

    return backoff * get_prob(model, context[1:], w, backoff)

In [12]:
import math
def perplexity(model, data, n):
    '''
    Compute the perplexity of the model on tokenized sentences.

    Parameters:
    model (dictionary of dictionary)
    {
        context: {word:probability of this word given context}
    }
    data (list of lists): each list is a sentence of the text
    n (int): size of the n-gram; at most n-1 preceding words are used as context

    Returns:
    perp (float): exp of the average negative log-probability over all scored
                  tokens; float('inf') if any token receives probability 0

    Raises:
    ValueError: if data contains no scorable token (empty data, or only
                sentences with fewer than two tokens)
    '''
    neg_log_likelihood, num_scored = 0.0, 0
    for sentence in data:
        sentence = tuple(sentence)
        # The first token of a sentence has no preceding word and is not scored.
        for i in range(1, len(sentence)):
            nc = min(n-1, i)  # context length, capped by the tokens available
            context = sentence[i-nc:i]
            prob = get_prob(model, context, sentence[i])
            if prob <= 0.0:
                # log(0) is undefined; one impossible token makes the perplexity infinite.
                return float('inf')
            neg_log_likelihood += -math.log(prob)
            num_scored += 1
    if num_scored == 0:
        # Previously this surfaced as an unexplained ZeroDivisionError.
        raise ValueError("perplexity is undefined: data contains no token with a preceding context")
    return math.exp(neg_log_likelihood / num_scored)

In [13]:
# COMPUTE PERPLEXITY ON VALIDATION SET

# NOTE(review): the model above was built with n = 4, but n=3 is passed here,
# so only contexts of up to two preceding words are used for scoring --
# confirm whether n=n was intended.
print("The perplexity is", perplexity(model, valid_data, n=3))

The perplexity is 2.987014940593381


In [14]:
def get_proba_distrib(model, context):
    '''
    Return the next-word distribution for the longest known suffix of a context.

    Parameters:
    model (dictionary of dictionary)
    {
        context: {word:probability of this word given context}
    }
    context (tuple or list of strings): the words generated so far

    Returns:
    words_and_probs (dict): {word: probability of word given context}, taken
                            from the longest suffix of context present in the
                            model; an empty dict when not even the empty
                            context is present (e.g. an empty model)
    '''
    # Model keys are tuples; a list is unhashable and would make the lookup raise.
    if isinstance(context, list):
        context = tuple(context)

    while True:
        if context in model:
            return model[context]
        if not context:
            # The empty context cannot be shortened further (()[1:] is still
            # ()); recursing on it, as before, never terminated.
            return {}
        # Drop the oldest word and retry with the shorter context.
        context = context[1:]

In [49]:
def generate(model, start="It", stop="rice", max_len=10):
    '''
    Sample a sequence of words from the language model.

    Parameters:
    model (dictionary of dictionary)
    {
        context: {word:probability of this word given context}
    }
    start (string): first word of the generated sentence (default "It")
    stop (string): generation ends as soon as this word is produced (default "rice")
    max_len (int): maximum number of words, start word included (default 10)

    Returns:
    sentence (list of strings): a sentence sampled according to the language model.
    '''
    # The training data carries no <s> / </s> markers, which is why explicit
    # start and stop words are used to delimit a sentence.
    sentence = [start]
    while sentence[-1] != stop and len(sentence) < max_len:
        # Everything generated so far is the context; get_proba_distrib
        # shortens it until it finds a context the model knows.
        proba = get_proba_distrib(model, tuple(sentence))
        if not proba:
            # Nothing to sample from (e.g. empty model): stop instead of
            # letting np.random.choice fail on an empty list.
            break
        w = np.random.choice((list(proba.keys())), 1, p = list(proba.values()))
        # np.random.choice returns a numpy array; store a plain Python str.
        sentence.append(str(w[0]))
    return sentence

In [50]:
# Tokenize the held-out test sentences; test_vocab holds the word counts of the test split only
test_data,test_vocab = load_vocab(test_set)

In [51]:
# Inspect the first tokenized test sentence
test_data[0]

['It',
 'is',
 'traditionally',
 'enjoyed',
 'alongside',
 'crispy',
 'rice',
 'cakes.']

In [52]:
# Filter the TEST sentences (the previous code passed valid_data by mistake)
# against the TRAINING vocabulary, exactly as is done for the validation split,
# so that words unseen in training become '<unk>'.
test_data = remove_rare_words(test_data, vocab, mincount = 1)

In [53]:
# GENERATE A SENTENCE FROM THE MODEL
# Sample from a model fitted on the training split. It is rebuilt here so the
# cell does not depend on which `model` was assigned last; the held-out test
# split must not be used to fit the model.
model = build_ngram(train_data, n)
print("Generated sentence: ",generate(model))

Generated sentence:  ['It', 'a', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>']
