# Import libraries

In [9]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import defaultdict
import string
import math
import numpy as np

In [10]:
# Read the entire corpus file into memory as a single UTF-8 decoded string.
with open('khmer_food.txt', 'r', encoding='utf-8') as file:
    corpus = file.read()

In [11]:
# Separate the text corpus into 3 subsets: training (70%), validation (10%) and testing (20%)

def split_corpus(corpus, train_frac=0.7, val_frac=0.1):
    """Split a corpus into contiguous training, validation and test parts.

    The split is positional: for a string the cut points are character
    offsets, so a boundary may fall in the middle of a word.

    Args:
        corpus: Any sliceable sequence with a length (e.g. a ``str`` or a
            list of tokens).
        train_frac: Fraction of the corpus assigned to the training set.
        val_frac: Fraction of the corpus assigned to the validation set.
            Everything after the validation slice becomes the test set.

    Returns:
        A ``(train_set, val_set, test_set)`` tuple of slices of ``corpus``.

    Raises:
        ValueError: If a fraction is negative or the two fractions add up
            to more than 1.
    """
    # Small tolerance so that sums such as 0.7 + 0.3 are not rejected
    # because of floating-point rounding.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1"
        )

    total_length = len(corpus)
    train_end = int(total_length * train_frac)
    # The validation slice starts where the training slice stops.
    val_end = train_end + int(total_length * val_frac)

    train_set = corpus[:train_end]
    val_set = corpus[train_end:val_end]
    test_set = corpus[val_end:]

    return train_set, val_set, test_set

In [12]:
# Cut the raw corpus string into its training, validation and test parts.
train_set, val_set, test_set = split_corpus(corpus)

In [13]:
# Separate the text corpus into 3 subsets: training (70%), validation (10%) and testing (20%)

def split_corpus(corpus, train_frac=0.7, val_frac=0.1):
    """Split a corpus into contiguous training, validation and test parts.

    Note: this cell repeats the ``split_corpus`` definition from an earlier
    cell; once both cells have run, this later definition is the one in
    effect.

    The split is positional: for a string the cut points are character
    offsets, so a boundary may fall in the middle of a word.

    Args:
        corpus: Any sliceable sequence with a length (e.g. a ``str`` or a
            list of tokens).
        train_frac: Fraction of the corpus assigned to the training set.
        val_frac: Fraction of the corpus assigned to the validation set.
            Everything after the validation slice becomes the test set.

    Returns:
        A ``(train_set, val_set, test_set)`` tuple of slices of ``corpus``.

    Raises:
        ValueError: If a fraction is negative or the two fractions add up
            to more than 1.
    """
    # Small tolerance so that sums such as 0.7 + 0.3 are not rejected
    # because of floating-point rounding.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1"
        )

    total_length = len(corpus)
    train_end = int(total_length * train_frac)
    # The validation slice starts where the training slice stops.
    val_end = train_end + int(total_length * val_frac)

    train_set = corpus[:train_end]
    val_set = corpus[train_end:val_end]
    test_set = corpus[val_end:]

    return train_set, val_set, test_set

In [14]:
def get_prob(model, context, w, backoff=0.4):
    """Return the back-off probability of ``w`` following ``context``.

    If ``(context, w)`` is present in the model its stored probability is
    returned. Otherwise the leftmost context element is dropped and the
    lookup is retried, multiplying the result by ``backoff`` for every
    element dropped.

    Args:
        model: Mapping ``context -> {word: probability}``.
        context: Sliceable, hashable context (e.g. a tuple of words).
        w: The word whose probability is requested.
        backoff: Penalty factor applied each time the context is shortened.

    Returns:
        The (possibly discounted) probability, or ``0.0`` when ``w`` is
        not found even with the empty context.
    """
    if context in model and w in model[context]:
        return model[context][w]
    if not context:
        # Nothing left to shorten: slicing an empty context yields another
        # empty context, so recursing here would never terminate.
        return 0.0
    return backoff * get_prob(model, context[1:], w, backoff)

In [15]:
def get_proba_distrib(model, context):
    """Return the next-word distribution for the longest known context suffix.

    The context is shortened from the left, one element at a time, until a
    suffix that is a key of ``model`` is found.

    Args:
        model: Mapping ``context -> {word: probability}``.
        context: Sliceable, hashable context (e.g. a tuple of words).

    Returns:
        The ``{word: probability}`` mapping stored for the matched suffix,
        or an empty dict when not even the empty context is in the model.
    """
    while True:
        if context in model:
            return model[context]
        if not context:
            # The empty context is missing as well; shortening it further
            # would loop forever, so report "no distribution".
            return {}
        context = context[1:]

In [17]:
# Tokenization function
def tokenize_text(text, vocab_size):
    """Lower-case and tokenize ``text``, returning its unigrams and bigrams.

    Tokens made up solely of ASCII punctuation characters are discarded.
    Each token is checked character by character: testing
    ``token in string.punctuation`` would be a substring test against one
    long string and would let multi-character punctuation tokens such as
    "``", "''", "--" or "..." through.

    Args:
        text: Raw text to tokenize.
        vocab_size: Currently unused; kept so existing callers keep working.

    Returns:
        A ``(unigrams, bigrams)`` tuple: the list of kept tokens and the
        list of adjacent token pairs built from it.
    """
    punctuation = set(string.punctuation)
    tokens = [
        token
        for token in word_tokenize(text.lower())
        if not all(char in punctuation for char in token)
    ]
    unigrams = tokens
    bigrams = list(nltk.bigrams(tokens))
    return unigrams, bigrams

# Build n-gram model
def build_ngram(data, n):
    """Estimate conditional next-token probabilities for contexts up to n-1.

    For every position in every sentence, the token at that position is
    counted once under each preceding context of length ``0 .. n-1`` that
    fits inside the sentence. Counts are then normalised per context.

    Args:
        data: Iterable of sentences. Each sentence is a sequence whose
            slices are hashable (tuple or str); lists are accepted and
            converted to tuples.
        n: Order of the model; contexts hold at most ``n - 1`` tokens.

    Returns:
        A nested ``defaultdict`` mapping ``context -> {token: probability}``
        in which missing entries default to ``0.0``.
    """
    counts = defaultdict(lambda: defaultdict(float))

    for sentence in data:
        if isinstance(sentence, list):
            # List slices are unhashable and cannot be used as dict keys.
            sentence = tuple(sentence)
        for i, word in enumerate(sentence):
            # k is the context length; it cannot reach back past the
            # start of the sentence, hence the cap at i.
            for k in range(min(n, i + 1)):
                counts[sentence[i - k:i]][word] += 1

    proba = defaultdict(lambda: defaultdict(float))
    for context, followers in counts.items():
        denom = sum(followers.values())
        for w, count in followers.items():
            proba[context][w] = count / denom

    return proba

# Generate text
def generate(model, seed=None, max_len=100, end_token="</s>"):
    """Sample a token sequence from the model, starting from a seed.

    At each step the whole sequence generated so far is used as context;
    ``get_proba_distrib`` backs off to the longest suffix the model knows,
    and the next token is drawn from that distribution.

    Args:
        model: Mapping ``context -> {word: probability}``.
        seed: Initial tokens. Defaults to
            ``["Khmer", "cuisine", "combines"]``.
        max_len: Generation stops once the sequence has this many tokens.
        end_token: Generation stops once this token has been produced.

    Returns:
        The list of tokens: the seed followed by the sampled tokens.
    """
    if seed is None:
        sentence = ["Khmer", "cuisine", "combines"]
    else:
        sentence = list(seed)

    while len(sentence) < max_len and (not sentence or sentence[-1] != end_token):
        proba = get_proba_distrib(model, tuple(sentence))
        if not proba:
            # No distribution to sample from; stop instead of failing
            # inside the sampler.
            break
        words = list(proba.keys())
        # Sample an index rather than the word itself so the appended
        # token is the original Python object, not a NumPy scalar.
        idx = np.random.choice(len(words), p=list(proba.values()))
        sentence.append(words[idx])
    return sentence

# Perplexity calculation
def perplexity(model, data, n):
    """Compute the perplexity of ``data`` under ``model``.

    Every token except the first of each sentence is scored with
    ``get_prob`` given up to ``n - 1`` preceding tokens; the result is the
    exponential of the mean negative log-probability.

    Args:
        model: Mapping ``context -> {word: probability}``.
        data: Iterable of sentences (sliceable sequences of tokens).
        n: Order of the model; at most ``n - 1`` tokens of context are used.

    Returns:
        The perplexity as a float, or ``math.inf`` if any scored token has
        probability zero.

    Raises:
        ValueError: If ``data`` contains no token that can be scored (it is
            empty, or every sentence has fewer than two tokens).
    """
    neg_log_sum, num_scored = 0.0, 0
    for sentence in data:
        for i in range(1, len(sentence)):
            nc = min(n - 1, i)
            prob = get_prob(model, sentence[i - nc:i], sentence[i])
            if prob <= 0.0:
                # log(0) is undefined; a zero-probability token makes the
                # perplexity infinite.
                return math.inf
            neg_log_sum -= math.log(prob)
            num_scored += 1

    if num_scored == 0:
        raise ValueError("perplexity is undefined: no tokens to score in data")
    return math.exp(neg_log_sum / num_scored)

# Test data: tokenize the held-out test split into unigrams and bigrams.
# `vocab_size` is passed to tokenize_text, which does not use it.
vocab_size = 100
test_unigrams, test_bigrams = tokenize_text(test_set, vocab_size)

# Build the n-gram model.
# NOTE(review): the model is fitted on `test_bigrams`, the same data it is
# evaluated on below; train_set and val_set are not used here -- confirm
# this is intended.
# NOTE(review): build_ngram treats each element of its input as one sentence,
# so every 2-token bigram tuple is its own "sentence" and no context is longer
# than one token even though n = 4.
n = 4
model = build_ngram(test_bigrams, n)

# Generate a sentence by sampling from the model.
print("Generated sentence:", generate(model))

# Evaluate perplexity of the model on the data it was built from.
perplexity_score = perplexity(model, test_bigrams, n)
print("Perplexity:", perplexity_score)


Generated sentence: ['Khmer', 'cuisine', 'combines', 'and', 'the', 'typical', 'beverages', 'including', 'the', 'region', 'being', 'physically', 'descriptive', 'english', 'it', 'is', 'a', 'dipping', 'relish', 'called', 'o', 'mai', 'and', 'mussels', 'are', 'descendants', 'reside', 'in', 'the', 'elimination', 'challenge', 'for', 'shop', 'a', 'delicacy', 'dish', 'for', 'the', 'nguyen', "'s", 'cuisine', 'is', 'especially', 'with', 'a', 'clay', 'pot', 'sour', 'sweet', 'soup', 'called', 'pansoh', 'or', 'curried', 'dishes', 'seafood', 'a', 'single', 'time', 'before', 'a', 'malay/hokkien', 'term', 'for', 'certain', 'towns', 'to', 'malaysia', 'and', 'served', 'in', 'the', 'normal', 'price', 'vegetarian', 'or', 'sour', 'or', 'northern', 'and', 'is', 'used', 'only', 'by', 'starting', 'with', 'silkworms', 'and', 'having', 'interesting', 'textures', 'and', 'dinner', 'including', 'the', 'bamboo', 'tubes', 'and', 'neighbouring', 'singapore']
Perplexity: 9.070901165622379
