In [36]:
import nltk
import random
import re
from collections import Counter
from nltk.tokenize import word_tokenize
import numpy as np


In [37]:

# Step 1: Read corpus from txt file
def read_corpus(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        corpus = file.read()
    return corpus


In [38]:
# Step 2: Split corpus into training, validation, and testing subsets

def split_corpus(corpus):
    sentences = nltk.sent_tokenize(corpus)
    total_length = len(sentences)
    train_end = int(total_length * 0.7)
    val_end = int(total_length * 0.1)
    
    train_set = sentences[:train_end]
    val_set = sentences[train_end:train_end+val_end]
    test_set = sentences[train_end+val_end:]
    
    return train_set, val_set, test_set

In [39]:

# Step 3: Preprocess corpus
def preprocess_corpus(corpus):
    # Remove special characters, keep only full stops and remove stop words
    processed_corpus = re.sub(r'[^\w\s.]', '', corpus)
    processed_corpus = re.sub(r'\b\w{1,3}\b', '', processed_corpus)
    return processed_corpus


In [40]:

# Step 4: Tokenize corpus and limit vocabulary size
def tokenize_corpus(corpus, vocab_size):
    tokens = word_tokenize(corpus)
    token_counts = Counter(tokens)
    vocab = {token for token, count in token_counts.most_common(vocab_size)}
    tokenized_corpus = [token if token in vocab else '<UNK>' for token in tokens]
    return tokenized_corpus


In [41]:

# Step 5: Build 4-gram language model with add-k smoothing and LM2 interpolation
def build_language_model(tokens, k, vocab_size):
    ngrams = nltk.ngrams(tokens, 4, pad_left=True, pad_right=True)
    bigrams = nltk.ngrams(tokens, 2, pad_left=True, pad_right=True)
    trigrams = nltk.ngrams(tokens, 3, pad_left=True, pad_right=True)
    unigrams = tokens

    bigram_counts = Counter(bigrams)
    trigram_counts = Counter(trigrams)
    ngram_counts = Counter(ngrams)
    unigram_counts = Counter(unigrams)

    def calculate_probability(wi_minus_2, wi_minus_1, wi):
        lambda1 = 1/4  # Including unigram, so update lambda accordingly
        lambda2 = 1/4
        lambda3 = 1/4
        lambda4 = 1/4  # New lambda for unigram

        prob_4gram = (ngram_counts[(wi_minus_2, wi_minus_1, wi, wi)] + k) / \
                      (trigram_counts[(wi_minus_2, wi_minus_1, wi)] + k * vocab_size)

        prob_3gram = (trigram_counts[(wi_minus_1, wi, wi)] + k) / \
                      (bigram_counts[(wi_minus_1, wi)] + k * vocab_size)

        prob_2gram = (bigram_counts[(wi, wi)] + k) / \
                      (unigram_counts[wi] + k * vocab_size)

        prob_1gram = (unigram_counts[wi] + k) / \
                      (len(tokens) + k * vocab_size)

        return lambda1 * prob_4gram + lambda2 * prob_3gram + lambda3 * prob_2gram + lambda4 * prob_1gram

    return calculate_probability



In [42]:

# Step 6: Evaluate models on test set using perplexity
def evaluate_model(model, test_tokens):
    total_log_prob = 0
    N = len(test_tokens)
    for i in range(2, N-2):
        wi_minus_2, wi_minus_1, wi, wi_plus_1 = test_tokens[i-2:i+2]
        prob = model(wi_minus_2, wi_minus_1, wi)
        total_log_prob += -1 * (prob * (N-4))
    perplexity = 2 ** (total_log_prob / N)
    return perplexity
    


In [43]:
# Step 7: Create a text generator using the model
def generate_text(model, seed_text, length, vocab):
    generated_text = seed_text
    seed_tokens = word_tokenize(seed_text)
    for i in range(length):
        wi_minus_2, wi_minus_1 = seed_tokens[-2:]
        next_token_probabilities = {}
        for token in vocab:
            next_token_probabilities[token] = model(wi_minus_2, wi_minus_1, token)
        
        # Apply nucleus sampling
        sorted_tokens = sorted(next_token_probabilities.keys(), key=lambda x: next_token_probabilities[x], reverse=True)
        sorted_probs = [next_token_probabilities[token] for token in sorted_tokens]
        sorted_cum_probs = np.cumsum(sorted_probs)
        sorted_cum_probs /= sorted_cum_probs[-1]
        
        # Choose next token using nucleus sampling
        sampled_token_index = np.argmax(sorted_cum_probs > np.random.rand())
        next_token = sorted_tokens[sampled_token_index]

        generated_text += ' ' + next_token
        seed_tokens.append(next_token)
        if next_token == '.':
            break
    return generated_text

In [44]:

# Read corpus
corpus = read_corpus('khmer_food.txt')


In [45]:
# Split corpus
train_set, val_set, test_set = split_corpus(corpus)


In [46]:
# Preprocess corpus
processed_train_set = preprocess_corpus(' '.join(train_set))
processed_val_set = preprocess_corpus(' '.join(val_set))
processed_test_set = preprocess_corpus(' '.join(test_set))


In [47]:
# Tokenize corpus
vocab_size = 5000
train_tokens = tokenize_corpus(processed_train_set, vocab_size)
val_tokens = tokenize_corpus(processed_val_set, vocab_size)
test_tokens = tokenize_corpus(processed_test_set, vocab_size)

In [48]:

# Build language model
k_values = [0.1, 0.01, 0.001]
lambdas = [(0.1, 0.3, 0.6), (0.2, 0.4, 0.4), (0.3, 0.5, 0.2)]  # example lambdas
best_perplexity = float('inf')
best_k = None
best_lambdas = None

In [49]:
# Evaluate the model on the validation set
for k in k_values:
    for lambd in lambdas:  # Use a different variable name here
        model = build_language_model(train_tokens, k, vocab_size)
        perplexity = evaluate_model(model, val_tokens)
        if perplexity < best_perplexity:
            best_perplexity = perplexity
            best_k = k
            best_lambdas = lambd  # Update the variable name here


print("Best k:", best_k)
print("Best lambdas:", best_lambdas)

Best k: 0.001
Best lambdas: (0.1, 0.3, 0.6)


In [50]:
# Evaluate model on test set
best_model = build_language_model(train_tokens, best_k, vocab_size)
test_perplexity = evaluate_model(best_model, test_tokens)
print("Test set perplexity:", test_perplexity)

Test set perplexity: 0.007092211142352505


In [51]:
best_model

<function __main__.build_language_model.<locals>.calculate_probability(wi_minus_2, wi_minus_1, wi)>

In [52]:
# Usage example:
seed_text = "This is khmer"
vocab =set(train_tokens)
generated_text = generate_text(best_model, seed_text, 100, vocab)
print("Generated text:", generated_text)

Generated text: This is khmer burned estimated have curry monay kroeung Valencia Shaoxing Pheak Indian Ilocos juice crickets papaya Sorey prahok Cabagan juice liver pearls pork prahok dian <UNK> among about kaeng kamatis more .
