# LM1

## 1. Import Libraries

In [11]:
import nltk
import random
import re
from nltk.tokenize import word_tokenize,sent_tokenize
import math
import numpy as np
from collections import defaultdict, Counter

In [12]:
# Step 1: Read corpus from txt file
def read_corpus(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        corpus = file.read()
    return str(corpus)  

In [13]:
# Step 2: Split corpus into training, validation, and testing subsets
def split_corpus(corpus):
    sentences = nltk.sent_tokenize(corpus)
    random.shuffle(sentences)

    total_sentences = len(sentences)
    train_end = int(0.7 * total_sentences)
    val_end = int(0.1 * total_sentences) + train_end
    train_set = sentences[:train_end]
    val_set = sentences[train_end:val_end]
    test_set = sentences[val_end:]
    
    return train_set, val_set, test_set


In [14]:

# Step 3: Preprocess corpus
def preprocess_corpus(corpus):
    # Remove special characters, keep only full stops and remove stop words
    processed_corpus = re.sub(r'[^\w\s.]', '', corpus)
    processed_corpus = re.sub(r'\b\w{1,3}\b', '', processed_corpus)
    return processed_corpus

In [15]:

# Step 4: Tokenize corpus and limit vocabulary size

def tokenize_corpus(corpus, vocab_size):
    tokens = word_tokenize(corpus)
    token_counts = Counter(tokens)
    vocab = {token for token, count in token_counts.most_common(vocab_size)}
    tokenized_corpus = [token if token in vocab else '<UNK>' for token in tokens]
    return tokenized_corpus

In [16]:
# Step 5: Build LM1 language model using backoff method without add-k smoothing
def build_language_model(tokens):
    ngrams = nltk.ngrams(tokens, 3, pad_left=True, pad_right=True)
    bigrams = nltk.ngrams(tokens, 2, pad_left=True, pad_right=True)
    unigrams = tokens

    bigram_counts = Counter(bigrams)
    trigram_counts = Counter(ngrams)
    unigram_counts = Counter(unigrams)

    def calculate_probability(wi_minus_1, wi):
        lambda1 = 1/2  # Update lambdas for LM1
        lambda2 = 1/2

        # Calculate probabilities for 3-gram, 2-gram, and 1-gram without smoothing
        prob_3gram = trigram_counts[(wi_minus_1, wi)] / bigram_counts[wi_minus_1] if bigram_counts[wi_minus_1] > 0 else 0
        prob_2gram = bigram_counts[wi] / unigram_counts[wi_minus_1] if unigram_counts[wi_minus_1] > 0 else 0
        prob_1gram = unigram_counts[wi] / len(tokens)

        # Backoff: Use lower-order n-gram probabilities when higher-order n-grams are not available
        if prob_3gram == 0:
            prob_3gram = prob_2gram if prob_2gram != 0 else prob_1gram
        if prob_2gram == 0:
            prob_2gram = prob_1gram

        return lambda1 * prob_3gram + lambda2 * prob_2gram

    # Return a dictionary-like object representing the language model
    return calculate_probability



In [17]:


# Step 6: Evaluate LM1 language model on the test set
def evaluate_model(model, test_tokens):
    total_log_prob = 0
    N = len(test_tokens)
    for i in range(2, N-2):
        wi_minus_1, wi = test_tokens[i-1:i+1]  # Adjust the indices
        log_prob = model(wi_minus_1, wi)  # Ensure the model returns log probabilities
        total_log_prob += log_prob  # Accumulate log probabilities
    avg_log_prob = total_log_prob / (N - 4)  # Adjusted for range and excluding padding tokens
    perplexity = 2 ** -avg_log_prob  # Compute perplexity
    return perplexity



In [18]:

# Read corpus
corpus = read_corpus('khmer_food.txt')

# Split corpus
train_set, val_set, test_set = split_corpus(corpus)

# Preprocess corpus
processed_train_set = preprocess_corpus(' '.join(train_set))
processed_val_set = preprocess_corpus(' '.join(val_set))
processed_test_set = preprocess_corpus(' '.join(test_set))

# Tokenize corpus
vocab_size = 5000
train_tokens = tokenize_corpus(processed_train_set, vocab_size)
test_tokens = tokenize_corpus(processed_test_set, vocab_size)


# Build LM1 language model using backoff method without add-k smoothing
lm1_model = build_language_model(train_tokens)

# Evaluate LM1 language model on the test set
perplexity = evaluate_model(lm1_model, test_tokens)
print("Test set perplexity:", perplexity)


Test set perplexity: 0.9939410704789275


In [19]:
# Step 7: Create a text generator using the model
def generate_text(model, seed_text, length, vocab):
    generated_text = seed_text
    seed_tokens = word_tokenize(seed_text)
    for i in range(length):
        wi_minus_1 = seed_tokens[-1]  # Adjusted to get the last token
        next_token_probabilities = {}
        for token in vocab:
            next_token_probabilities[token] = model(wi_minus_1, token)  # Adjusted to pass only two arguments
        
        # Apply nucleus sampling
        sorted_tokens = sorted(next_token_probabilities.keys(), key=lambda x: next_token_probabilities[x], reverse=True)
        sorted_probs = [next_token_probabilities[token] for token in sorted_tokens]
        sorted_cum_probs = np.cumsum(sorted_probs)

        sorted_cum_probs /= sorted_cum_probs[-1]
        
        # Choose next token using nucleus sampling
        sampled_token_index = np.argmax(sorted_cum_probs > np.random.rand())
        next_token = sorted_tokens[sampled_token_index]

        generated_text += ' ' + next_token
        seed_tokens.append(next_token)
        if next_token == '.':
            break
    return generated_text


In [20]:

# Sample usage
# Usage example:
seed_text = "This is khmer"
vocab = set(train_tokens)
generated_text = generate_text(lm1_model, seed_text, 100,vocab)
print("Generated text:", generated_text)

Generated text: This is khmer with coconut cakes dish traditionally sugar unique while unique occasions sticky like ProrHal glutinous companion there tamarind dessert lots Cake samlor .
