In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import re
from tqdm import  tqdm
import os
import collections
import gc
from nltk.probability import ConditionalFreqDist, ConditionalProbDist, MLEProbDist, FreqDist
import math

In [6]:
file_path = "/kaggle/input/amharic-corpus-general/GPAC.txt"
n = 4

In [57]:
os.path.exists(file_path)
if os.path.exists(file_path):
    print(os.stat(file_path).st_size / (1024 * 1024))
else:
    print(os.path.exists(file_path))

1013.849326133728


# 1) N-gram language model

## 1.1 Create n-grams for n=1, 2, 3, 4. You can show sample prints.

In [7]:
# Downloading necessary NLTK data
nltk.download('punkt')

def process_chunk(chunk, n):
    tokens = word_tokenize(re.sub(r'\W+', ' ', chunk))
    return list(ngrams(tokens, n))

def read_and_process_in_chunks(file_path, n, chunk_size=1024*1024):
    with open(file_path, 'r') as file:
        while True:
            chunk = file.read(chunk_size)
            if not chunk:
                break
            return process_chunk(chunk, n)


all_ngrams = []
ngrams_chunk = read_and_process_in_chunks(file_path, n)
all_ngrams.extend(ngrams_chunk)
gc.collect()


[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


8

In [5]:
print(len(all_ngrams))

405


In [None]:
print(all_ngrams[:10])

## 1.2 Calculate probabilities of n-grams and find the top 10 most likely n-grams for all n.

In [8]:
# Create a conditional frequency distribution of Amharic n-grams
def create_freq_dist(ngrams):
    cfreq = ConditionalFreqDist((tuple(ngram[:-1]), ngram[-1]) for ngram in ngrams)
    return cfreq

# Create a conditional probability distribution using maximum likelihood estimation
def create_prob_dist(cfreq):
    cprob = ConditionalProbDist(cfreq, MLEProbDist)
    return cprob


probabilites = []

cfreq = create_freq_dist(all_ngrams)
cprob = create_prob_dist(cfreq)
probabilities = cprob
gc.collect()


0

In [64]:
# Get the top n-grams with the highest probabilities
def get_top_ngrams(cfreq, n, top_k):
    top_ngrams = []
    for context in cfreq.conditions():
        if len(context) == n - 1:
            freq_dist = cfreq[context]
            top_ngrams.extend([context,x,freq_dist.prob(x)] for x in freq_dist.samples())
    top_ngrams.sort(key = lambda x:x[2])
    return top_ngrams[len(top_ngrams)-10:]

In [65]:
top_ngrams = get_top_ngrams(probabilities, 4, 10)
for context, word, probability in top_ngrams:
    print(context,word,probability)

('ይሄው', 'ከሀገሪቱ', 'ብሔራዊ') ባንክ 1.0
('ከሀገሪቱ', 'ብሔራዊ', 'ባንክ') ያለ 1.0
('ብሔራዊ', 'ባንክ', 'ያለ') አዛዥ 1.0
('ባንክ', 'ያለ', 'አዛዥ') ናዛዥ 1.0
('ያለ', 'አዛዥ', 'ናዛዥ') የሚዘግኑት 1.0
('አዛዥ', 'ናዛዥ', 'የሚዘግኑት') ገንዘብ 1.0
('ናዛዥ', 'የሚዘግኑት', 'ገንዘብ') ነበር 1.0
('የሚዘግኑት', 'ገንዘብ', 'ነበር') ከባህላዊ 1.0
('ገንዘብ', 'ነበር', 'ከባህላዊ') ነገስታት 1.0
('ነበር', 'ከባህላዊ', 'ነገስታት') እስከ 1.0


## 1.3 What is the probability of the sentence. "ኢትዮጵያ ታሪካዊ ሀገር ናት ". You can also try more sentences.

In [49]:
sentence = ("ኢትዮጵያ" ,"ታሪካዊ" ,"ሀገር", "ናት")
# Calculate the probability of a specific four-gram
def calculate_fourgram_probability(cprob, fourgram):
    context = tuple(fourgram[:-1])
    word = fourgram[-1]
    probability = cprob[context].prob(word)
    return probability

probability = calculate_fourgram_probability(probabilities, sentence)
probability

0

## 1.4 Generate random sentences using n-grams; explain what happens as n increases, based on your output.

In [67]:
# Generate a sentence using the n-gram model
def generate_sentence(cprob, max_length=20):
    sentence = []
    context = cprob.conditions()[0]
    while len(sentence) < max_length:
        word = cprob[context].generate()
        sentence.append(word)
        context = context[1:] + (word,)
        if word == '።':
            break
    return ' '.join(sentence)

In [68]:
generated_sentence = generate_sentence(probabilities)
print(generated_sentence)

ኢትዮጵያ በተደጋጋሚ ጥሪው ደርሷት ልትታደመው ያልቻለችው የአለም የእግር ኳስ ዋ ለ19ኛ ጊዜ በደቡብ አፍሪካ ሲጠጣ በሩቅ እያየች አንጀቷ ባረረ ልክ


# 2 Evaluate these Language Models Using Intrinsic Evaluation Method

In [10]:
def calculate_perplexity(cpdist, test_set):
    log_probabilities = [-math.log(cpdist[seq[:-1]].prob(seq[-1],)) for seq in test_set]
    perplexity = math.exp(sum(log_probabilities) / len(log_probabilities))
    return perplexity

test_set = all_ngrams[:30]

perplexity = calculate_perplexity(probabilities, test_set)
print("Perplexity:", perplexity)


Perplexity: 1.0
