##Importing necessary libraries

In [None]:
from nltk.corpus import brown
from nltk.tokenize import word_tokenize

### Downloading the Brown corpus and the Punkt tokenizer data, as they were not already available

In [None]:
# Fetch the NLTK data packages this notebook needs; per the recorded output,
# nltk.download reports a package as up-to-date when it is already present.
import nltk
nltk.download('brown')  # the Brown corpus, read later through nltk.corpus.brown
nltk.download('punkt')  # tokenizer data; presumably what word_tokenize relies on -- confirm

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

##Processing the corpus after initialization

In [None]:
# The Brown corpus as one flat sequence of word tokens
corpus = brown.words()

# Normalise case so that differently-cased spellings of a word are counted
# together; the distinct lower-cased tokens form the vocabulary
lower_case_corpus = list(map(str.lower, corpus))
vocab = set(lower_case_corpus)

# Peek at the first 30 entries of each collection
print(lower_case_corpus[:30])
print([*vocab][:30])

['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.', 'the', 'jury', 'further', 'said', 'in']
['wholeness', "admassy's", '1610', 'baltimorean', 'avidly', 'elegies', 'inveterate', 'superstructure', 'expectancy', 'snakestrike', 'apportionments', '$31,179,816', 'privileges', 'roslev', 'condensation', 'self-rule', 'hitherto', 'ourselves', "7''", 'legislate', 'modern-dance', 'bartha', 'statuto', 'television-electronics', 'tracers', 'harrington', 'locked', 'earrings', 'focally', 'sepia']


## Printing the total size of the corpus and of the vocabulary obtained from it

In [None]:
# Report the token count and the number of distinct tokens, one per line
print(
    f"Total words in corpus: {len(lower_case_corpus)}",
    f"Total vocab size: {len(vocab)}",
    sep="\n",
)

Total words in corpus: 1161192
Total vocab size: 49815


## Setting the value of n for the n-gram language model

In [None]:
# Order of the n-gram model: 1 -> unigram, 2 -> bigram, 3 -> trigram
n = 3

### Populating the ngram_counts dictionary according to the value of n to get the frequency of each sequence of n words


In [None]:
# Frequency tables: each n-word window of the corpus -> number of occurrences,
# and the same for the (n-1)-word prefix of every window
ngram_counts = {}
n_minus1_gram_counts = {}

# Slide a window of n tokens over the corpus, one position at a time
for i in range(len(lower_case_corpus) - n + 1):
    ngram = tuple(lower_case_corpus[i:i + n])
    n_minus1_gram = ngram[:n - 1]  # the window without its last word

    # A key that has not been seen yet starts from 0
    ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1
    n_minus1_gram_counts[n_minus1_gram] = n_minus1_gram_counts.get(n_minus1_gram, 0) + 1

# Show the first 20 keys of each table (dicts keep insertion order)
print([*ngram_counts][:20])
print([*n_minus1_gram_counts][:20])


[('the', 'fulton', 'county'), ('fulton', 'county', 'grand'), ('county', 'grand', 'jury'), ('grand', 'jury', 'said'), ('jury', 'said', 'friday'), ('said', 'friday', 'an'), ('friday', 'an', 'investigation'), ('an', 'investigation', 'of'), ('investigation', 'of', "atlanta's"), ('of', "atlanta's", 'recent'), ("atlanta's", 'recent', 'primary'), ('recent', 'primary', 'election'), ('primary', 'election', 'produced'), ('election', 'produced', '``'), ('produced', '``', 'no'), ('``', 'no', 'evidence'), ('no', 'evidence', "''"), ('evidence', "''", 'that'), ("''", 'that', 'any'), ('that', 'any', 'irregularities')]
[('the', 'fulton'), ('fulton', 'county'), ('county', 'grand'), ('grand', 'jury'), ('jury', 'said'), ('said', 'friday'), ('friday', 'an'), ('an', 'investigation'), ('investigation', 'of'), ('of', "atlanta's"), ("atlanta's", 'recent'), ('recent', 'primary'), ('primary', 'election'), ('election', 'produced'), ('produced', '``'), ('``', 'no'), ('no', 'evidence'), ('evidence', "''"), ("''", '

### Suggesting the most probable next words for a user's input

In [None]:
def suggest_next_word(input_, ngram_counts, n_minus1_gram_counts, vocab):
    """Suggest the three most probable next words for ``input_``.

    The input is lower-cased and tokenized, and its last ``n - 1`` tokens are
    used as the context, where ``n`` is the module-level n-gram order. Every
    word ``w`` in ``vocab`` is scored with
    ``count(context + (w,)) / count(context)``.

    Args:
        input_: The text typed so far.
        ngram_counts: Mapping from n-gram tuples to their frequency.
        n_minus1_gram_counts: Mapping from (n-1)-gram tuples to their
            frequency.
        vocab: Iterable of candidate next words.

    Returns:
        A list of at most three ``(word, probability)`` pairs, highest
        probability first. If the context is missing from
        ``n_minus1_gram_counts``, every word scores 0 and the pairs returned
        are simply the first ones encountered in ``vocab``.
    """
    import heapq

    tokenized_input = word_tokenize(input_.lower())

    # Use an explicit start index for the context slice. The previous
    # ``tokenized_input[-n+1:]`` turned into ``[0:]`` for n == 1, so a unigram
    # model took the whole input as context instead of an empty one.
    context_start = max(len(tokenized_input) - (n - 1), 0)
    context = tuple(tokenized_input[context_start:])

    # The context and its count are the same for every candidate word, so
    # they are looked up once rather than inside the loop.
    context_count = n_minus1_gram_counts.get(context, 0)

    vocab_probabilities = {}
    for vocab_word in vocab:
        if context_count != 0:
            probability = ngram_counts.get(context + (vocab_word,), 0) / context_count
        else:
            probability = 0
        vocab_probabilities[vocab_word] = probability

    # nlargest is documented as equivalent to
    # sorted(..., key=..., reverse=True)[:3], without sorting the full vocab.
    return heapq.nlargest(3, vocab_probabilities.items(), key=lambda item: item[1])


###Testing working of function

In [None]:
# Top suggestions after the phrase 'I am the king'
suggest_next_word(
    'I am the king',
    ngram_counts,
    n_minus1_gram_counts,
    vocab,
)

[('james', 0.17647058823529413),
 ('of', 0.1568627450980392),
 ('arthur', 0.11764705882352941)]

In [None]:
# Top suggestions after the phrase 'The fulton county' (mixed-case input)
suggest_next_word(
    'The fulton county',
    ngram_counts,
    n_minus1_gram_counts,
    vocab,
)

[('grand', 0.16666666666666666),
 ('jail', 0.16666666666666666),
 ('general', 0.16666666666666666)]

In [None]:
# Top suggestions after the phrase 'I am the king of'
suggest_next_word(
    'I am the king of',
    ngram_counts,
    n_minus1_gram_counts,
    vocab,
)

[('france', 0.3333333333333333),
 ('hearts', 0.16666666666666666),
 ('orators', 0.08333333333333333)]