In [31]:
import nltk
from nltk.corpus import brown
from nltk.util import ngrams
from collections import Counter, defaultdict
import pandas as pd
import string

# Download the corpus
nltk.download('brown')

# Load text, remove punctuation, lowercase, and exclude empty strings
tokenized_text = [
    word.lower() 
    for word in brown.words() 
    if word not in string.punctuation and word.strip()
]


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [32]:
def create_ngrams_df(text, n):
    
    # Generate n-grams
    ngrams = list(nltk.ngrams(text, n))
    
    # Count occurrences of each n-gram
    ngram_counts = Counter(ngrams)
    
    # Create a DataFrame
    df = pd.DataFrame.from_dict(ngram_counts, orient='index').reset_index()
    df = df.rename(columns={'index': 'ngram', 0: 'count'})
    df = df.sort_values('count', ascending=False).reset_index(drop=True)
    
    return df

In [33]:
def predict_next_words(text, sequence, k=5, n=4):
    # Convert string input to list
    if isinstance(sequence, str):
        sequence = [sequence.lower()]
    
    # Try from highest to lowest n-gram order (trigram -> bigram -> unigram)
    for n in range(min(n, len(sequence) + 1), 1, -1):
        context = sequence[-(n-1):]  # Get last (n-1) words for context
        
        # Generate n-grams and filter matches
        ngrams_df = create_ngrams_df(text, n)
        matches = ngrams_df[ngrams_df['ngram'].apply(lambda x: x[:-1] == tuple(context))].copy()
        
        if not matches.empty:
            total = matches['count'].sum()
            matches['probability'] = matches['count'] / total
            top_k = matches.head(k)
            return [(ngram[-1], count, prob) 
                    for ngram, count, prob in 
                    zip(top_k['ngram'], top_k['count'], top_k['probability'])]
    
    return []  # No matches found

In [34]:
# Create a bigrams dataframe (n=2)
bigrams_df = create_ngrams_df(tokenized_text, n=2)
print("DF CREATED HEAD: \n", bigrams_df.head())

# Predict next words after "the"
predictions = predict_next_words(tokenized_text, "the", k=5, n=2)
print("\nTop 3 predictions after 'the':")
for word, count, prob in predictions:
    print(f"{word}: {count} occurrences, probability={prob:.2f}")

# Predict with trigrams (n=3) after a 2-word sequence
predictions = predict_next_words(tokenized_text, ["in", "the"], k=5, n=2)
print("\nTop 3 predictions after 'in the':")
for word, count, prob in predictions:
    print(f"{word}: {count} occurrences, probability={prob:.2f}")

DF CREATED HEAD: 
         ngram  count
0   (of, the)   9721
1   (in, the)   6041
2   (to, the)   3492
3   (on, the)   2477
4  (and, the)   2247

Top 3 predictions after 'the':
first: 662 occurrences, probability=0.01
same: 628 occurrences, probability=0.01
most: 417 occurrences, probability=0.01
other: 416 occurrences, probability=0.01
``: 405 occurrences, probability=0.01

Top 3 predictions after 'in the':
first: 662 occurrences, probability=0.01
same: 628 occurrences, probability=0.01
most: 417 occurrences, probability=0.01
other: 416 occurrences, probability=0.01
``: 405 occurrences, probability=0.01
