In [1]:
import nltk
from nltk.corpus import brown
from nltk.util import ngrams
from collections import defaultdict, Counter

In [2]:
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\gkeer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

In [3]:
def preprocess_data():
    # Load and preprocess the data
    sentences = brown.sents(categories='news')
    words = [word.lower() for sent in sentences for word in sent]
    
    # Create N-grams
    unigrams = ngrams(words, 1)
    bigrams = ngrams(words, 2)
    trigrams = ngrams(words, 3)
    
    # Count the occurrences of N-grams
    unigram_counts = Counter(unigrams)
    bigram_counts = Counter(bigrams)
    trigram_counts = Counter(trigrams)
    
    return unigram_counts, bigram_counts, trigram_counts

In [4]:
def predict_next_word(previous_words, unigram_counts, bigram_counts, trigram_counts):
    # Get the previous N-1 words
    n = len(previous_words)
    if n == 0:
        return None
    
    # Check if the trigram exists in the corpus
    if n == 2 and tuple(previous_words) in trigram_counts:
        possible_next_words = trigram_counts[tuple(previous_words)]
    # Check if the bigram exists in the corpus
    elif n == 1 and tuple(previous_words) in bigram_counts:
        possible_next_words = bigram_counts[tuple(previous_words)]
    # Otherwise, default to unigram
    else:
        possible_next_words = unigram_counts[previous_words[0]]
    
    # Get the most likely next word
    if isinstance(possible_next_words, int):
        next_word = possible_next_words  # If only a single word count is present
    else:
        next_word, _ = possible_next_words.most_common(1)[0]
    return next_word


In [5]:
def main():
    unigram_counts, bigram_counts, trigram_counts = preprocess_data()
    
    # Test the next word prediction
    previous_words = ['this', 'is']
    next_word = predict_next_word(previous_words, unigram_counts, bigram_counts, trigram_counts)
    if next_word:
        print(f"The most likely next word after '{' '.join(previous_words)}' is: {next_word}")
    else:
        print(f"Could not predict the next word after '{' '.join(previous_words)}'")

In [6]:
if __name__ == "__main__":
    main()

Could not predict the next word after 'this is'


In [7]:
import nltk
import random

# Sample text data
text_data = "Hello how are you? I am fine thank you."

# Tokenize the text
tokens = nltk.word_tokenize(text_data)

# Create a dictionary to store transitions
transitions = {}
for i in range(len(tokens) - 1):
    current_word = tokens[i]
    next_word = tokens[i + 1]
    if current_word in transitions:
        transitions[current_word].append(next_word)
    else:
        transitions[current_word] = [next_word]

# Generate next word based on Markov chain
def generate_next_word(seed_word):
    if seed_word in transitions:
        possible_next_words = transitions[seed_word]
        return random.choice(possible_next_words)
    else:
        return None

# Generate predictions
seed_word = "Hello"
next_word = generate_next_word(seed_word)
if next_word:
    print(f"The next word after '{seed_word}' could be: {next_word}")
else:
    print(f"No prediction available for '{seed_word}'")


The next word after 'Hello' could be: how


In [15]:
import nltk
from nltk.corpus import brown
from nltk.util import ngrams
from collections import Counter

# Download the brown corpus
nltk.download('brown')

def preprocess_data():
    # Load and preprocess the data
    sentences = brown.sents(categories='news')
    words = [word.lower() for sent in sentences for word in sent]
    
    # Create N-grams
    unigrams = ngrams(words, 1)
    bigrams = ngrams(words, 2)
    
    # Count the occurrences of N-grams
    unigram_counts = Counter(unigrams)
    bigram_counts = Counter(bigrams)
    
    return unigram_counts, bigram_counts

def predict_next_word(previous_word, unigram_counts, bigram_counts):
    # Check if the bigram exists in the corpus
    if (previous_word,) in bigram_counts:
        possible_next_words = bigram_counts[(previous_word,)]
    # Otherwise, default to unigram
    else:
        possible_next_words = unigram_counts[previous_word]
    
    # Get the most likely next word
    if isinstance(possible_next_words, int):
        next_word = None  # If only a single word count is present
    else:
        next_word, _ = possible_next_words.most_common(1)[0]
    return next_word

def main():
    unigram_counts, bigram_counts = preprocess_data()
    
    # User input
    user_input = input("Enter a sequence of words (space-separated): ").strip().lower()
    input_words = user_input.split()
    
    if len(input_words) == 0:
        print("Please provide at least one word.")
        return
    
    # Get the last word from the user input
    previous_word = input_words[-1]
    
    # Predict the next word
    next_word = predict_next_word(previous_word, unigram_counts, bigram_counts)
    
    # Display prediction
    if next_word is not None:
        print(f"The most likely next word after '{previous_word}' is: {next_word}")
    else:
        print(f"Could not predict the next word after '{previous_word}'. The input sequence may not be present in the training data.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\gkeer\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


Enter a sequence of words (space-separated): name
Could not predict the next word after 'name'. The input sequence may not be present in the training data.
