# Next Word Prediction using NLTK

#### Importing Libraries

In [1]:
%%capture
!pip install nltk

In [2]:
import nltk
from nltk.corpus import reuters
from nltk import bigrams, ConditionalFreqDist

In [3]:
# reuters.sents() splits sentences with NLTK's Punkt model. Recent NLTK releases
# load that model from the 'punkt_tab' resource, older ones from 'punkt', so
# fetch both to keep the notebook runnable on a fresh install of either.
nltk.download(['reuters', 'punkt', 'punkt_tab'])

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Loading the dataset

In [4]:
# Sentence-segmented Reuters corpus; each sentence is a list of token strings.
corpus = reuters.sents()

In [5]:
# Peek at one sentence to see the token format (original case and punctuation are kept).
print(corpus[1])

['They', 'told', 'Reuter', 'correspondents', 'in', 'Asian', 'capitals', 'a', 'U', '.', 'S', '.', 'Move', 'against', 'Japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'U', '.', 'S', '.', 'And', 'lead', 'to', 'curbs', 'on', 'American', 'imports', 'of', 'their', 'products', '.']


In [6]:
# Number of sentences in the corpus.
len(corpus)

54716

#### Creating Bigrams

In [7]:
# Flatten the corpus into one lower-cased token stream. Sentences are simply
# concatenated, so the bigrams below also pair the last token of a sentence
# with the first token of the next one.
words = [token.lower() for sentence in corpus for token in sentence]

# Materialise the (word, next_word) pairs as a list so they can be sliced for
# inspection and reused afterwards.
bigrams_list = [*bigrams(words)]

In [8]:
# Sanity check: the first ten (word, next_word) pairs.
print(bigrams_list[:10])

[('asian', 'exporters'), ('exporters', 'fear'), ('fear', 'damage'), ('damage', 'from'), ('from', 'u'), ('u', '.'), ('.', 's'), ('s', '.-'), ('.-', 'japan'), ('japan', 'rift')]


#### Creating Conditional Frequency Distribution

In [9]:
# For every word (the condition), count how often each word directly follows it.
cfd = ConditionalFreqDist(bigrams_list)

In [10]:
# Follower counts for 'the'.
# Caution: ConditionalFreqDist is a defaultdict, so indexing it with a word that
# never occurs silently adds an empty entry; use `word in cfd` to test membership.
cfd['the']

FreqDist({'company': 3126, 'u': 2264, 'dollar': 984, 'bank': 960, 'first': 839, 'government': 787, 'year': 720, 'united': 682, 'new': 678, 'market': 590, ...})

#### Predicting Next Word

In [11]:
def predict_next_word(input_word, model=None):
    """Return the word that most often follows ``input_word`` in a bigram model.

    Parameters
    ----------
    input_word : str
        Word to look up. Surrounding whitespace is ignored and the lookup is
        case-insensitive (the word is lower-cased before the lookup).
    model : mapping, optional
        Maps a word to a frequency distribution of its followers; the
        distribution must support ``max()`` and be falsy when empty (as
        ``nltk.FreqDist`` is). Defaults to the notebook-level ``cfd``.

    Returns
    -------
    str
        The most frequent follower, or the string ``"Word not found in corpus"``
        when the word is unknown or has no recorded followers.
    """
    if model is None:
        model = cfd

    word = input_word.strip().lower()

    # Test membership before indexing: indexing a ConditionalFreqDist with an
    # unseen key would insert an empty distribution as a side effect.
    followers = model[word] if word in model else None

    # An empty distribution (e.g. left behind by an earlier exploratory
    # `cfd[...]` lookup) has no maximum, so treat it the same as an unseen word
    # instead of letting max() raise.
    if not followers:
        return "Word not found in corpus"
    return followers.max()

In [12]:
# Sentinel returned by predict_next_word when it has no prediction. Corpus
# tokens are lower-cased single words, so a real prediction can never equal it.
NOT_FOUND = "Word not found in corpus"

# input() already returns a str; strip stray whitespace so "the " still matches.
# Note: this cell needs an interactive kernel to supply the word.
input_word = input('Enter word: ').strip()
next_word = predict_next_word(input_word)

# Report a miss as a miss rather than presenting the sentinel as a prediction.
if next_word == NOT_FOUND:
    print(f"'{input_word}' was not found in the corpus, so no prediction is available.")
else:
    print(f"The next word after '{input_word}' could be: '{next_word}'")

Enter word: I
The next word after 'I' could be: 'think'
