In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from nltk.util import ngrams

In [4]:
import nltk
from nltk.tokenize import word_tokenize

In [16]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\NGARUIYA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


True

In [5]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [6]:
corpus1 = nltk.corpus.gutenberg.sents('austen-emma.txt')

In [7]:
corpus2 = nltk.corpus.gutenberg.sents('carroll-alice.txt')

In [8]:
import string

In [9]:
def clean_corpus(corpus):
    clean = []
    for token_sent in corpus:
        x = [''.join(c for c in s if c not in string.punctuation) for s in token_sent]
        x = [s for s in x if s]
        clean.append(x)
    return clean

In [10]:
corpus1_clean = clean_corpus(corpus1)
corpus2_clean = clean_corpus(corpus2)

In [11]:
def get_n_gram(corpus, n):
    gram = []
    for token_sent in corpus:
        gram.extend(list(ngrams(token_sent, n)))
    return gram

In [20]:
Freq_1= nltk.FreqDist(get_n_gram(corpus1_clean, 1))
Freq_2 = nltk.FreqDist(get_n_gram(corpus2_clean, 1))
print ("Most common unigrams: ", Freq_1.most_common(10))
print ("Most common unigrams: ", Freq_2.most_common(10))

Most common unigrams:  [(('to',), 5186), (('the',), 4846), (('and',), 4673), (('of',), 4281), (('I',), 3192), (('a',), 3005), (('her',), 2400), (('was',), 2387), (('it',), 2129), (('in',), 2118)]
Most common unigrams:  [(('the',), 1527), (('and',), 802), (('to',), 725), (('a',), 615), (('I',), 545), (('it',), 527), (('she',), 509), (('of',), 500), (('said',), 456), (('Alice',), 396)]


In [12]:
freq1 = nltk.FreqDist(get_n_gram(corpus1_clean, 2))

In [13]:
freq2 = nltk.FreqDist(get_n_gram(corpus2_clean, 2))

In [21]:
print ("Most common bigrams: ", freq1.most_common(10))
print ("Most common bigrams: ", freq2.most_common(10))

Most common bigrams:  [(('to', 'be'), 596), (('of', 'the'), 557), (('in', 'the'), 434), (('I', 'am'), 395), (('had', 'been'), 308), (('it', 'was'), 289), (('I', 'have'), 283), (('could', 'not'), 277), (('Mr', 'Knightley'), 277), (('of', 'her'), 265)]
Most common bigrams:  [(('said', 'the'), 207), (('of', 'the'), 130), (('said', 'Alice'), 116), (('in', 'a'), 95), (('in', 'the'), 77), (('and', 'the'), 75), (('to', 'the'), 69), (('the', 'Queen'), 62), (('at', 'the'), 60), (('it', 'was'), 60)]


In [22]:
freq_1 = nltk.FreqDist(get_n_gram(corpus1_clean, 3))
freq_2 = nltk.FreqDist(get_n_gram(corpus2_clean, 3))
print ("Most common trigrams: ", freq_1.most_common(10))
print ("Most common trigrams: ", freq_2.most_common(10))

Most common trigrams:  [(('I', 'do', 'not'), 136), (('I', 'am', 'sure'), 109), (('a', 'great', 'deal'), 63), (('would', 'have', 'been'), 60), (('do', 'not', 'know'), 55), (('she', 'could', 'not'), 52), (('I', 'dare', 'say'), 50), (('in', 'the', 'world'), 49), (('Mr', 'Frank', 'Churchill'), 49), (('I', 'assure', 'you'), 47)]
Most common trigrams:  [(('the', 'Mock', 'Turtle'), 49), (('I', 'don', 't'), 30), (('the', 'March', 'Hare'), 29), (('said', 'the', 'King'), 29), (('said', 'the', 'Hatter'), 21), (('the', 'White', 'Rabbit'), 20), (('said', 'to', 'herself'), 19), (('said', 'the', 'Mock'), 19), (('said', 'the', 'Caterpillar'), 18), (('she', 'said', 'to'), 17)]


In [15]:
def calculate_probs(n, corpus):
    ngrams_all = { n:[]} 
    for List in corpus:
        for j in ngrams(List, n):
            ngrams_all[n].append(j)
    ngrams_voc = {n:set([])}
    for gram in ngrams_all[n]:
        if gram not in ngrams_voc[n]:
                ngrams_voc[n].add(gram)
    total_ngrams = {1:-1, 2:-1, 3:-1, 4:-1}
    total_voc = {1:-1, 2:-1, 3:-1, 4:-1}
    total_ngrams[n] = len(ngrams_all[n])
    total_voc[n] = len(ngrams_voc[n])
    ngrams_prob = { n:[]}
    for ngram in ngrams_voc[n]:
        tlist = [ngram]
        tlist.append(ngrams_all[n].count(ngram))
        ngrams_prob[n].append(tlist) 
    for ngram in ngrams_prob[n]:
        ngram[-1] = (ngram[-1]+1)/(total_ngrams[2] + total_voc[2])
    ngrams_prob[n] = sorted(ngrams_prob[n], key = lambda x:x[1], reverse = True)
    return ngrams_prob[n]

In [16]:
prob_c1 = calculate_probs(2,corpus1_clean)

In [18]:
prob_c1

[[('to', 'be'), 0.002738695426791506],
 [('of', 'the'), 0.0025597856752925634],
 [('in', 'the'), 0.001995531843642052],
 [('I', 'am'), 0.0018166220921431095],
 [('had', 'been'), 0.001417515723414699],
 [('it', 'was'), 0.0013303545624280347],
 [('I', 'have'), 0.0013028299852743512],
 [('could', 'not'), 0.0012753054081206677],
 [('Mr', 'Knightley'), 0.0012753054081206677],
 [('of', 'her'), 0.0012202562538133007],
 [('she', 'had'), 0.0011789693880827755],
 [('Mrs', 'Weston'), 0.0011468573814034783],
 [('have', 'been'), 0.0011193328042497948],
 [('to', 'the'), 0.0010872207975704973],
 [('she', 'was'), 0.001009234495635061],
 [('and', 'the'), 0.001004647066109447],
 [('would', 'be'), 0.001000059636583833],
 [('Mr', 'Elton'), 0.0009862973480069912],
 [('do', 'not'), 0.0009404230527508521],
 [('it', 'is'), 0.0009037236165459408],
 [('of', 'his'), 0.0008807864689178713],
 [('a', 'very'), 0.0008761990393922573],
 [('to', 'her'), 0.0008716116098666434],
 [('that', 'she'), 0.0008440870327129599],

In [17]:
prob_c2 = calculate_probs(2,corpus2_clean)

In [19]:
prob_c2

[[('said', 'the'), 0.00525305586422871],
 [('of', 'the'), 0.003308414991413274],
 [('said', 'Alice'), 0.0029548439236286492],
 [('in', 'a'), 0.002424487321951712],
 [('in', 'the'), 0.0019698959490857663],
 [('and', 'the'), 0.0019193857965451055],
 [('to', 'the'), 0.0017678553389231235],
 [('the', 'Queen'), 0.0015910698050308112],
 [('at', 'the'), 0.0015405596524901506],
 [('it', 'was'), 0.0015405596524901506],
 [('I', 'm'), 0.0014647944236791594],
 [('as', 'she'), 0.0014395393474088292],
 [('a', 'little'), 0.0014395393474088292],
 [('the', 'King'), 0.0014395393474088292],
 [('Mock', 'Turtle'), 0.0014395393474088292],
 [('she', 'had'), 0.0014142842711384988],
 [('to', 'be'), 0.0013132639660571775],
 [('the', 'Gryphon'), 0.0013132639660571775],
 [('don', 't'), 0.0013132639660571775],
 [('she', 'was'), 0.0012627538135165169],
 [('the', 'Mock'), 0.0012627538135165169],
 [('went', 'on'), 0.0012374987372461865],
 [('the', 'Hatter'), 0.001186988584705526],
 [('and', 'she'), 0.0011617335084351

In [1]:
def predict_next(string, n,corpus, prob):
    token = word_tokenize(string) 
    gram = list(ngrams(token, n))[-1]
    probabilities =  prob
    count = 0
    prediction = {n:[]}
    for each in probabilities:
        if each[0][:-1] == gram:
            count +=1
            prediction[n].append(each[0][-1])
            if count ==1:
                break
        if count<5:
            while(count!=1):
                prediction[n].append("NOT FOUND") 
    return prediction

In [None]:
print("The most likely word from corpus1 is;",predict_next("a great", 2, corpus1_clean))

In [None]:
print("The most likely word from corpus1 is;",predict_next("a great", 2, corpus2_clean))