# Importing all the required libraries

In [3]:
import pandas as pd
import nltk
import numpy as np
import re
import string
import operator
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation, SimpleRNN, LSTM, TimeDistributed
nltk.download('punkt')
# Load the Donald Trump speech corpus as one string. The context manager
# closes the file handle as soon as the text has been read, instead of
# leaving it open for the lifetime of the kernel.
with open(r"speeches.txt", encoding="utf8") as speeches_file:
    speeches = speeches_file.read()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SREEJITH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
# Split the corpus into sentences and normalise each one:
#   1. lowercase the text,
#   2. turn newlines into spaces,
#   3. delete the typographic apostrophe (’) so that e.g. "don’t" becomes "dont",
#   4. replace every character in string.punctuation (ASCII punctuation) with a
#      space and collapse runs of whitespace into single spaces.
cleanedsentences=[]
tokenizedsentences = nltk.sent_tokenize(speeches)
for sentence in tokenizedsentences:
    cleansentence = sentence.lower()
    cleansentence = re.sub("\n", ' ', cleansentence)
    # Each step must operate on the previous step's result; the pattern is a
    # plain "’" because "\’" is not a valid string escape sequence.
    cleansentence = re.sub("’", '', cleansentence)
    cleansentence = " ".join("".join([" " if ch in string.punctuation else ch for ch in cleansentence]).split())
    cleanedsentences.append(cleansentence)

# Dividing the dataset into 80% train and 20% test data

In [33]:
# The first 80% of the cleaned sentences (in corpus order) form the training
# set; the remaining 20% form the test set. No shuffling is applied.
split_index = int(len(cleanedsentences)*0.8)
train_data, test_data = cleanedsentences[:split_index], cleanedsentences[split_index:]

# Classical N-gram approach

In [43]:
# General n-gram frequency model, intended for n >= 1.
def ngram(n):
    """Print the 50 most frequent n-grams of ``train_data`` with their probabilities.

    Every sentence in the module-level ``train_data`` list is tokenised with
    ``nltk.word_tokenize`` and its n-grams are counted. An n-gram's probability
    is its count divided by the total number of n-grams seen across all
    training sentences (a relative frequency, not a conditional probability).

    Args:
        n: Size of the n-grams to count (1 = unigram, 2 = bigram, ...).

    Returns:
        None. The results are printed rather than returned.
    """
    counts = {}
    for sentence in train_data:
        tokens = nltk.word_tokenize(sentence)
        for gram in ngrams(tokens, n):
            counts[gram] = counts.get(gram, 0) + 1

    # Every n-gram occurrence contributes exactly one count, so the sum of the
    # counts is the total number of n-grams observed.
    total = sum(counts.values())

    # Sort ascending by count, then reverse to get most-frequent-first.
    ranked = sorted(counts.items(), key=operator.itemgetter(1))
    ranked.reverse()

    # Keys are the string form of the n-gram tuple; dict insertion order keeps
    # the descending-frequency ranking.
    probabilities = {str(gram): count / total for gram, count in ranked}

    print ("Top 50 probability values in the sorted order:\n\n")
    print(list(probabilities.items())[:50])

In [42]:
# Unigram model (n = 1): print the 50 most frequent single words in the
# training sentences together with their relative frequencies.
ngram(1)

Top 50 probability values in the sorted order:
[("('the',)", 0.03439235267258775), ("('to',)", 0.03255858483201508), ("('and',)", 0.03210198400424197), ("('i',)", 0.02859646797166129), ("('a',)", 0.02132767737469253), ("('you',)", 0.01840395917104856), ("('of',)", 0.01711516651201155), ("('we',)", 0.015892654618296437), ("('it',)", 0.014353467956932231), ("('that',)", 0.013653837656312139), ("('have',)", 0.013602285949950658), ("('they',)", 0.012821645825048238), ("('going',)", 0.012320857820393855), ("('in',)", 0.011864256992620741), ("('so',)", 0.009109922967021638), ("('is',)", 0.009036277672219522), ("('but',)", 0.00829982472419837), ("('know',)", 0.007916869191227372), ("('were',)", 0.007835859366945046), ("('people',)", 0.0077622140721429304), ("('–',)", 0.007526549128776163), ("('its',)", 0.00741608118657299), ("('be',)", 0.007150958125285376), ("('are',)", 0.006318766294021475), ("('for',)", 0.006208298351818302), ("('not',)", 0.006038914173773438), ("('this',)", 0.006002091526

In [44]:
# Bigram model (n = 2): print the 50 most frequent two-word sequences in the
# training sentences together with their relative frequencies.
ngram(2)

  


Top 50 probability values in the sorted order:


[("('going', 'to')", 0.012341348820516167), ("('you', 'know')", 0.005522203008254755), ("('we', 'have')", 0.004510750758589187), ("('were', 'going')", 0.004274201442135143), ("('and', 'i')", 0.00391529903096349), ("('to', 'be')", 0.003907142157982316), ("('of', 'the')", 0.0038255734281705763), ("('want', 'to')", 0.0032790629384319226), ("('have', 'to')", 0.0030425136219778787), ("('to', 'do')", 0.0026428268459003554), ("('in', 'the')", 0.0025857287350321383), ("('i', 'dont')", 0.0025775718620509643), ("('a', 'lot')", 0.002479689386276877), ("('i', 'think')", 0.0022757675617475284), ("('have', 'a')", 0.0021534144670299193), ("('i', 'mean')", 0.0021452575940487453), ("('and', 'they')", 0.002055531991255832), ("('i', 'have')", 0.001990277007406441), ("('i', 'said')", 0.001941335769519397), ("('lot', 'of')", 0.0018679239126888317), ("('and', 'we')", 0.0017129433260465268), ("('and', 'you')", 0.0016232177232536134), ("('by', 'the')", 0.001509

In [45]:
# Trigram model (n = 3): print the 50 most frequent three-word sequences in the
# training sentences together with their relative frequencies.
ngram(3)

  


Top 50 probability values in the sorted order:


[("('were', 'going', 'to')", 0.004664085629697108), ("('going', 'to', 'be')", 0.0024140286950580735), ("('a', 'lot', 'of')", 0.002076975631974493), ("('not', 'going', 'to')", 0.0015850603507173765), ("('we', 'have', 'to')", 0.001503074470507857), ("('going', 'to', 'have')", 0.0011386927806877705), ("('going', 'to', 'do')", 0.0011204736961967661), ("('by', 'the', 'way')", 0.001111364153951264), ("('one', 'of', 'the')", 0.001065816442723753), ("('im', 'going', 'to')", 0.00101115918925074), ("('i', 'want', 'to')", 0.0009838305625142338), ("('the', 'united', 'states')", 0.0009018446823047142), ("('are', 'going', 'to')", 0.0008654065133227055), ("('i', 'dont', 'know')", 0.000838077886586199), ("('and', 'you', 'know')", 0.000838077886586199), ("('youre', 'going', 'to')", 0.0008289683443406969), ("('theyre', 'going', 'to')", 0.0008198588020951948), ("('its', 'going', 'to')", 0.0008016397176041904), ("('and', 'were', 'going')", 0.000774311090867

In [46]:
# 4-gram ("quadgram") model (n = 4): print the 50 most frequent four-word
# sequences in the training sentences together with their relative frequencies.
ngram(4)

  


Top 50 probability values in the sorted order:


[("('and', 'were', 'going', 'to')", 0.0008613087791973423), ("('were', 'going', 'to', 'do')", 0.0006049668806267047), ("('were', 'going', 'to', 'have')", 0.0005844595287410537), ("('its', 'going', 'to', 'be')", 0.0005742058527982281), ("('we', 'are', 'going', 'to')", 0.0005331911490269261), ("('not', 'going', 'to', 'happen')", 0.0005126837971412752), ("('were', 'going', 'to', 'win')", 0.00048192276931279866), ("('were', 'not', 'going', 'to')", 0.00048192276931279866), ("('and', 'by', 'the', 'way')", 0.00046141541742714766), ("('thank', 'you', 'very', 'much')", 0.00045116174148432213), ("('i', 'dont', 'want', 'to')", 0.00043065438959867113), ("('were', 'going', 'to', 'be')", 0.00043065438959867113), ("('were', 'going', 'to', 'make')", 0.0004204007136558456), ("('make', 'america', 'great', 'again')", 0.0003998933617701946), ("('a', 'lot', 'of', 'people')", 0.0003486249820560671), ("('to', 'take', 'care', 'of')", 0.00032811763017041607), ("