# Importing all the required libraries

In [76]:
import pandas as pd
import nltk
import numpy as np
import re
import string
import operator
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation, SimpleRNN, LSTM, TimeDistributed
nltk.download('punkt')

#opened the donald trump speech file
speeches= open(r"speeches.txt", encoding="utf8")
speeches = speeches.read()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SREEJITH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [78]:
# used the sentence tokenizer and converted everything to lowercase and then removed "\n", " \' " and unnecessary punctuations.
cleanedsentences=[]
tokenizedsentences = nltk.sent_tokenize(speeches)
for i in tokenizedsentences:
    i=i.lower()
    cleansentence=re.sub("\n", ' ', i)
    cleansentence=re.sub("\’", '', i)
    cleansentence= " ".join("".join([" " if ch in string.punctuation else ch for ch in cleansentence]).split())
    cleanedsentences.append(cleansentence)

# Dividing the dataset into 80% train and 20% test data

In [79]:
train_data = cleanedsentences[:int(len(cleanedsentences)*0.8)]
test_data = cleanedsentences[int(len(cleanedsentences)*0.8):]

# 1)Classical N gram approach

In [129]:
#general ngram function with n>=1, printperplexity decides whether the perplexity should be printed(1) or not(0)
def ngram(n,printperplexity):
    ngram_dict = {}
    total_ngrams = 0
    prod=1
    uniquewords={}
    uniquewordcount=0
    for sentence in train_data:
      words = nltk.word_tokenize(sentence)
      for word in words:
            if word not in uniquewords:
                uniquewordcount+=1
      ngram_list = ngrams(words, n)
      for ngram_combination in ngram_list:
        total_ngrams += 1
        if ngram_combination in ngram_dict:
            ngram_dict[ngram_combination] += 1
        else:
            ngram_dict[ngram_combination] = 1
    ngram_prob_dict = {}
    for key, val in reversed(sorted(ngram_dict.items(), key = operator.itemgetter(1))):
      ngram_prob_dict[str(key)] = val / total_ngrams
      prod=prod*(val**(1/len(ngram_dict)))
    perplexity=product/total_ngrams
    if(printperplexity==0):
        print("\nTotal possible ngrams=")
        print(uniquewordcount**n)
        print("\n Ngrams that actually exist=")
        print(total_ngrams)
        print("\n MLE for top 50 :\n")
        print(list(ngram_prob_dict.items())[:50])
    elif(printperplexity==1):    
        print("Perplexity:")
        print(perplexity)    
    else:
        return total_ngrams,ngram_prob_dict

# a) maximum likelihood estimates of each model:

In [132]:
print("Unigram model")
ngram(1,0)

Unigram model

Total possible ngrams=
135786

 Ngrams that actually exist=
135786

 MLE for top 50 :

[("('the',)", 0.03439235267258775), ("('to',)", 0.03255858483201508), ("('and',)", 0.03210198400424197), ("('i',)", 0.02859646797166129), ("('a',)", 0.02132767737469253), ("('you',)", 0.01840395917104856), ("('of',)", 0.01711516651201155), ("('we',)", 0.015892654618296437), ("('it',)", 0.014353467956932231), ("('that',)", 0.013653837656312139), ("('have',)", 0.013602285949950658), ("('they',)", 0.012821645825048238), ("('going',)", 0.012320857820393855), ("('in',)", 0.011864256992620741), ("('so',)", 0.009109922967021638), ("('is',)", 0.009036277672219522), ("('but',)", 0.00829982472419837), ("('know',)", 0.007916869191227372), ("('were',)", 0.007835859366945046), ("('people',)", 0.0077622140721429304), ("('–',)", 0.007526549128776163), ("('its',)", 0.00741608118657299), ("('be',)", 0.007150958125285376), ("('are',)", 0.006318766294021475), ("('for',)", 0.006208298351818302), ("('not',

In [133]:
print("Bigram model")
ngram(2,0)

Bigram model


  



Total possible ngrams=
18437837796

 Ngrams that actually exist=
122596

 MLE for top 50 :

[("('going', 'to')", 0.012341348820516167), ("('you', 'know')", 0.005522203008254755), ("('we', 'have')", 0.004510750758589187), ("('were', 'going')", 0.004274201442135143), ("('and', 'i')", 0.00391529903096349), ("('to', 'be')", 0.003907142157982316), ("('of', 'the')", 0.0038255734281705763), ("('want', 'to')", 0.0032790629384319226), ("('have', 'to')", 0.0030425136219778787), ("('to', 'do')", 0.0026428268459003554), ("('in', 'the')", 0.0025857287350321383), ("('i', 'dont')", 0.0025775718620509643), ("('a', 'lot')", 0.002479689386276877), ("('i', 'think')", 0.0022757675617475284), ("('have', 'a')", 0.0021534144670299193), ("('i', 'mean')", 0.0021452575940487453), ("('and', 'they')", 0.002055531991255832), ("('i', 'have')", 0.001990277007406441), ("('i', 'said')", 0.001941335769519397), ("('lot', 'of')", 0.0018679239126888317), ("('and', 'we')", 0.0017129433260465268), ("('and', 'you')", 0.0016

In [134]:
print("Trigram model")
ngram(3,0)

Trigram model


  



Total possible ngrams=
2503600242967656

 Ngrams that actually exist=
109775

 MLE for top 50 :

[("('were', 'going', 'to')", 0.004664085629697108), ("('going', 'to', 'be')", 0.0024140286950580735), ("('a', 'lot', 'of')", 0.002076975631974493), ("('not', 'going', 'to')", 0.0015850603507173765), ("('we', 'have', 'to')", 0.001503074470507857), ("('going', 'to', 'have')", 0.0011386927806877705), ("('going', 'to', 'do')", 0.0011204736961967661), ("('by', 'the', 'way')", 0.001111364153951264), ("('one', 'of', 'the')", 0.001065816442723753), ("('im', 'going', 'to')", 0.00101115918925074), ("('i', 'want', 'to')", 0.0009838305625142338), ("('the', 'united', 'states')", 0.0009018446823047142), ("('are', 'going', 'to')", 0.0008654065133227055), ("('i', 'dont', 'know')", 0.000838077886586199), ("('and', 'you', 'know')", 0.000838077886586199), ("('youre', 'going', 'to')", 0.0008289683443406969), ("('theyre', 'going', 'to')", 0.0008198588020951948), ("('its', 'going', 'to')", 0.0008016397176041904

In [135]:
print("Quadgram model")
ngram(4,0)

Quadgram model


  



Total possible ngrams=
339953862591606137616

 Ngrams that actually exist=
97526

 MLE for top 50 :

[("('and', 'were', 'going', 'to')", 0.0008613087791973423), ("('were', 'going', 'to', 'do')", 0.0006049668806267047), ("('were', 'going', 'to', 'have')", 0.0005844595287410537), ("('its', 'going', 'to', 'be')", 0.0005742058527982281), ("('we', 'are', 'going', 'to')", 0.0005331911490269261), ("('not', 'going', 'to', 'happen')", 0.0005126837971412752), ("('were', 'going', 'to', 'win')", 0.00048192276931279866), ("('were', 'not', 'going', 'to')", 0.00048192276931279866), ("('and', 'by', 'the', 'way')", 0.00046141541742714766), ("('thank', 'you', 'very', 'much')", 0.00045116174148432213), ("('i', 'dont', 'want', 'to')", 0.00043065438959867113), ("('were', 'going', 'to', 'be')", 0.00043065438959867113), ("('were', 'going', 'to', 'make')", 0.0004204007136558456), ("('make', 'america', 'great', 'again')", 0.0003998933617701946), ("('a', 'lot', 'of', 'people')", 0.0003486249820560671), ("('to'

# b) Perplexity for test data:

In [124]:
#n is the order of the n-gram(uni,bi,tri etc.)
def perplexity(n):    
    ngram_dict = {}
    total_ngrams = 0
    prod=1
    uniquewords={}
    uniquewordcount=0
    for sentence in test_data:
      words = nltk.word_tokenize(sentence)
      for word in words:
            if word not in uniquewords:
                uniquewordcount+=1
      ngram_list = ngrams(words, n)
      for ngram_combination in ngram_list:
        total_ngrams += 1
        if ngram_combination in ngram_dict:
            ngram_dict[ngram_combination] += 1
        else:
            ngram_dict[ngram_combination] = 1
    ngram_prob_dict = {}
    for key, val in reversed(sorted(ngram_dict.items(), key = operator.itemgetter(1))):
      ngram_prob_dict[str(key)] = val / total_ngrams
      prod=prod*(val**(1/len(ngram_dict)))
    perplexity=product/total_ngrams
    print(perplexity)

In [125]:
print("Perplexity of Unigram:")
perplexity(1)

Perplexity of Unigram:
8.143323838173156e-10


In [126]:
print("Perplexity of Bigram:")
perplexity(2)

Perplexity of Bigram:
9.025873947967283e-10


In [127]:
print("Perplexity of Trigram:")
perplexity(3)

Perplexity of Trigram:


  


1.0093963884643966e-09


In [128]:
print("Perplexity of Quadgram:")
perplexity(4)

Perplexity of Quadgram:


  


1.1376241272664655e-09


# As we see, the perplexity increases with increase in the value of n

# c) Generator function

In [138]:
def Generator(n,lengthofsentence):
#Generate using unigram
    total_num_of_ngrams,ngram_prob_dict=ngram(n,2)
    multi_list = np.random.multinomial(total_num_of_ngrams//lengthofsentence, [1/lengthofsentence]*lengthofsentence, size=1)
    sentencetobeprinted = ''
    idx = 0
    for key, val in reversed(sorted(ngram_prob_dict.items(), key = operator.itemgetter(1))):
      for j in multi_list[0]:
        if idx == j:
          sentencetobeprinted += key[2:-2]
          sentencetobeprinted += ' '
      idx += 1
    print(sentencetobeprinted)

In [176]:
print("Unigram Model Generation: \n")
for i in range(6):
    Generator(1,4+i)

Unigram Model Generation: 


deductability' 
containing' pentagon' clearsighted' differences' happiness' dan' 
asleep' weakened' weakening' abandoned' deploy' england' lightweight' 
disavow' hits' admitted' accomplished' indiana' dominant' marine' circumstance' 
paul' quite' traveled' field' piece' someone' post' mistakes' jr' 


In [178]:
print("Bigram Model Generation: \n")
for i in range(6):
    Generator(2,4+i)

Bigram Model Generation: 



  


than', 'he the', 'brain the', 'trigger leaving', 'and 
who', 'wants im', 'number want', 'us them', 'right the', 'rest 
for', 'office in', 'oh the', 'chinese everybody', 'wanted trump', 'administration gotten', 'to 
think', 'this smartest', 'people of', 'bad so', 'proud they', 'going wouldnt', 'even told', 'you 
deficit', 'with to', 'live from', 'mexico for', 'people wonderful', 'people talented', 'people so', 'beautiful me', 'just 
to', 'cut everybody', 'in is', 'i a', 'really because', 'our to', 'straighten country', 'that to', 'hit way', 'we 


In [179]:
print("Trigram Model Generation: \n")
for i in range(6):
    Generator(3,4+i)

Trigram Model Generation: 



  


of', 'the', 'commentators have', 'millions', 'and a', 'special', 'person an', 'ivy', 'league 
beating', 'hillary', 'clinton i', 'saw', 'a i', 'read', 'it i', 'happen', 'to cheapest', 'thing', 'you 
well', 'have', 'to they', 'say', 'we they', 're', 'doing come', 'back', 'and them', 'and', 'we on', 'the', 'other 
impossible', 'for', 'our a', 'free', 'trader actually', 'a', 'very came', 'to', 'me so', 'many', 'others because', 'we', 'are get', 'the', 'gulf 
is', 'a', 'big look', 'forward', 'to or', 'the', 'other if', 'i', 'would theyre', 'great', 'people but', 'if', 'i because', 'they', 'dont it', 'would', 'have 
dont', 'know', 'who said', 'were', 'going is', 'a', 'very a', 'great', 'feeling you', 'know', 'with to', 'thank', 'my then', 'they', 'said have', 'great', 'respect a', 'president', 'that 


In [181]:
print("Quadgram Model Generation: \n")
for i in range(6):
    Generator(4,4+i)

Quadgram Model Generation: 



  


putting', 'the', 'security', 'of with', 'deep', 'pockets', 'have indiana', 'and', 'ohio', 'whose off', 'and', 'just', 'rapidly 
wounded', 'and', 'their', 'families for', 'our', 'nations', 'loss single', 'american', 'to', 'live live', 'in', 'peace', 'and the', 'only', 'reason', 'the 
we', 'have', 'to', 'protect do', 'that', 'to', 'ford 18', 'trillion', 'in', 'debt he', 'couldnt', 'answer', 'the building', 'all', 'over', 'the and', 'frankly', 'i', 'think 
on', 'the', 'second', 'amendment the', 'tax', 'is', 'so be', 'a', 'country', 'again thats', 'what', 'were', 'going are', 'going', 'to', 'happen are', 'going', 'to', 'happen they', 'get', 'five', 'of 
country', 'in', 'the', 'world they', 'would', 'have', 'said we', 'dont', 'know', 'about we', 'have', '28', '000 couldnt', 'get', 'an', 'environmental she', 'had', 'a', 'tough think', 'we', 're', 'going killing', 'us', 'at', 'the 
allies', 'in', 'the', 'region what', 'do', 'you', 'mean couple', 'of', 'months', 'ago not', 'going', 'to', 'forg

# as we see most of the sentences are unreadable

# 2) Neural Approach

In [198]:
# we take a list of all the characters in all the words
characters = list(set(speeches))
vocabularysize = len(characters)
sequencelength = 100
index_to_char = {index:char for index, char in enumerate(characters)}
char_to_index = {char:index for index, char in enumerate(characters)}
X = np.zeros((len(speeches)//sequencelength, sequencelength, vocabularysize))
Y = np.zeros((len(speeches)//sequencelength, sequencelength, vocabularysize))

In [199]:
for i in range(0, len(speeches)//sequencelength):
    X_sequence = speeches[i*sequencelength:(i+1)*sequencelength]
    X_sequence_index = [char_to_index[value] for value in X_sequence]
    input_sequence = np.zeros((sequencelength, vocabularysize))
    for j in range(sequencelength):
        input_sequence[j][X_sequence_index[j]] = 1.
    X[i] = input_sequence

    Y_sequence = speeches[i*sequencelength+1:(i+1)*sequencelength+1]
    Y_sequence_index = [char_to_index[value] for value in Y_sequence]
    output_sequence = np.zeros((sequencelength, vocabularysize))
    for j in range(sequencelength):
        output_sequence[j][Y_sequence_index[j]] = 1.
    Y[i] = output_sequence

In [200]:
# this is the model for generating the text
def neuralgenerator(model, length):
    ix = [np.random.randint(vocabularysize)]
    y_char = [index_to_char[ix[-1]]]
    X = np.zeros((1, length, vocabularysize))
    for i in range(length):
        X[0, i, :][ix[-1]] = 1
        print(index_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(index_to_char[ix[-1]])
    return ('').join(y_char)

In [201]:
# we use the generate text model on our neural network  to generate 5 sentences
def buildneuralnetwork(model,n):
    epochcount = 0
    number_of_sentences_to_be_printed = n
    while number_of_sentences_to_be_printed > 0:
        print('\n')
        number_of_sentences_to_be_printed -= 1
        model.fit(X, Y, batch_size = 200, verbose = 1, nb_epoch=1)
        epochcount += 1
        neuralgenerator(model, 80)
        if epochcount % 10 == 0:
            model.save_weights('checkpoint_{}_epoch_{}.hdf5'.format(5, epochcount))

# a) Vanilla RNN Based Neural Approach

In [202]:
model = Sequential()
dimensions = 500
RNN = SimpleRNN(dimensions, input_shape=(None, len_of_vocabulary), return_sequences=True)
denselayer = Dense(len_of_vocabulary, name='dense')
model.add(RNN)
model.add(denselayer)
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_9 (SimpleRNN)     (None, None, 500)         296500    
_________________________________________________________________
dense (Dense)                (None, None, 92)          46092     
_________________________________________________________________
activation_10 (Activation)   (None, None, 92)          0         
Total params: 342,592
Trainable params: 342,592
Non-trainable params: 0
_________________________________________________________________
None


In [203]:
buildneuralnetwork(model,5)





  


Epoch 1/1
g the                                                                           

Epoch 1/1
 Iou the  ao  an  ao  an  ao  an  ao  an  ao  an  ao  an  ao  an  ao  an  ao  an

Epoch 1/1
quthe wert thet thet thetheng the theng the theng the theng the theng the theng 

Epoch 1/1
RERENGOTHENERENERENTHENE THE The pre the the per and the wand the wand the were 

Epoch 1/1
ND THENG ING TOUNE TOENG TOUND IN The the walling to be the wand the wand the wa

# b)LSTM based neural approach

In [204]:
model = Sequential()
num_of_dimensions = 500
denselayer = Dense(len_of_vocabulary, name='dense')
model.add(LSTM(num_of_dimensions, input_shape=(None, len_of_vocabulary), return_sequences=True))
model.add(LSTM(num_of_dimensions, return_sequences=True))
model.add(denselayer)
model.add(TimeDistributed(Dense(len_of_vocabulary)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_7 (LSTM)                (None, None, 500)         1186000   
_________________________________________________________________
lstm_8 (LSTM)                (None, None, 500)         2002000   
_________________________________________________________________
dense (Dense)                (None, None, 92)          46092     
_________________________________________________________________
time_distributed_2 (TimeDist (None, None, 92)          8556      
_________________________________________________________________
activation_11 (Activation)   (None, None, 92)          0         
Total params: 3,242,648
Trainable params: 3,242,648
Non-trainable params: 0
_________________________________________________________________
None


In [205]:
buildneuralnetwork(model,5)





  


Epoch 1/1
_                                                                               

Epoch 1/1
z                                                                               

Epoch 1/1
d                                                                               

Epoch 1/1
8 oote tore tore tore tore tore tore tore tore tore tore tore tore tore tore tor

Epoch 1/1
re tore to to to to to to to to to to to to to to to to to to to to to to to to 

# As we can see the sentences are not readable

# c) Neural networks indeed do perform better than classic language models such as ngram since it doesn't suffer from the curse of dimensionality. They achieve this by representing words in a distributed way as a non linear combination of weights in a neural net