In [0]:
import nltk
nltk.download('punkt')
import math
import re
import numpy as np
frm nltk.util import ngrams

import gensim
from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation, SimpleRNN, Dropout
from keras.models import Sequential
from keras.utils.data_utils import get_file

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
!wget https://raw.githubusercontent.com/ryanmcdermott/trump-speeches/master/speeches.txt

--2019-10-06 12:56:10--  https://raw.githubusercontent.com/ryanmcdermott/trump-speeches/master/speeches.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 924745 (903K) [text/plain]
Saving to: ‘speeches.txt.1’


2019-10-06 12:56:10 (20.6 MB/s) - ‘speeches.txt.1’ saved [924745/924745]



**Text Preprocessing**

In [0]:
with open('speeches.txt') as f:
  text = f.read()
lower_text=nltk.sent_tokenize(text.lower())
sentences=[]
for line in lower_text:
  if re.search(r'[a-z0-9]', line):
    l=re.sub(r'[^a-z0-9]'," ", line)
  sentences.append(l)

**Train Test Split**

In [0]:
train = sentences[:math.ceil(0.8*len(sentences))]
test = sentences[math.ceil(0.8*len(sentences)):]

**Classic Modelling N grams**

In [0]:
def Ngram_model(n, data):
  ngram_list=[]
  for sentence in data:
    sentence=nltk.word_tokenize(sentence)
    padded_sent = list(['<s>']+sentence+['</s>'])
    ngram_list.extend(list(ngrams(padded_sent, n=n)))
  return ngram_list

In [0]:
def Freq_Dist(ngram_list):
  freq=nltk.FreqDist(ngram_list)
  freq_dist={}
  for key in freq:
    freq_dist[key]=freq[key]
  return freq_dist

In [0]:
def MLE_dict(n, data):
  mle_dict={}
  l1=Ngram_model(n, data)
  f1=Freq_Dist(l1)
  if(n!=1):
    l2=Ngram_model(n-1, data)
    f2=Freq_Dist(l2)
    for key in f1:
      x=(' '.join(key))
      y=f1[key]/f2[key[:-1]]
      mle_dict[x]=y
  else:
    for key in f1:
      x=(' '.join(key))
      y=f1[key]/len(l1)
      mle_dict[x]=y
  return mle_dict

In [0]:
def Generator(mle_dict):
  sentence=[]
  pvalues=np.array([mle_dict[key] for key in mle_dict])
  pvalues=pvalues/pvalues.sum()
  while(True):
    poss=list(np.random.multinomial(20, pvalues))
    start=list(mle_dict.keys())[poss.index(max(poss))]
    if '<s>' in start:
      break
    else:
      pass
  sentence=sentence+start.split(' ')
  while(True):
    poss=list(np.random.multinomial(20, pvalues))
    move=list(mle_dict.keys())[poss.index(max(poss))]
    if('</s>' not in move and '<s>' not in move):
      sentence=sentence+move.split(' ')
      pass
    elif('</s>' in move):
      sentence=sentence+move.split(' ')
      break
    
  return ' '.join(sentence)

In [0]:
def Perplexity(n, mle_dict, test):
  ngram_test=Ngram_model(n, test)
  perplexity=0
  N=len(ngram_test)
  for key in ngram_test:
    if key in mle_dict:
      perplexity=perplexity-(math.log(mle_dict(key))/N)
    else:
      perplexity=perplexity-(math.log(1/len(mle_dict))/N)
  perplexity=math.exp(perplexity)
  return perplexity

**Perplexity of N gram Models**

In [0]:
for n in range(1, 5):
  mle=MLE_dict(n, train)
  p=Perplexity(n, mle, test)
  print("Perplexity of "+str(n)+"-gram : "+str(p))

Perplexity of 1-gram : 5214.999999982188
Perplexity of 2-gram : 41145.99999998437
Perplexity of 3-gram : 80793.00000049338
Perplexity of 4-gram : 98677.9999992146


**Observation : Perplexity of unigram < bigram < trigram < quadgram**

**Random Text Generation of N gram Models**

In [0]:
for n in range(1, 5):
  mle=MLE_dict(n, train)
  print("Generation text : "+str(n)+"-gram")
  print('')
  for i in range(5):
    print(Generator(mle))
  print('')

Generation text : 1-gram

<s> </s>
<s> </s>
<s> s to </s>
<s> to and the </s>
<s> </s>

Generation text : 2-gram

<s> it executive orders place </s>
<s> so terraza floor resources are overturn the right now feet </s>
<s> they earlier </s>
<s> what infrastructure </s>
<s> you disaster </s>

Generation text : 3-gram

<s> containing the place is falling an army tank trafficking and the not fair because 2 trillion and to nothing </s>
<s> it was enemies something that made him look know somebody is off stage everybody tremendous and we have jeb bush china necessary to you and when the battleships we election is tougher for the wall coming for a very short period i like the highways and somebody sending people that importing extremism through be fantastic and persuasive but we to appreciate what first we need a house </s>
<s> in fact to be a rapidly expanded their cost of the say what </s>
<s> cause abraham and enemies must and medicaid without than making america statistic that s mean most 

**Observation : Readability of quadgram > trigram > bigram > unigram**

**Neural Language Modelling : LSTM and RNN**

In [0]:
def Neural_training(data, model_name):
  
  max_length=8
  sent_data=[['<s>']+line.split()[:max_length]+['</s>'] for line in data]
  
  word_model = gensim.models.Word2Vec(sent_data, size=10, min_count=1, iter=100)
  trained_vocabs = word_model.wv.vocab
  trained_weights = np.array(word_model.wv.vectors)
  vocab_size = word_model.wv.vectors.shape[0]
  embedding_size = word_model.wv.vectors.shape[1]
  
  model = Sequential()
  model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[trained_weights]))
  if(model_name=='LSTM'):
    model.add(LSTM(units=embedding_size))
  if(model_name=='RNN'):
    model.add(SimpleRNN(units=embedding_size))
  model.add(Dropout(0.1))
  model.add(Dense(units=vocab_size))
  model.add(Activation('softmax'))
  model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
  
  corpus=[]
  for i, line in enumerate(sent_data):
    for j in range(len(line)-2):
      for k in range(j+2,len(line)):
        corpus.append(line[j:k])
  
  train_X = np.zeros([len(corpus), max_length+2], dtype=np.int32)
  train_y = np.zeros([len(corpus)], dtype=np.int32)
  for i, line in enumerate(corpus):
    for j, word in enumerate(line[:-1]):
      train_X[i, j] = word_model.wv.vocab[word].index
    train_y[i]=word_model.wv.vocab[line[-1]].index
  model.fit(train_X, train_y, batch_size=128, epochs=5)
  
  return (word_model, model)

In [0]:
def Neural_generator(word_model, model):
  x=['<s>']
  i=0
  while(i<=10):
    i=i+1
    windex_x=[word_model.wv.vocab[word].index for word in x]
    preds = model.predict(x=np.array([windex_x]))
    preds=preds/preds.sum()
    p = np.random.multinomial(1, preds[0], 1)
    idx = np.argmax(p)
    w=word_model.wv.index2word[idx]
    if(w=='</s>'):
      x.append(w)
      break
    else:
      x.append(w)
      pass
  return(' '.join(x))

**Perplexity of Neural Models**

In [0]:
def Neural_perplexity(model, test):
  
  max_length=8
  sent_data=[['<s>']+line.split()[:max_length]+['</s>'] for line in test]
  
  word_model = gensim.models.Word2Vec(sent_data, size=10, min_count=1, iter=100)
  trained_vocabs = word_model.wv.vocab
  trained_weights = np.array(word_model.wv.vectors)
  vocab_size = word_model.wv.vectors.shape[0]
  embedding_size = word_model.wv.vectors.shape[1]
  
  corpus=[]
  for i, line in enumerate(sent_data):
    for j in range(len(line)-2):
      for k in range(j+2,len(line)):
        corpus.append(line[j:k])
        
  N=len(corpus)
  perplexity=0
  for i, line in enumerate(corpus):
    x=line[:-1]
    y=line[-1]
    windex_x=[word_model.wv.vocab[word].index for word in x]
    preds=model.predict(x=np.array([windex_x]))
    idx=word_model.wv.vocab[y].index
    prob=preds[0][idx]
    perplexity=perplexity-(math.log(prob)/N)
  perplexity=math.exp(perplexity)
  
  return perplexity

**Random Text Generation of Neural Models using start words**

In [0]:
rnn_word_model, rnn_model =Neural_training(train, 'RNN')






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
lstm_word_model, lstm_model =Neural_training(train, 'LSTM')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
print("Generation text : RNN")
for i in range(5):
  print(Neural_generator(rnn_word_model, rnn_model))

Generation text : RNN
<s> deals t break okay if the plaque planet government preparing cell
<s> in motorcycle equipment moines 500 <s> take simplify just wheelhouse does
<s> steal calling grabs same executive heh lower loss times easing prefer
<s> for test jets spent refugees vladimir deductions joke an stagnant letter
<s> together bonded brings rude vision leader truth forum potential expression 17th


In [0]:
print("Generation text : LSTM")
for i in range(5):
  print(Neural_generator(lstm_word_model, lstm_model))

Generation text : LSTM
<s> corporate sell precedent energy cheaply religion financing league discussing cranes missile
<s> wait ireland you get player ought calling fans home very affects
<s> bad ensure negative landscape within decapitating much isis right bought rude
<s> using highest joking mess space president 15 personal community watches scheduled
<s> follow tea passion agreement admit theory barn interesting of coast weight


**Observation : Readabilty of LSTM > RNN**

In [0]:
p_rnn=Neural_perplexity(rnn_model, test)
p_lstm=Neural_perplexity(lstm_model, test)

In [0]:
print("Perplexity RNN : "+str(p_rnn))
print("Perplexity LSTM : "+str(p_lstm))

Perplexity RNN : 1727.7609184691341
Perplexity LSTM : 1382.8191686225555


**Observation : Perplexity of RNN > LSTM**

**Observation : Neural Models perform better than Classical Models. Because of the recurrence property of these RNN Models**