In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout,SimpleRNN 
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as utils
import re
from keras.callbacks import ModelCheckpoint
import math
import codecs
import random
from nltk.tokenize import sent_tokenize

# Classical Approach - N Grams

In [7]:
    
# Load the raw speech transcript; the context manager closes the file handle.
with codecs.open('speech.txt', 'r', 'UTF-8') as f:
    raw_text = f.read()

# Remove the "SPEECH" markers, apostrophes, commas, CRLF line breaks and digits.
# The chain starts from raw_text: previously text_data was read before it was
# ever assigned, which raises NameError on a fresh kernel.
text_data = raw_text.replace("SPEECH", "")
text_data = text_data.replace("\'", "")
text_data = text_data.replace(",", "")
text_data = text_data.replace("\r\n", "")
text_data = re.sub('[0-9]', r'', text_data)
text = text_data

# Split into sentences and wrap each one in <s> ... </s> boundary tokens.
sents = sent_tokenize(text)
new_list = []
for sent in sents:
    # Unwrap "[...]" / "[a|b]" bracket markup, keeping the text after the optional '|'.
    sent = re.sub(r'\[(?:[^\]|]*\|)?([^\]|]*)\]', r'\1', sent)
    # Drop double quotes and full stops.
    sent = re.sub('[\"]', r'',sent)
    sent = re.sub('[.]', r'',sent)
    new_list.append('<s> ' + sent.lower() + ' </s>')
    
def predict(text, N, freq_count):
    """Pick the next token for ``text`` from an N-gram frequency table.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens generated so far.
    N : int
        Order of the model (1 = unigram, 2 = bigram, ...).
    freq_count : dict
        Maps a context string (N-1 space-joined tokens, or '' when N == 1)
        to a dict of ``{next_token: count}``.

    Returns
    -------
    str
        The chosen next token.

    Raises
    ------
    KeyError
        If the context is not a key of ``freq_count``.
    """
    tokens = text.split()
    # Context is the last N-1 tokens; the unigram table lives under the '' key.
    context = ' '.join(tokens[-(N-1):]) if N != 1 else ''
    followers = freq_count[context]

    total = sum(followers.values())
    key_words = list(followers)
    pvals = [count / total for count in followers.values()]

    # Three multinomial draws, then take the most frequently drawn token
    # (argmax returns the first index on ties).
    # NOTE(review): this is not a single sample from pvals — confirm it is intended.
    draws = np.random.multinomial(3, pvals, size=None)
    return key_words[np.argmax(draws)]
        

In [8]:

def ngrams(sent_list, N):
    """Count N-gram continuations over a list of sentences.

    Each sentence is split on whitespace and scanned with a sliding window of
    N tokens. The first N-1 tokens of a window (space-joined; '' when N == 1)
    form the context and the last token is the continuation.

    Parameters
    ----------
    sent_list : list of str
        Sentences to count over.
    N : int
        Window size.

    Returns
    -------
    dict
        ``{context: {next_token: count}}``. Sentences with fewer than N
        tokens contribute nothing.
    """
    freq_count = {}
    for sent in sent_list:
        tokens = sent.split()
        for start in range(len(tokens) - N + 1):
            window = tokens[start:start + N]
            context = ' '.join(window[:-1])
            target = str(window[-1])
            followers = freq_count.setdefault(context, {})
            followers[target] = followers.get(target, 0) + 1
    return freq_count
   
def generator(N, freq_count):
    """Generate one sentence from an N-gram frequency table.

    For N > 1 the sentence starts from a randomly chosen context whose first
    token is '<s>'; for N == 1 it starts from the literal "<s>". Tokens
    returned by ``predict`` are appended until '</s>' is produced.

    Parameters
    ----------
    N : int
        Order of the model.
    freq_count : dict
        Table produced by ``ngrams``.

    Returns
    -------
    str
        The generated text, including the start context and the closing '</s>'.
    """
    if N != 1:
        # Candidate openings: every context whose first token is the start tag.
        start_tag_list = [key for key in freq_count.keys() if key.split()[0] == '<s>']
        start_seq = random.choice(start_tag_list)
    else:
        start_seq = "<s>"

    rand_text = start_seq.lower()

    # Always predict at least one token, then stop once the end tag appears.
    while True:
        next_word = predict(rand_text, N, freq_count)
        rand_text += ' ' + next_word
        if next_word == '</s>':
            break

    return rand_text

In [66]:
# Enter the n-gram order on stdin: 1 for unigram, 2 for bigram, 3 for trigram, etc.
N = int(input())
print("\n Printing output \n")

# Work on a copy of the cleaned sentences.
sent_list = list(new_list)
# The first 1000 sentences build the model; the rest are held out for testing.
train_sent_list = sent_list[:1000]
test_sent_list = sent_list[1000:]

freq_count = ngrams(train_sent_list, N)

# Print five generated sentences.
for sample_idx in range(5):
    print(generator(N, freq_count))

3

 Printing output 

<s> it’s going to be first </s>
<s> an incredible company </s>
<s> doctors are quitting the business </s>
<s> youre not going to be an amazing two monthswe might not even need the rhetoric </s>
<s> both our friends and now every one says the last time anybody saw us beating let’s say china in a certain way i wish i werent doing this but our country is in serious trouble and would be great </s>


### Testing perplexity

In [21]:
# Probability assigned by the n-gram model to every n-gram in the test sentences.
custom_perp = []

for sent in test_sent_list:
    tokens = sent.split()
    # Named sent_ngrams (not `ngrams`) so the ngrams() function defined in an
    # earlier cell is not overwritten; rebinding it made that cell fail on re-run.
    sent_ngrams = [tokens[i:i+N] for i in range(len(tokens)-N+1)]

    for ngram in sent_ngrams:
        # Context = first N-1 tokens ('' for N == 1); last token is the event scored.
        token_seq  = ' '.join(ngram[:-1])
        last_token = str(ngram[-1])

        followers = freq_count.get(token_seq)
        if followers is None or last_token not in followers:
            # Unseen context or unseen continuation: store the placeholder 1,
            # which smoothing() later replaces.
            # NOTE(review): a seen n-gram whose relative frequency is exactly
            # 1.0 is indistinguishable from this placeholder and will also be
            # replaced by smoothing() — confirm whether that is intended.
            custom_perp.append(1)
        else:
            # Relative frequency of the continuation within its context.
            total = sum(followers.values())
            custom_perp.append(followers[last_token]/total)

            
def smoothing(custom_perp, c):
    """Replace every entry equal to 1 with ``c``, in place.

    Parameters
    ----------
    custom_perp : list
        Probabilities; entries equal to 1 are overwritten.
    c : float
        Replacement value.

    Returns
    -------
    list
        The same list object that was passed in, after modification.
    """
    for idx, prob in enumerate(custom_perp):
        if prob == 1:
            custom_perp[idx] = c
    return custom_perp

# Smoothing
# Entries equal to 1 are the placeholders stored for test n-grams that were not
# found in freq_count. They are replaced with c, the smallest value currently in
# the list (min() raises ValueError if custom_perp is empty).
c = min(custom_perp)
# smoothing() edits the list in place and returns that same list.
custom_perp = smoothing(custom_perp,c)

In [15]:
# Perplexity = exp(mean of |log p|) over all scored test n-grams.
neg_log_probs = np.abs(np.log(custom_perp))
log_perp = sum(neg_log_probs) / len(custom_perp)
perplexity = np.exp(log_perp)
print("Perplexity for", N, "gram: ", perplexity)

Perplexity for 1 gram:  378.04495410782584


In [12]:
# Perplexity = exp(mean of |log p|) over all scored test n-grams.
neg_log_probs = np.abs(np.log(custom_perp))
log_perp = sum(neg_log_probs) / len(custom_perp)
perplexity = np.exp(log_perp)
print("Perplexity for", N, "gram: ", perplexity)

Perplexity for 2 gram:  171.27055085484562


In [18]:
# Perplexity = exp(mean of |log p|) over all scored test n-grams.
neg_log_probs = np.abs(np.log(custom_perp))
log_perp = sum(neg_log_probs) / len(custom_perp)
perplexity = np.exp(log_perp)
print("Perplexity for", N, "gram: ", perplexity)

Perplexity for 3 gram:  75.76329766782628


In [22]:
# Perplexity = exp(mean of |log p|) over all scored test n-grams.
neg_log_probs = np.abs(np.log(custom_perp))
log_perp = sum(neg_log_probs) / len(custom_perp)
perplexity = np.exp(log_perp)
print("Perplexity for", N, "gram: ", perplexity)

Perplexity for 4 gram:  29.360625575730996


### Readability of generated text

#### The text is mostly human-readable and grammatically correct. One flaw is that some parts of the sentences are repeated.

# Neural Approach

In [23]:
# Read the whole transcript as a single string with newlines removed.
with open('speech.txt', 'r', encoding='utf-8') as file:
    data = file.read().replace('\n', '')

# Drop non-ASCII characters and lower-case everything.
data = data.encode('ascii', 'ignore').decode('ascii').lower()

# Split on '.' and ','; the capturing group keeps the delimiters as list items.
x = re.split(r'(\.|,)', data)

In [24]:
# Keep the text fragments only: drop the '.' / ',' delimiters, empty strings
# and any fragment containing 'speech'; strip surrounding whitespace.
final_data = [
    piece.strip()
    for piece in x
    if piece not in ('.', ',', '') and 'speech' not in piece
]

# Fixed split: first 20000 fragments for training, the rest for testing.
train_data = final_data[:20000]
test_data = final_data[20000:]

# The vocabulary is fitted on all fragments (train and test).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(final_data)
# One more than the number of distinct words (presumably to leave room for
# index 0, the padding value — confirm against the Keras Tokenizer docs).
total_words = len(tokenizer.word_index) + 1

# Every prefix of length >= 2 of each training fragment becomes one sample.
input_sequences = []
for fragment in train_data:
    token_list = tokenizer.texts_to_sequences([fragment])[0]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Longest prefix length; note the padding below uses a fixed length of 15.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=15, padding='pre')
)

# All but the last token are the model input; the last token is the target.
predictors = input_sequences[:, :-1]
label = input_sequences[:, -1]
label = utils.to_categorical(label, num_classes=total_words)

In [25]:
def generate_text(seed_text, next_words, max_sequence_len, model):
    """Extend ``seed_text`` by greedily predicting ``next_words`` more words.

    Uses the module-level ``tokenizer`` to encode the text.

    Parameters
    ----------
    seed_text : str
        Starting text.
    next_words : int
        Number of words to append.
    max_sequence_len : int
        Sequence length used during training; the model input is padded or
        truncated to ``max_sequence_len - 1`` tokens.
    model : keras model
        Trained model whose ``predict`` returns one score per vocabulary index.

    Returns
    -------
    str
        ``seed_text`` followed by the generated words, space-separated. If the
        predicted index has no word in the tokenizer, an empty string is
        appended for that step.
    """
    # Build the index -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1,
                                   padding='pre')
        # argmax over the output scores replaces Sequential.predict_classes,
        # which no longer exists in current Keras releases.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text

# Simple RNN

In [28]:
# Simple RNN language model: embedding -> SimpleRNN -> dropout -> softmax over the vocabulary.
# Input length is 14: the padded sequences have 15 tokens and the last one is the label.
input_len = 14
model_rnn=Sequential()
model_rnn.add(Embedding(total_words, 100, input_length=input_len))
model_rnn.add(SimpleRNN(300))
model_rnn.add(Dropout(0.2))
model_rnn.add(Dense(total_words, activation='softmax'))

model_rnn.summary()

# Monitor the training loss: fit() below has no validation data, so the default
# monitor ('val_loss') is never available and save_best_only would skip every save.
checkpointer = ModelCheckpoint(filepath='weights_rnn.hdf5', monitor='loss', verbose=1, save_best_only=True)
model_rnn.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
model_rnn.fit(predictors, label, batch_size=128, epochs=8, verbose=1,callbacks=[checkpointer])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 14, 100)           607000    
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 300)               120300    
_________________________________________________________________
dropout_3 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 6070)              1827070   
Total params: 2,554,370
Trainable params: 2,554,370
Non-trainable params: 0
_________________________________________________________________
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x23fe9995940>

### Sample Output

In [44]:
# (seed text, number of words to generate) pairs for the simple RNN.
rnn_prompts = [
    ("i don't ", 8),
    ("how", 8),
    ("that was", 5),
    ("trump", 8),
    ("he", 6),
]
for seed, n_words in rnn_prompts:
    print(generate_text(seed, n_words, 15, model_rnn))

i don't  know that i have a lot of money
how do you know what i want to do
that was a great guy with me
trump is going to be a lot of money
he said i was a great guy


### Perplexity

In [29]:
# Testing: build prefix sequences from the held-out fragments, the same way
# the training sequences were built.
output_sequences = []
for line in test_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    output_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Longest prefix length; note the padding below uses a fixed length of 15.
max_sequence_len = max(len(seq) for seq in output_sequences)
output_sequences = np.array(
    pad_sequences(output_sequences, maxlen=15, padding='pre')
)

print(output_sequences.shape)
# All but the last token are the input; the last token is the target.
x_test = output_sequences[:, :-1]
y_test = output_sequences[:, -1]
y_test = utils.to_categorical(y_test, num_classes=total_words)

# score[0] is the categorical cross-entropy loss, score[1] the accuracy;
# perplexity is reported as exp(loss).
score = model_rnn.evaluate(x_test, y_test, verbose=False)
print('Test score: ', score[0])
print('Test accuracy: ', score[1])
print("Perplexity ", np.exp(score[0]))

(23882, 15)
Test score:  5.2604871629201115
Test accuracy:  0.19659157524994594
Perplexity  192.57528398691878


# LSTM Network


In [45]:
# LSTM language model: embedding -> LSTM -> dropout -> softmax over the vocabulary.
# Input length is 14: the padded sequences have 15 tokens and the last one is the label.
input_len = 14
model=Sequential()
model.add(Embedding(total_words, 100, input_length=input_len))
model.add(LSTM(300))
model.add(Dropout(0.2))
model.add(Dense(total_words, activation='softmax'))

model.summary()

# Monitor the training loss: fit() below has no validation data, so the default
# monitor ('val_loss') is never available and save_best_only would skip every save.
checkpointer = ModelCheckpoint(filepath='weights.hdf5', monitor='loss', verbose=1, save_best_only=True)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(predictors, label, batch_size=128, epochs=8, verbose=1,callbacks=[checkpointer])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 14, 100)           607000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 300)               481200    
_________________________________________________________________
dropout_4 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 6070)              1827070   
Total params: 2,915,270
Trainable params: 2,915,270
Non-trainable params: 0
_________________________________________________________________
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x23fe9b30198>

### Sample Output

In [63]:
# (seed text, number of words to generate) pairs for the LSTM.
lstm_prompts = [
    ("i don't", 11),
    ("how", 9),
    ("that was", 6),
    ("trump", 6),
    ("he", 7),
]
for seed, n_words in lstm_prompts:
    print(generate_text(seed, n_words, 15, model))

i don't want to do it because i dont want to do it
how does you think its going to be a wall
that was a disaster that was a disaster
trump is going to be a wall
he was a disaster for the united states


### Perplexity

In [48]:
# Build prefix sequences from the held-out fragments, the same way the
# training sequences were built.
output_sequences = []
for line in test_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    output_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Longest prefix length; note the padding below uses a fixed length of 15.
max_sequence_len = max(len(seq) for seq in output_sequences)
output_sequences = np.array(
    pad_sequences(output_sequences, maxlen=15, padding='pre')
)

# All but the last token are the input; the last token is the target.
x_test = output_sequences[:, :-1]
y_test = output_sequences[:, -1]
y_test = utils.to_categorical(y_test, num_classes=total_words)

# score[0] is the categorical cross-entropy loss, score[1] the accuracy;
# perplexity is reported as exp(loss).
score = model.evaluate(x_test, y_test, verbose=False)
print('Test score: ', score[0])
print('Test accuracy: ', score[1])
print("Perplexity ", np.exp(score[0]))

Test score:  5.1948463521244514
Test accuracy:  0.20638974960720247
Perplexity  180.34043174397908


### Readability of neural generated text

#### The text is quite readable, but in some places it is grammatically wrong.

## Does the neural network perform better than the classical approach? If so, why? If not, why not?

As far as the perplexity of the models is concerned, the LSTM works better than the simple RNN and the unigram model. On the other hand, the bigram model works better than the LSTM.
For our dataset, classical NLP techniques work better than the neural network approach as far as grammatically correct sentences are concerned. But overall, I would still say the neural network method works better than the classical approach because it is able to generalize better. It can produce sentences that it has never seen, and it takes the context of all of the previous words in the sentence into account to produce its output, hence a higher probability of producing semantically correct sentences. An n-gram model, in contrast, takes only the previous n words as context.