# Language Modeling

**Importing Libraries**

In [1]:
"Libraries"
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, SimpleRNN, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
import requests
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import sent_tokenize
from nltk import ngrams
import re
from sklearn.model_selection import train_test_split
from collections import defaultdict

**Fetching data**

In [2]:
def strip_unwanted(text):
    p = re.compile("(\n|\xa0|\r|\t|\s|\\|\"|,|;|:|\.|\?|\!|_|\)|\(|-|\[|\])+")
    w = p.sub(' ', text)
    p = re.compile("speech [0-9]")
    return p.sub('', w)

def clean_trump(link):
    r = requests.get(link)
    data = r.text
    with open("trump_speeches.txt", "w", encoding="utf-8") as f:
        f.write(data)
    data = data.lower()
    sent_tokenized = sent_tokenize(data)
    sentences = []
    for i in range(len(sent_tokenized)):
        sent = ("<s> " + strip_unwanted(sent_tokenized[i]).strip() + " </s>").encode('ascii', 'ignore').decode()
        sentences.append(sent)
    return sentences

link = "https://raw.githubusercontent.com/ryanmcdermott/trump-speeches/master/speeches.txt"
sentences = clean_trump(link)
data_train, data_test = train_test_split(sentences, train_size=0.8, shuffle=False)

In [3]:
"""
Using tf.keras tokenizer I will assign indices to each type in the training data. Also, I will create two utility dictionaries
which will held the mappings of word -> index and index->word
"""
tokenizer = Tokenizer(num_words=None, filters=[], lower=True, split=" ") # data is already filtered, so no need to filter any thing else
tokenizer.fit_on_texts(data_train) # fitting indices to words
word_index = tokenizer.word_index # word -> index
index_word = {} # index -> word
for word in word_index:
    index_word[word_index[word]] = word

vocab_size = len(tokenizer.word_index) # Vocabulary size
print('Vocabulary Size:', vocab_size)
data_train = tokenizer.texts_to_sequences(data_train) # maps all words in the text to their respective indices.

Vocabulary Size: 5695


## Ngram Language model


In [14]:
# Ngram model with add-k smoothing
class ngram_model:
    def __init__(self, ngram_size, data, tokenizer, word_index, index_word):
        self.n = ngram_size
        self.sentences = data
        self.tokenizer = tokenizer
        self.wordtoindex = word_index
        self.indextoword = index_word
        self.vocab = len(self.wordtoindex)
        self.tokens = 0
    
    def train(self):
        self.calculate_count()
        self.calculate_mle()
    
    def evaluate(self, data):
        data1 = self.tokenizer.texts_to_sequences(data)
        min_prob= min(list(self.mle_ngrams.values()))
        ans = 0
        N = 1
        probs = []
        for sent in data1:
            sent1 = []
            for w in sent:
                if w in self.wordtoindex:
                    sent1.append(self.wordtoindex[w])
                else:
                    sent1.append(100000)
            ngram = ngrams(sent1, self.n)
            for gr in ngram:
                N+=1
                if gr in self.mle_ngrams:
                    probs.append(self.mle_ngrams[gr])
                else:
                    probs.append(min_prob)
        probs = np.asarray(probs)
        ans = np.prod(np.power(probs, 1/N))
        return 1/ans
        
    def calculate_count(self):
        self.count_ngrams = defaultdict(int)
        self.count_n_1_grams = defaultdict(int)
        for sent in self.sentences:
            ngram = ngrams(sent, self.n)
            for gr in ngram:
                self.count_ngrams[gr]+=1
                if self.n==1:
                    self.tokens+=1
            if self.n!=1:
                n_1_gram = ngrams(sent, self.n-1)
                for gr in n_1_gram:
                    self.count_n_1_grams[gr]+=1
        
    def calculate_mle(self):
        self.mle_ngrams = defaultdict(int)
        for ngram in self.count_ngrams:
            self.mle_ngrams[ngram] = (self.count_ngrams[ngram])/( (self.tokens) if self.n==1 else (self.count_n_1_grams[ngram[:-1]]) )
    
    def generate_text(self, nsent):
        flag = 1
        prev = ('<s>',)
        started = 0
        ans = ""
        while nsent>0:
            selected = []
            if flag and self.n!=1:
                flag=0
                prev = ('<s>',)
                for ngram in self.mle_ngrams:
                    if ngram[0]=="<s>":
                        selected.append((ngram, self.mle_ngrams[ngram]))
            if flag==0 or self.n==1:
                for ngram in self.mle_ngrams:
                    if ngram[:len(prev)-1]==prev[1:]:
                        if ngram!=(self.wordtoindex["<s>"],):
                            selected.append((ngram, self.mle_ngrams[ngram]))
                            
            prob = [i[1] for i in selected]
            a=sum(prob)
            prob = [i/a for i in prob]
            out = list(np.random.multinomial(10,prob))
            i = out.index(max(out))
            prev = selected[i][0]
            if started:
                if self.indextoword[prev[-1]]=="</s>":
                    ans+= ". "
                else:
                    ans+=' '+self.indextoword[prev[-1]]
            else:
                started = 1
                w = list(prev)
                w = [self.indextoword[i] for i in w]
                if self.n==1:
                    w[0] = w[0].capitalize()
                    s = " ".join(w[:-1])
                else:
                    w[1] = w[1].capitalize()
                    s = " ".join(w[1:-1])
                ans+= s
                if w[-1]=="</s>":
                    ans+= ". "
                else:
                    ans+=" "+w[-1]
                    
            if self.indextoword[prev[-1]]=="</s>":
                started=0
                flag=1
                nsent-=1
        return ans

Creating models for unigram, bigram, trigram and quadgram

In [15]:
# Calculating Unigrams, bigrams, trigrams and quadgrams
unigram_model = ngram_model(ngram_size=1, data=data_train, tokenizer=tokenizer, index_word = index_word, word_index=word_index)
bigram_model = ngram_model(ngram_size=2, data=data_train, tokenizer=tokenizer, index_word = index_word, word_index=word_index)
trigram_model = ngram_model(ngram_size=3, data=data_train, tokenizer=tokenizer, index_word = index_word, word_index=word_index)
quadgram_model = ngram_model(ngram_size=4, data=data_train, tokenizer=tokenizer, index_word = index_word,word_index=word_index)
unigram_model.train()
bigram_model.train()
trigram_model.train()
quadgram_model.train()

In [16]:
print("Existing unigrams:", len(unigram_model.count_ngrams), "Possible unigrams:", unigram_model.vocab)
print("Existing bigrams:", len(bigram_model.count_ngrams), "Possible bigrams:", bigram_model.vocab**2)
print("Existing trigrams:", len(trigram_model.count_ngrams), "Possible trigrams:", trigram_model.vocab**3)
print("Existing quadgrams:", len(quadgram_model.count_ngrams), "Possible quadgrams:", quadgram_model.vocab**4)

Existing unigrams: 5695 Possible unigrams: 5695
Existing bigrams: 43793 Possible bigrams: 32433025
Existing trigrams: 83719 Possible trigrams: 184706077375
Existing quadgrams: 98446 Possible quadgrams: 1051901110650625


### Generating Text

In [17]:
print("Unigram model\n")
print(unigram_model.generate_text(5))
print("\n\nBigram model\n")
print(bigram_model.generate_text(5))
print("\n\nTrigram model\n")
print(trigram_model.generate_text(5))
print("\n\nQuadgram model\n")
print(quadgram_model.generate_text(5))
print()

Unigram model

.  A the. . .  I they. 


Bigram model

 The border.  For the country.  Of the oil.  Rate.  Myself. 


Trigram model

Bent or the yale but they say the words radical islam is coming in. Do anything. Not been a very nice as far as im concerned. Our country is going to be a fact we need to do that. Waste get rid of it. 


Quadgram model

Example trump saudi arabia they make $1 billion a day. The president get away with it again. If we have a lot of people dont even know what the hell is going on. The world hates us. Year ago or so and he gave me a plaque because i supported him. 



As we increase the ngram number the readability of the text increases. This is because of more context. Also, the readability of the quadgram model is actually very nice.

### Perplexity

In [18]:
print("Unigram model\n")
print(unigram_model.evaluate(data_test))
print("\n\nBigram model\n")
print(bigram_model.evaluate(data_test))
print("\n\nTrigram model\n")
print(trigram_model.evaluate(data_test))
print("\n\nQuadgram model\n")
print(quadgram_model.evaluate(data_test))
print()

Unigram model

160663.81660976555


Bigram model

13116.456552524489


Trigram model

1512.6519174092455


Quadgram model

503.89016718551636



# Neural Approach

In [34]:
class neural_model:
    def __init__(self, data, tokenizer, word_index, index_word, max_sequence_len=15, hiddenlayer="Vanilla RNN", epochs=20, batch_size=256):
        self.data= data
        self.tokenizer = tokenizer
        self.sentences = data
        self.wordtoindex = word_index
        self.indextoword = index_word
        self.max_sequence_len = max_sequence_len
        self.vocab = len(self.wordtoindex) + 1
        self.hidden = hiddenlayer
        self.epochs = epochs
        self.batch_size = batch_size
    
    def train(self):
        self.prepare_data()
        self.model = self.model_create()
        
    def evaluate(self, data):
        data1 = self.tokenizer.texts_to_sequences(data)
        input_sequences = []
        for line in data1:
            for i in range(1, len(line)):
                n_gram_sequence = line[:i+1]
                input_sequences.append(n_gram_sequence)
        input_sequences = np.array(pad_sequences(input_sequences,maxlen = self.max_sequence_len, padding='pre'))
        X, y = input_sequences[:,:-1],input_sequences[:,-1]
        y = to_categorical(y, num_classes=self.vocab)
        loss = self.model.evaluate(X,y, verbose=0)
        return np.exp(loss)
        
        
    def prepare_data(self):
        input_sequences = []
        for line in self.data:
            for i in range(1, len(line)):
                n_gram_sequence = line[:i+1]
                input_sequences.append(n_gram_sequence)
        input_sequences = np.array(pad_sequences(input_sequences,maxlen = self.max_sequence_len, padding='pre'))
        self.X, y = input_sequences[:,:-1],input_sequences[:,-1]
        self.y = to_categorical(y, num_classes=self.vocab)

    def model_create(self):
        input_len = self.max_sequence_len - 1
        model = Sequential()
        model.add(Embedding(self.vocab, 128, input_length=input_len))
        if self.hidden == "Vanilla RNN":
            model.add(SimpleRNN(256))
        elif self.hidden == "LSTM":
            model.add(LSTM(256))
        model.add(Dropout(0.2))
        model.add(Dense(self.vocab, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        print(model.summary())
        model.fit(self.X, self.y, epochs=self.epochs, verbose=1, batch_size=self.batch_size)
        return model
        
    def generate_text(self, n_sent):
        in_text, result = ["<s>"], ""
        sent_len=1
        started = 0
        while (n_sent>0):
            encoded = np.array(self.tokenizer.texts_to_sequences([in_text])[0])
            encoded = pad_sequences([encoded], maxlen= self.max_sequence_len-1, padding='pre')
            probs = list(np.transpose(self.model.predict_proba(encoded, verbose=0))[:,0])
            a=sum(probs)*1.01
            probs=[i/a for i in probs]
            out = list(np.random.multinomial(2,probs))
            i = out.index(max(out))
            out_word = self.indextoword[i]
            if out_word=="</s>" or sent_len == self.max_sequence_len:
                out_word = "."
                in_text = ["<s>"]
                n_sent-=1
                sent_len=1
                result = result + out_word
                started=0
            else:
                in_text+=[out_word]
                sent_len+=1
                if not started:
                    started = 1
                    out_word = out_word.capitalize()
                result = result + " " + out_word
        return result.strip()

In [35]:
model_rnn = neural_model(data=data_train, tokenizer=tokenizer, word_index=word_index, index_word=index_word, max_sequence_len=15, hiddenlayer="Vanilla RNN", epochs=20, batch_size=512)
model_rnn.train()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 14, 128)           729088    
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, 256)               98560     
_________________________________________________________________
dropout_7 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 5696)              1463872   
Total params: 2,291,520
Trainable params: 2,291,520
Non-trainable params: 0
_________________________________________________________________
None
Train on 147594 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 

In [36]:
model_lstm = neural_model(data=data_train, tokenizer=tokenizer, word_index=word_index, index_word=index_word, max_sequence_len=15, hiddenlayer="LSTM", epochs=20, batch_size=512)
model_lstm.train()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 14, 128)           729088    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               394240    
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 5696)              1463872   
Total params: 2,587,200
Trainable params: 2,587,200
Non-trainable params: 0
_________________________________________________________________
None
Train on 147594 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 

### Generating Text using the neural models

In [37]:
print("Vanilla RNN model\n\n")
print(model_rnn.generate_text(5))
print("\n\nLSTM model\n\n")
print(model_lstm.generate_text(5))

Vanilla RNN model


Then i will certainly they will get away in the history of the gsa. Its got to be very very well. I dont know. I mean the veterans and the one thing. We have a situation we have to change.


LSTM model


I dont know it. And we. And i said "let me tell you. I was in last. And i have a great honor.


The data generated by these models are pretty readable. Most of the sentences are almost grammatically correct too. They actually seem like sentences from daily conversations.

### Perplexity of the neural models

For this, I am using the in-built function evaluate to calculate the cross-entropy of the test data. Then, perplexity is just exp(cross entropy)

In [38]:
print("Vanilla RNN model:")
print("Perplexity:", model_rnn.evaluate(data_test))
print("\nLSTM model:")
print("Perplexity:", model_lstm.evaluate(data_test))

Vanilla RNN model:
Perplexity: 117.25450446445656

LSTM model:
Perplexity: 104.8216095336064


Perplexity of the LSTM is slightly better than that of the Vanilla RNN model. This is because LSTM improve on the Vanilla RNN model by storing some memory. Hence, they perform better as they are expected to.

The Neural approach works better because of the power of RNNs. Also, there is more context covered in the RNNs than the ngram models. 