In [1]:
import spacy
import pickle
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
# !pip install bcolz
import bcolz

In [2]:
# read files: every corpus is loaded as a list of lines with the trailing '\n' removed
def _read_lines(path):
  """Return the lines of the text file at `path`, each with trailing newlines stripped."""
  with open(path, 'r') as handle:
    return [row.rstrip('\n') for row in handle]

text = _read_lines('/content/drive/MyDrive/Colab Notebooks/prideAndPrejudice.txt')
test1 = _read_lines('/content/drive/MyDrive/Colab Notebooks/test_1.txt')
test2 = _read_lines('/content/drive/MyDrive/Colab Notebooks/test_2.txt')
tweet = _read_lines('/content/drive/MyDrive/Colab Notebooks/tweet.txt')

In [3]:
# preprocess: split every line of `text` into sentences with spaCy, lowercase
# the tokens and wrap each sentence in '<s>' ... '</s>' boundary markers
nlp = spacy.load("en_core_web_sm")
dics = Counter()   # token -> number of occurrences (boundary markers included)
processed = []     # one padded token list per sentence
for raw_line in text:
  sentence_texts = [span.text for span in nlp(raw_line).sents]
  for sentence_text in sentence_texts:
    # the sentence string is run through the pipeline a second time to tokenize it
    padded = ['<s>', *(tok.text.lower() for tok in nlp(sentence_text)), '</s>']
    dics.update(padded)
    processed.append(padded)

In [4]:
# build vocabulary and mappings
# Keep only tokens seen more than once, most frequent first (ties keep their
# first-seen order), and reserve ids 0 and 1 for the padding / unknown symbols.
vocab = ['_PAD', '_UNK'] + [tok for tok, count in dics.most_common() if count > 1]
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = dict(enumerate(vocab))

In [5]:
# convert token sequences to integer sequences
def convert(text, mapping, vocab, seq_len = 5):
  """Map tokenized sentences to lists of integer ids, right-padding short ones.

  Args:
    text: iterable of sentences, each a sequence of token strings.
    mapping: dict from token to integer id. It must contain every token that
      is in `vocab` (or is one of the special symbols) and appears in `text`,
      and '_UNK' whenever an out-of-vocabulary token is encountered.
    vocab: collection of in-vocabulary tokens (list, set, dict keys, ...).
    seq_len: sentences shorter than this are right-padded up to this length;
      longer sentences are returned at their full length.

  Returns:
    A list with one list of ids per sentence.

  Raises:
    KeyError: if a token that counts as known is missing from `mapping`.
  """
  # Hash-based membership: `vocab` is often a plain list, which would make
  # every `in` test a linear scan over the whole vocabulary.
  known = set(vocab)
  known.update(('_PAD', '_UNK', '<s>', '</s>'))
  # Pad with the id of '_PAD' when the mapping defines one (0 otherwise).
  pad_id = mapping.get('_PAD', 0)

  sequences = []
  for sentence in text:
    ids = [mapping[word] if word in known else mapping['_UNK'] for word in sentence]
    if len(ids) < seq_len:
      ids += [pad_id] * (seq_len - len(ids))
    sequences.append(ids)
  return sequences

In [6]:
# create sliding windows of `seq_len` tokens (5 by default)
def create_seq(text, seq_len = 5):
    """Cut one token sequence into overlapping windows of `seq_len` items.

    A sequence with at most `seq_len` items is returned unchanged as the only
    element of the result; otherwise every contiguous window of exactly
    `seq_len` items is returned, in order, with a stride of one.
    """
    n_tokens = len(text)
    # short (or exactly window-sized) input: nothing to slide over
    if n_tokens <= seq_len:
        return [text]
    return [text[end - seq_len:end] for end in range(seq_len, n_tokens + 1)]

In [7]:
def in_out_data(text, mapping, vocab, seq_len = 5):
  """Build next-token training pairs from tokenized sentences.

  Each sentence is converted to ids with `convert` (which pads short ones to
  `seq_len`), cut into windows of `seq_len` ids with `create_seq`, and every
  window `s` yields the input `s[:-1]` and the target `s[1:]`.

  Args:
    text: iterable of tokenized sentences.
    mapping: token -> id dict, forwarded to `convert`.
    vocab: collection of in-vocabulary tokens, forwarded to `convert`.
    seq_len: window length; inputs and targets each hold `seq_len - 1` ids.

  Returns:
    Tuple `(x, y)` of numpy arrays built from the input and target lists.
  """
  from itertools import chain

  data = convert(text, mapping, vocab, seq_len =seq_len)
  # Flatten the per-sentence window lists in linear time; `sum(lists, [])`
  # re-copies the accumulated list on every addition (quadratic overall).
  seqs = list(chain.from_iterable(create_seq(ids, seq_len=seq_len) for ids in data))

  # create inputs and targets (x and y): the target is the input shifted by one
  x = [s[:-1] for s in seqs]
  y = [s[1:] for s in seqs]

  return np.array(x),np.array(y)

In [8]:
def get_batches(arr_x, arr_y, batch_size):
    """Yield consecutive `(x, y)` mini-batches of exactly `batch_size` rows.

    Args:
      arr_x: 2-D array of inputs, one example per row.
      arr_y: 2-D array of targets, row-aligned with `arr_x`.
      batch_size: number of rows per batch.

    Yields:
      Tuples `(arr_x[start:stop, :], arr_y[start:stop, :])`.

    Only full batches are produced; a trailing remainder smaller than
    `batch_size` is skipped. The previous loop bound
    (`range(batch_size, n_rows, batch_size)`) additionally skipped the last
    *full* batch whenever `n_rows` was an exact multiple of `batch_size`.
    """
    n_rows = arr_x.shape[0]
    # `start` ranges over every offset at which a full batch still fits
    for start in range(0, n_rows - batch_size + 1, batch_size):
      stop = start + batch_size
      yield arr_x[start:stop, :], arr_y[start:stop, :]

In [9]:
class WordLSTM(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> dropout -> linear.

    Args:
      vocab: sized collection of tokens; its length sets the number of
        embedding rows and output classes when `pretrain` is None.
      pretrain: optional 2-D numpy array `(num_tokens, dim)` of initial
        embedding weights. When given, the embedding (and the output layer
        size) is taken from its shape and the weights stay trainable
        (`freeze=False`).
      n_hidden: LSTM hidden size.
      n_layers: number of stacked LSTM layers.
      drop_prob: dropout probability used between LSTM layers and on the LSTM
        output.
      lr: stored on the instance as `self.lr`; not read anywhere in this class.
    """

    def __init__(self, vocab, pretrain = None, n_hidden=256, n_layers=4, drop_prob=0.3, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # Only the embedding differs between the two configurations; the
        # LSTM and output layer are sized from (n_tokens, emb_dim) either way.
        if pretrain is None:
          n_tokens, emb_dim = len(vocab), 200
          self.emb_layer = nn.Embedding(n_tokens, emb_dim)
        else:
          n_tokens, emb_dim = pretrain.shape
          self.emb_layer = nn.Embedding.from_pretrained(torch.from_numpy(pretrain).float(),freeze=False)

        ## define the LSTM
        self.lstm = nn.LSTM(emb_dim, n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)
        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, n_tokens)

        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)


    def forward(self, x, hidden):
        ''' Forward pass through the network.

            Args:
              x: integer tensor of token ids, batch first.
              hidden: `(h, c)` tuple of LSTM states.

            Returns:
              `(out, hidden)`: `out` holds one row of vocabulary scores per
              input position (all batch and time positions flattened into the
              first dimension); `hidden` is the updated LSTM state. '''

        ## pass input through embedding layer
        embedded = self.emb_layer(x)

        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)

        ## pass through a dropout layer
        out = self.dropout(lstm_output)

        # flatten batch and time so the linear layer scores every position
        out = out.reshape(-1, self.n_hidden)

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden


    def init_hidden(self, batch_size):
        ''' Return zeroed `(h, c)` LSTM states for `batch_size` sequences.

            Each tensor has shape `(n_layers, batch_size, n_hidden)` and is
            created with the dtype and on the device of the model's own
            parameters, so the states always match the model regardless of
            whether a GPU merely happens to be available. '''
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [10]:
def train(net, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32, arr_x=None, arr_y=None):
    """Train `net` as a next-token language model with Adam and cross-entropy.

    Args:
      net: model exposing `init_hidden(batch_size)` and `net(inputs, hidden)`.
      epochs: number of passes over the data.
      batch_size: rows per mini-batch (batches come from `get_batches`).
      lr: Adam learning rate.
      clip: maximum gradient norm passed to `clip_grad_norm_`.
      print_every: currently unused; the progress print below is commented out.
      arr_x, arr_y: input / target id arrays. When omitted, the module-level
        `x_int` / `y_int` are used, as before.

    The hidden state is reset at the start of each epoch and carried (detached)
    from one batch to the next within an epoch. The model is moved to the GPU
    when one is available and otherwise trained on the CPU.
    """
    if arr_x is None:
        arr_x = x_int
    if arr_y is None:
        arr_y = y_int

    # move the model first so the optimizer is built over the final parameters
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net.to(device)

    # optimizer
    opt = torch.optim.Adam(net.parameters(), lr=lr)

    # loss
    criterion = nn.CrossEntropyLoss()

    counter = 0

    net.train()

    for e in range(epochs):

        # initialize hidden state on the same device as the model
        h = tuple(state.to(device) for state in net.init_hidden(batch_size))

        for x, y in get_batches(arr_x, arr_y, batch_size):
            counter+= 1

            # convert numpy arrays to integer tensors on the training device
            inputs = torch.from_numpy(x).long().to(device)
            targets = torch.from_numpy(y).long().to(device)

            # detach hidden states so gradients do not flow into earlier batches
            h = tuple(state.detach() for state in h)

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss; targets are flattened to line up with the
            # model's flattened (positions x vocabulary) output
            loss = criterion(output, targets.reshape(-1))

            # back-propagate error
            loss.backward()

            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)

            # update weights
            opt.step()

            # if counter % print_every == 0:

              # print("Epoch: {}/{}...".format(e+1, epochs),
              #       "Step: {}...".format(counter))

In [11]:
# predict next token
def predict(net, tkn, h=None):
  """Feed one token to `net` and sample the next token from its top 3 guesses.

  Args:
    net: model exposing `net(inputs, hidden)` and `init_hidden(batch_size)`.
    tkn: token string; looked up in the module-level `word2idx`.
    h: `(h, c)` hidden state from a previous call, or None to start from a
      fresh zero state (previously None crashed when the state was detached).

  Returns:
    `(token, h, p)`: the sampled next token (via the module-level `idx2word`),
    the updated hidden state, and `p`, a 1-D numpy array with the probability
    of every vocabulary entry.

  Raises:
    KeyError: if `tkn` is not in `word2idx`.
  """
  # run on whatever device the model lives on instead of assuming a GPU
  device = next(net.parameters()).device

  # tensor inputs: a batch holding a single one-token sequence
  x = np.array([[word2idx[tkn]]])
  inputs = torch.from_numpy(x).to(device)

  if h is None:
    h = net.init_hidden(1)

  # detach hidden state from history
  h = tuple(state.detach().to(device) for state in h)

  # inference only: no autograd graph is needed
  with torch.no_grad():
    # get the output of the model
    out, h = net(inputs, h)

    # get the token probabilities
    p = F.softmax(out, dim=1)

  p = p.cpu().numpy()
  p = p.reshape(p.shape[1],)

  # get indices of top 3 values, most probable first
  top_n_idx = p.argsort()[-3:][::-1]

  # randomly select one of the (up to three) candidate indices
  sampled_token_index = top_n_idx[random.sample(range(len(top_n_idx)), 1)[0]]

  # return the predicted token, the hidden state,
  # and the probability distribution over the vocabulary
  return idx2word[sampled_token_index], h , p


# function to generate text
def sample(net, prime='<s>', max_len=1000):
    """Generate one sentence by repeatedly sampling from `predict`.

    Args:
      net: trained model; it is switched to eval mode and moved to the GPU
        when one is available (otherwise it runs on the CPU).
      prime: whitespace-separated tokens used to warm up the hidden state.
      max_len: upper bound on the number of tokens in the result, so a model
        that never emits '</s>' cannot loop forever. Pass None for no limit.

    Returns:
      The prime tokens followed by the generated tokens, joined by spaces.
      Generation stops as soon as '</s>' is produced (including when it is the
      very first sampled token) or `max_len` is reached.

    Raises:
      ValueError: if `prime` contains no tokens.
    """
    toks = prime.split()
    if not toks:
      raise ValueError("prime must contain at least one token")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net.to(device)

    net.eval()

    # batch size is 1
    h = net.init_hidden(1)

    # feed the prime through the model; only the last prediction is kept
    for t in toks:
      token, h , _ = predict(net, t, h)

    toks.append(token)

    # keep extending until the end marker appears or the length cap is hit
    while token != '</s>' and (max_len is None or len(toks) < max_len):
      token, h , _ = predict(net, toks[-1], h)
      toks.append(token)

    return ' '.join(toks)

In [12]:
# Experiment 1: Pride and Prejudice corpus, default window length (seq_len=5),
# embeddings learned from scratch.
# x_int / y_int are assigned at module level here; later cells overwrite them.
x_int,y_int = in_out_data(processed,word2idx,vocab)
net1 = WordLSTM(vocab)
# requires a CUDA device
net1.cuda()
print(net1)
train(net1, batch_size = 32, epochs=20, print_every=256)

WordLSTM(
  (emb_layer): Embedding(3913, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=256, out_features=3913, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [13]:
# Persist only the learned parameters (state_dict) of net1 to Google Drive.
torch.save(net1.state_dict(),'/content/drive/MyDrive/Colab Notebooks/net1')

2.1: Output

In [14]:
# Print ten sentences generated by net1, each primed with '<s>' (sample's default).
for i in range(10):
  print(sample(net1))

<s> " you must not have _UNK the subject of her own , she could be prevailed to her husband and lydia , though he was gone in her way to be _UNK , by the world ; and , though the _UNK and her husband was _UNK by the _UNK . </s>
<s> she had no longer to speak for his _UNK . ' that he had been so well as they had gone to longbourn . " said she , as soon , i hope you are mistaken . ' you will be a _UNK thing , and she could be in a similar _UNK to be sure ! " </s>
<s> i have no notion of the subject , she was not so far as she had ever inspired at her with her _UNK , and she had been _UNK , by her nephew , who had given her , when her marriage had been _UNK , she dropt all the room , was in a different room . ' she is very much , and i hope it was to get a sheet of _UNK and her daughter , she had been so happy at the same _UNK . " cried she was not to the library ; but , as she could not be so much as she was in a few days at the door , she was not to the library , she was almost ashamed 

In [15]:
# calculate the probability for a test sentence
def prob4sent(text,net,vocab):
  """Return the log-probability of a whitespace-tokenized sentence under `net`.

  Args:
    text: the sentence as a single string; tokens are split on whitespace.
    net: trained model, scored through `predict`.
    vocab: collection of in-vocabulary tokens; any other token is scored as
      '_UNK'.

  Returns:
    `(log_prob, n_tokens)`. The first token is scored with a uniform
    `1/len(vocab)` probability and every following token with the model's
    probability given the preceding tokens. An empty sentence contributes
    `(0.0, 0)` so that it adds neither probability mass nor tokens.
  """
  sentence = text.split()
  if not sentence:
    return 0.0, 0

  # Scoring must not depend on whether some earlier cell happened to switch
  # the model to eval mode: dropout has to be off here.
  net.eval()

  # hash-based membership instead of scanning a vocabulary list per token
  known = set(vocab)

  h = net.init_hidden(1)
  prob = np.log(1/len(vocab))
  for prev_tok, next_tok in zip(sentence, sentence[1:]):
    prev = prev_tok if prev_tok in known else '_UNK'
    pred = next_tok if next_tok in known else '_UNK'
    # the sampled token is ignored; only the distribution `p` is needed
    _, h, p = predict(net, prev, h)
    prob += np.log(p[word2idx[pred]])
  # return the log-probability and the number of words in the sentence
  return prob,len(sentence)

In [16]:
# calculate perplexity
def perplexity(test,net,vocab):
  """Return the perplexity of `net` over the sentences in `test`.

  Sums the per-sentence log-probabilities and token counts reported by
  `prob4sent` and returns e raised to the negated average log-probability
  per token.
  """
  total_tokens = 0
  total_logprob = 0
  for sentence in test:
    logprob, n_tokens = prob4sent(sentence, net, vocab)
    total_tokens += n_tokens
    total_logprob += logprob
  return np.e**(-1/total_tokens*total_logprob)

2.2 Perplexity

In [17]:
# Perplexity of net1 on test1. predict() reads the module-level word2idx,
# which at this point is the mapping built from `vocab`.
p = perplexity(test1,net1,vocab)
print('The perplexity for part 2.2 is', p)

The perplexity for part 2.2 is 157.4577520315396


In [18]:
# with seq_len 25
# Experiment 2: same corpus and vocabulary as net1, but windows of 25 tokens.
# Overwrites the module-level x_int / y_int.
x_int,y_int = in_out_data(processed,word2idx,vocab,seq_len=25)
net2 = WordLSTM(vocab)
# requires a CUDA device
net2.cuda()
print(net2)
train(net2, batch_size = 32, epochs=20, print_every=256)

WordLSTM(
  (emb_layer): Embedding(3913, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=256, out_features=3913, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [19]:
# Persist only the learned parameters (state_dict) of net2 to Google Drive.
torch.save(net2.state_dict(),'/content/drive/MyDrive/Colab Notebooks/net2')

2.3 Output <br>
We can see that the outputs are longer than before, with fewer `_UNK` tokens.

In [20]:
# Print ten sentences generated by net2, each primed with '<s>' (sample's default).
for i in range(10):
  print(sample(net2))

<s> " i am not to say you will not be in company . </s>
<s> i can never help laughing . </s>
<s> i shall not have _UNK so much as i ought for the trouble that must become up it ; and i shall have hated you at pemberley . ' , and she had not seen that she had not seen the place ; though , though the sake of her nephew 's letter ; though , in spite of every thing ; though , in quitting her almost the substance of mr. bennet and myself with a few struggles to derbyshire , and was very much awe and _UNK in his own , he was now anxious that her own opinion , relating the _UNK which might be always flying so much in her family ; and as they changed the occasion , in a _UNK _UNK of the season and two , tax in a letter from the _UNK of his meetings , instead of his meetings that her husband had not been spent in his own , he had always been ashamed of him , unless she could hardly see the _UNK , in spite that his manners was always so insufficient ; and , after return to her husband ' _UNK . '

2.4 Perplexity 

In [21]:
# Perplexity of net2 (seq_len=25) on test1, scored with the same `vocab` as net1.
p = perplexity(test1,net2,vocab)
print('The perplexity for part 2.4 is', p)

The perplexity for part 2.4 is 250.72384241976118


2.5 Perplexity

In [22]:
# Perplexity of net1 on the second test set (test2), scored with `vocab`.
p = perplexity(test2,net1,vocab)
print('The perplexity for part 2.5 is', p)

The perplexity for part 2.5 is 779.7476435290162


In [23]:
# words = []
# idx = 0
# word2idx = {}
# vectors = bcolz.carray(np.zeros(1), rootdir=f'/content/drive/MyDrive/Colab Notebooks/6B.100.dat', mode='w')

# with open(f'/content/drive/MyDrive/Colab Notebooks/glove.6B.100d.txt', 'rb') as f:
#     for l in f:
#         line = l.decode().split()
#         word = line[0]
#         words.append(word)
#         word2idx[word] = idx
#         idx += 1
#         vect = np.array(line[1:]).astype(np.float)
#         vectors.append(vect)
    
# vectors = bcolz.carray(vectors[1:].reshape((400000, 100)), rootdir=f'/content/drive/MyDrive/Colab Notebooks/6B.100.dat', mode='w')
# vectors.flush()
# pickle.dump(words, open(f'/content/drive/MyDrive/Colab Notebooks/6B.100_words.pkl', 'wb'))
# pickle.dump(word2idx, open(f'/content/drive/MyDrive/Colab Notebooks/6B.100_idx.pkl', 'wb'))

In [24]:
# Load the cached GloVe 6B 100-d vectors together with their word list and
# word -> row index (the cache files are the ones written by the commented-out
# cell above).
# NOTE: `word2idx` is temporarily rebound to the GloVe index here; the model
# vocabulary mapping has to be rebuilt before predict()/sample() are used again.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# caches you created yourself.
vectors = bcolz.open('/content/drive/MyDrive/Colab Notebooks/6B.100.dat')[:]
with open('/content/drive/MyDrive/Colab Notebooks/6B.100_words.pkl', 'rb') as fh:
    words = pickle.load(fh)
with open('/content/drive/MyDrive/Colab Notebooks/6B.100_idx.pkl', 'rb') as fh:
    word2idx = pickle.load(fh)

# word -> embedding vector
glove = {w: vectors[word2idx[w]] for w in words}

In [25]:
# Vocabulary for the GloVe-initialised model: every token counted in `dics`
# (no frequency cut-off) behind the two reserved symbols.
vocab2 = ['_PAD', '_UNK', *dics.keys()]
word2idx = dict(zip(vocab2, range(len(vocab2))))
idx2word = dict(enumerate(vocab2))
matrix_len = len(vocab2)
# one 100-d row per vocabulary entry
weights_matrix = np.zeros((matrix_len, 100))
words_found = 0   # number of vocabulary entries that have a GloVe vector

for row, token in enumerate(vocab2):
    if token in glove:
        weights_matrix[row] = glove[token]
        words_found += 1
    else:
        # no pretrained vector: draw a random initial embedding
        weights_matrix[row] = np.random.normal(scale=0.6, size=(100))

In [26]:
# Experiment 3: Pride and Prejudice with GloVe-initialised embeddings.
# The in-vocabulary test passed to convert() is membership in the GloVe key
# set, so corpus tokens without a GloVe vector are encoded as '_UNK' even
# though weights_matrix holds a (random) row for them.
x_int,y_int = in_out_data(processed,word2idx,glove.keys())
net3 = WordLSTM(vocab2 ,pretrain = weights_matrix)
# requires a CUDA device
net3.cuda()
print(net3)
train(net3, batch_size = 32, epochs=20, print_every=256)

WordLSTM(
  (emb_layer): Embedding(6378, 100)
  (lstm): LSTM(100, 256, num_layers=4, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=256, out_features=6378, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [27]:
# Persist only the learned parameters (state_dict) of net3 to Google Drive.
torch.save(net3.state_dict(),'/content/drive/MyDrive/Colab Notebooks/net3')

2.6 Output<br>
We can see that in the output some phrases repeat a lot, for example 'I am afraid' and 'she had been'.

In [28]:
# Print ten sentences generated by net3, each primed with '<s>' (sample's default).
for i in range(10):
  print(sample(net3))

<s> she had not the pleasure which had happened , to her mother and elizabeth was in a whisper , she could be sure , and she could have relished an hour , or her own feelings was not so ungovernable a match . " </s>
<s> " my dearest jane , " said jane . </s>
<s> she could not be so much , and the others of visiting into her own room . " " i am not afraid , i am afraid i am sure i have been to make him a very good young man of her sisters ; for , when she could be the case ; she had not yet been to the lakes . </s>
<s> " you must not give me the trouble which had done , but i shall be sure to get in her own private , sportive and consequence of his warmest sisters ; for elizabeth 's congratulations was to be in a whisper of her sisters . ' " said elizabeth ; but i was sure , and the pains you may be so much , she was not deceived with the day . " i was not afraid that he was to be a most anxious , and she could have hated you . " said mrs. bennet . ' i have no improper more than i am no

2.7 Perplexity

In [29]:
# Perplexity of net3 on test1. Out-of-vocabulary decisions here use vocab2
# (all corpus tokens), whereas the training data was encoded against the
# GloVe key set.
p = perplexity(test1,net3,vocab2)
print('The perplexity for part 2.7 is', p)

The perplexity for part 2.7 is 206.99343980793225


In [30]:
# Tokenize the tweet corpus on whitespace and build its vocabulary.
# presumably tweet.txt is already lowercased, tokenized and wrapped in
# '<s>' ... '</s>' markers -- TODO confirm against the data file
tweets = [line.split() for line in tweet]
tdics = Counter()
# Count tweet by tweet; flattening first with `sum(tweets, [])` re-copies the
# growing list on every addition and is quadratic in the corpus size.
for toks in tweets:
  tdics.update(toks)
# keep tokens seen more than once, most frequent first, behind the reserved symbols
vocab3 = {k:v for k,v in tdics.items() if v>1}
vocab3 = sorted(vocab3,key=vocab3.get, reverse=True)
vocab3 = ['_PAD','_UNK']+vocab3
# rebinds the module-level mappings that predict() reads
word2idx = {o:i for i,o in enumerate(vocab3)}
idx2word = {i:o for i,o in enumerate(vocab3)}

In [31]:
# Experiment 4: tweet corpus, default window length (seq_len=5), embeddings
# learned from scratch. Overwrites the module-level x_int / y_int.
x_int, y_int = in_out_data(tweets,word2idx,vocab3)
net4 = WordLSTM(vocab3)
# requires a CUDA device
net4.cuda()
print(net4)
train(net4, batch_size = 32, epochs=20, print_every=256)

WordLSTM(
  (emb_layer): Embedding(3997, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=256, out_features=3997, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [32]:
# Persist only the learned parameters (state_dict) of net4 to Google Drive.
torch.save(net4.state_dict(),'/content/drive/MyDrive/Colab Notebooks/net4')

2.8 Output


In [33]:
# Print ten sentences generated by net4, each primed with '<s>' (sample's default).
for i in range(10):
  print(sample(net4))

<s> " i 'm so sad </s>
<s> i m not going back to work today . i miss my _UNK of my _UNK _UNK , but still a cutie ! ! </s>
<s> _UNK thanks for my house </s>
<s> " i hate the hills , but i have to get to sleep ! i m _UNK . _UNK _UNK . </s>
<s> _UNK thanks to _UNK . i m so tired and it is _UNK _UNK _UNK . " </s>
<s> i have a headache . " i 'm sorry . please keep _UNK in _UNK _UNK . </s>
<s> i have to be _UNK _UNK , i have to sleep in my house , and _UNK i 'm going a great arvo ! i have a _UNK _UNK _UNK _UNK , but it was like _UNK _UNK , i 'm going a good idea ! ! i m _UNK , but still going away to be _UNK , i have a _UNK _UNK . " _UNK , _UNK , etc , i 'm not feeling a bit of the last day of my life ! i m so tired . " i m _UNK . _UNK _UNK , etc , i 'm so hungry . </s>
<s> " </s>
<s> _UNK i do nt want a new background for my _UNK . " _UNK _UNK , _UNK , _UNK i 'm not going to get _UNK _UNK . _UNK _UNK . </s>
<s> " _UNK _UNK _UNK . i m sorry _UNK _UNK . </s>


2.9 Perplexity

In [34]:
# Perplexity of net4 on test2, scored with the tweet vocabulary vocab3.
p = perplexity(test2,net4,vocab3)
print('The perplexity for part 2.9 is', p)

The perplexity for part 2.9 is 196.16600697827522


In [35]:
# Experiment 5: tweet corpus with windows of 15 tokens, same vocabulary as net4.
# Overwrites the module-level x_int / y_int.
x_int, y_int = in_out_data(tweets,word2idx,vocab3,seq_len=15)
net5 = WordLSTM(vocab3)
# requires a CUDA device
net5.cuda()
print(net5)
train(net5, batch_size = 32, epochs=20, print_every=256)

WordLSTM(
  (emb_layer): Embedding(3997, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=256, out_features=3997, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [36]:
# Persist only the learned parameters (state_dict) of net5 to Google Drive.
torch.save(net5.state_dict(),'/content/drive/MyDrive/Colab Notebooks/net5')

2.10 Output

In [37]:
# Print ten sentences generated by net5, each primed with '<s>' (sample's default).
for i in range(10):
  print(sample(net5))

<s> _UNK _UNK i m a little bit at work at _UNK and _UNK . i 'm going to london my driver to go insane . </s>
<s> " _UNK _UNK i 'm so hungry . </s>
<s> " i 'm so hungry but i m a _UNK , but do n't wanna eat more ... but i 'm staying to sleep early . i 'm not a _UNK , and now it is not 1 - the _UNK _UNK , but i 'm staying to see my lakers . just _UNK _UNK _UNK . </s>
<s> i have to be at the office . </s>
<s> i 'm not a _UNK </s>
<s> _UNK _UNK i have _UNK _UNK </s>
<s> _UNK it was n't _UNK </s>
<s> _UNK it is _UNK </s>
<s> i m a pro day at _UNK _UNK </s>
<s> i m going 2 bed again ! ! ! i m not _UNK ! ! </s>


2.11 Perplexity

In [38]:
# Perplexity of net5 (seq_len=15) on test2, scored with vocab3.
p = perplexity(test2,net5,vocab3)
print('The perplexity for part 2.11 is', p)

The perplexity for part 2.11 is 213.8682215870541


2.12 Perplexity

In [39]:
# Perplexity of the tweet-trained net4 on test1, scored with vocab3.
p = perplexity(test1,net4,vocab3)
print('The perplexity for part 2.12 is', p)

The perplexity for part 2.12 is 349.493321909592


In [40]:
# Vocabulary for the GloVe-initialised tweet model: every token counted in
# `tdics` (no frequency cut-off) behind the two reserved symbols.
vocab4 = ['_PAD', '_UNK', *tdics.keys()]
word2idx = dict(zip(vocab4, range(len(vocab4))))
idx2word = dict(enumerate(vocab4))
# one 100-d row per vocabulary entry
weights_matrix = np.zeros((len(vocab4), 100))
for row, token in enumerate(vocab4):
    if token in glove:
        weights_matrix[row] = glove[token]
    else:
        # no pretrained vector: draw a random initial embedding
        weights_matrix[row] = np.random.normal(scale=0.6, size=(100))

In [41]:
# Experiment 6: tweet corpus with GloVe-initialised embeddings.
# Tokens are encoded against the GloVe key set, so tweet tokens without a
# GloVe vector become '_UNK' during training.
x_int,y_int = in_out_data(tweets,word2idx,glove.keys())
net6 = WordLSTM(vocab4 ,pretrain = weights_matrix)
# requires a CUDA device
net6.cuda()
print(net6)
train(net6, batch_size = 32, epochs=20, print_every=256)

WordLSTM(
  (emb_layer): Embedding(12632, 100)
  (lstm): LSTM(100, 256, num_layers=4, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=256, out_features=12632, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [42]:
# Persist only the learned parameters (state_dict) of net6 to Google Drive.
torch.save(net6.state_dict(),'/content/drive/MyDrive/Colab Notebooks/net6')

2.13 Output

In [43]:
# Print ten sentences generated by net6, each primed with '<s>' (sample's default).
for i in range(10):
  print(sample(net6))

<s> i have n't seen it to sleep ! </s>
<s> i m going back back to school today </s>
<s> " amp i have a headache . </s>
<s> " _UNK _UNK ; _UNK i have n't been a bit day , but it is the only of my soul one . </s>
<s> " _UNK it is a bit day and _UNK . i m so hot ! </s>
<s> " i m so bummed i 'm not sure i have to go back back home , i ca nt sleep , i ca n't get to go out . but i ca nt sleep , but i 'm going a lot cleanser and locked . but i do not get to see my beaty - sort & " </s>
<s> " amp _UNK _UNK - no one of me _UNK i have n't seen it . " i 'm so bored ! i have to go to work . </s>
<s> _UNK _ _ _UNK ) ) ) ) ) _UNK ; i have n't seen my _UNK ; _UNK _UNK _UNK i m not going out in the morning . " _UNK well , i 'm so bored ! i m so sad ! </s>
<s> i have to be a long note _UNK i 'm not going to sleep , and its not feeling going to sleep . </s>
<s> " i m going a _UNK i have a headache , but still a lot of the life . " i m so sad ! ! i 'm so tired , but it 's not good ! i 'm a closet , but i

2.14 Perplexity

In [44]:
# Perplexity of net6 on test2. Out-of-vocabulary decisions here use vocab4
# (all tweet tokens), whereas the training data was encoded against the
# GloVe key set.
p = perplexity(test2,net6,vocab4)
print('The perplexity for part 2.14 is', p)

The perplexity for part 2.14 is 475.52951618365313


In [45]:
# words = []
# idx = 0
# word2idx = {}
# vectors = bcolz.carray(np.zeros(1), rootdir=f'/content/drive/MyDrive/Colab Notebooks/27B.100.dat', mode='w')

# with open(f'/content/drive/MyDrive/Colab Notebooks/glove.twitter.27B.100d.txt', 'rb') as f:
#     for l in f:
#       line = l.decode().split()
#       if idx == 38522:
#         word = l.decode().split(' ')[0]
#         vect = np.array(line).astype(np.float)
#       else:
#         word = line[0]
#         vect = np.array(line[1:]).astype(np.float)
#       words.append(word)
#       word2idx[word] = idx
#       idx += 1
#       vectors.append(vect)
    
# vectors = bcolz.carray(vectors[1:].reshape((idx, 100)), rootdir=f'/content/drive/MyDrive/Colab Notebooks/27B.100.dat', mode='w')
# vectors.flush()
# pickle.dump(words, open(f'/content/drive/MyDrive/Colab Notebooks/27B.100_words.pkl', 'wb'))
# pickle.dump(word2idx, open(f'/content/drive/MyDrive/Colab Notebooks/27B.100_idx.pkl', 'wb'))

In [46]:
# Load the cached GloVe Twitter 27B 100-d vectors together with their word
# list and word -> row index (the cache files are the ones written by the
# commented-out cell above).
# NOTE: `word2idx` is temporarily rebound to the GloVe index here; the model
# vocabulary mapping has to be rebuilt before predict()/sample() are used again.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# caches you created yourself.
vectors = bcolz.open('/content/drive/MyDrive/Colab Notebooks/27B.100.dat')[:]
with open('/content/drive/MyDrive/Colab Notebooks/27B.100_words.pkl', 'rb') as fh:
    words = pickle.load(fh)
with open('/content/drive/MyDrive/Colab Notebooks/27B.100_idx.pkl', 'rb') as fh:
    word2idx = pickle.load(fh)

# word -> embedding vector
twitterglove = {w: vectors[word2idx[w]] for w in words}

In [47]:
# Vocabulary for the Twitter-GloVe-initialised model: every token counted in
# `tdics` (no frequency cut-off) behind the two reserved symbols.
vocab5 = ['_PAD', '_UNK', *tdics.keys()]
word2idx = dict(zip(vocab5, range(len(vocab5))))
idx2word = dict(enumerate(vocab5))
matrix_len = len(vocab5)
# one 100-d row per vocabulary entry
weights_matrix = np.zeros((matrix_len, 100))
words_found = 0   # reset here; this cell does not increment it

for row, token in enumerate(vocab5):
    if token in twitterglove:
        weights_matrix[row] = twitterglove[token]
    else:
        # no pretrained vector: draw a random initial embedding
        weights_matrix[row] = np.random.normal(scale=0.6, size=(100))

In [48]:
# Experiment 7: tweet corpus with Twitter-GloVe-initialised embeddings.
# Tokens are encoded against the Twitter GloVe key set, so tweet tokens
# without a pretrained vector become '_UNK' during training.
x_int,y_int = in_out_data(tweets,word2idx,twitterglove.keys())
net7 = WordLSTM(vocab5 ,pretrain = weights_matrix)
# requires a CUDA device
net7.cuda()
print(net7)
train(net7, batch_size = 32, epochs=20, print_every=256)

WordLSTM(
  (emb_layer): Embedding(12632, 100)
  (lstm): LSTM(100, 256, num_layers=4, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=256, out_features=12632, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [49]:
# Persist only the learned parameters (state_dict) of net7 to Google Drive.
torch.save(net7.state_dict(),'/content/drive/MyDrive/Colab Notebooks/net7')

2.15 Output

In [50]:
# Print ten sentences generated by net7, each primed with '<s>' (sample's default).
for i in range(10):
  print(sample(net7))

<s> i have n't been sleeping in my house . i 'm not so hungry . </s>
<s> _UNK _UNK _UNK " </s>
<s> " </s>
<s> i have a doctors _UNK . i 'm not feeling _UNK , i have to get a _UNK _UNK i m sad . " _UNK hey nathan . </s>
<s> _UNK i have n't been a long day _UNK </s>
<s> i m not _UNK _UNK _UNK _UNK i m sad . i m so sad . i 'm going _UNK i 'm not looking forward on a bunch of my house . " i 'm going to sleep tomorrow . </s>
<s> i have to be _UNK % over , but still _UNK _UNK i have n't seen the hills . </s>
<s> _UNK i m sorry for a great note _UNK i 'm sorry i 'm not a closet drf bar in my morning . i m broke to do resin - asti </s>
<s> " _UNK _UNK , i 'm not feeling _UNK % _UNK _UNK _UNK i 'm going on my state , i 'm gon na be _UNK i have to be _UNK % _UNK , _UNK i m stacey is a lot ! </s>
<s> _UNK i m sorry for the life , and _UNK , i 'm sorry i have n't quit a lot of the _UNK _UNK ; _UNK i have n't seen my nokia deflating , and it 's not a long note , i have to do wid ! ! ! ! </s>


2.16 Perplexity

In [51]:
# Perplexity of net7 on test2. Out-of-vocabulary decisions here use vocab5
# (all tweet tokens), whereas the training data was encoded against the
# Twitter GloVe key set.
p = perplexity(test2,net7,vocab5)
print('The perplexity for part 2.16 is', p)

The perplexity for part 2.16 is 585.9613341401835
