In [81]:
import re
import pickle
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [82]:
# Load the pickled plots (assumed to be a list of plot strings -- TODO confirm).
# The context manager closes the file handle even when unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
with open("./data/plots_text.pickle", "rb") as pickle_in:
    movie_plots = pickle.load(pickle_in)

# Strip every character that is not a lowercase letter, an apostrophe or a space.
movie_plots = [re.sub("[^a-z' ]", "", i) for i in movie_plots]
random.sample(movie_plots, 5)

["aloyisius t mckeever  a new york city hobo makes his home in a boardedup fifth avenue mansion using the back door while its owner multimillionaire  michael j o'connor  winters in the south mckeever winds up taking in homeless exgi jim bullock  who has been evicted from an apartment building o'connor is tearing down for a new skyscraper and runaway yearold trudy smith  who unknown to him is o'connor's daughter soon jim invites war buddies whitey  hank  and their families to share the vast mansion while they seek permanent homes of their own trudy falls in love with jim and when her father demands to meet him convinces o'connor to also take up residence pretending to be the panhandler mike to win jim's love without the temptation of marrying her for her wealth mckeever allows mike to move in but treats him as a servant when mike warns trudy that he intends to have them all arrested for criminal trespass she persuades her mother mary  to fly up from florida and pretend to be the th inte

In [83]:
def create_seq(text, seq_len=5):
    """Slide a window of ``seq_len + 1`` words over ``text``.

    Args:
        text (str): whitespace-separated text.
        seq_len (int): number of context words per window; every window
            holds ``seq_len + 1`` words, joined by single spaces.

    Returns:
        list[str]: one string per window position. When ``text`` has
        ``seq_len`` words or fewer, the original ``text`` is returned
        unchanged as the only element (so it may be shorter than a full
        window).
    """
    # Tokenise once up front so the loop does not re-split the text on every step.
    words = text.split()

    if len(words) <= seq_len:
        return [text]

    return [
        " ".join(words[end - seq_len: end + 1])
        for end in range(seq_len, len(words))
    ]

# Build the word windows for every plot and flatten them into a single list.
# The nested comprehension is linear in the number of windows, whereas
# sum(list_of_lists, []) copies the accumulated list on every addition.
seqs = [seq for plot in movie_plots for seq in create_seq(plot)]
print("dataset length: ",len(seqs))

dataset length:  152644


In [84]:
# Inputs are each window without its last word; targets are the same window
# without its first word, i.e. shifted one position to the left.
x = [" ".join(s.split()[:-1]) for s in seqs]
y = [" ".join(s.split()[1:]) for s in seqs]

In [85]:
# Collect every distinct word that appears in the cleaned plots.
vocab = set(" ".join(movie_plots).split())

# Assign ids in sorted order. Iterating the bare set can yield a different
# order in each Python process (string hashes are randomised), which would
# silently change the token ids between kernel restarts.
int2token = dict(enumerate(sorted(vocab)))

token2int = {t: i for i, t in int2token.items()}
vocab_size = len(vocab)
int2token[token2int['private']]

'private'

In [86]:
def encode(string):
    """Map a whitespace-separated string to its list of ids from ``token2int``."""
    return [token2int[w] for w in string.split()]


# Integer-encode every input / target string and convert the results to arrays.
x_int = np.array(list(map(encode, x)))
y_int = np.array(list(map(encode, y)))

def get_batches(x, y, batch_size):
    """Yield consecutive, aligned ``(x, y)`` batches of exactly ``batch_size`` rows.

    Args:
        x: 2-D array indexed as ``x[rows, :]``.
        y: 2-D array with the same number of rows as ``x``.
        batch_size (int): number of rows per batch.

    Yields:
        tuple: ``(x[start:stop, :], y[start:stop, :]`` slices. A trailing
        partial batch is dropped so every batch has the same size.
    """
    n_rows = x.shape[0]
    # The "+ 1" keeps the final batch when n_rows is an exact multiple of
    # batch_size; without it that last full batch would be skipped.
    for start in range(0, n_rows - batch_size + 1, batch_size):
        stop = start + batch_size
        yield x[start:stop, :], y[start:stop, :]
        

In [94]:

class WordLSTM(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> dropout -> linear.

    Args:
        n_hidden (int): size of each LSTM hidden state.
        n_layers (int): number of stacked LSTM layers.
        drop_prob (float): dropout probability, used both between LSTM layers
            and on the LSTM output.
        lr (float): stored as ``self.lr``; not used inside the class itself.
        n_vocab (int | None): number of distinct tokens. When ``None`` the
            module-level ``vocab_size`` is used.
        emb_dim (int): width of the token embeddings.
    """

    def __init__(self, n_hidden=256, n_layers=4, drop_prob=0.3, lr=0.001,
                 n_vocab=None, emb_dim=200):
        super().__init__()

        # Fall back to the notebook-level vocabulary size when none is given.
        if n_vocab is None:
            n_vocab = vocab_size

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        self.emb_layer = nn.Embedding(n_vocab, emb_dim)
        self.lstm = nn.LSTM(emb_dim, n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.drop_out = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, n_vocab)

    def forward(self, x, hidden):
        """Run one forward pass.

        Args:
            x (torch.Tensor): integer token ids, batch dimension first
                (the LSTM is built with ``batch_first=True``).
            hidden (tuple[torch.Tensor, torch.Tensor]): LSTM ``(h, c)`` state,
                each shaped like the tensors returned by ``init_hidden``.

        Returns:
            tuple: ``(out, hidden)`` where ``out`` holds one row of vocabulary
            scores per input position (batch and time are flattened together)
            and ``hidden`` is the updated LSTM state.
        """
        embedded = self.emb_layer(x)
        lstm_output, hidden = self.lstm(embedded, hidden)
        out = self.drop_out(lstm_output)
        # Flatten batch and time so the linear layer sees one hidden vector per row.
        out = out.reshape(-1, self.n_hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h, c)`` LSTM state for ``batch_size`` sequences.

        The tensors are created with the dtype and device of the model's own
        parameters, so the state always lives where the model lives.
        """
        weight = next(self.parameters()).detach()
        shape = (self.n_layers, batch_size, self.n_hidden)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [95]:
# Build the model with its default hyperparameters and place it on the
# selected device (Module.to returns the module itself).
net = WordLSTM().to(device)
print(net)

WordLSTM(
  (emb_layer): Embedding(16592, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (drop_out): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=16592, bias=True)
)


In [112]:
def train(net, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    """Train ``net`` on the module-level ``x_int`` / ``y_int`` arrays.

    Batches come from ``get_batches``; the mean cross-entropy loss of each
    epoch is printed once the epoch finishes.

    Args:
        net: model exposing ``init_hidden(batch_size)`` and returning
            ``(output, hidden)`` when called with ``(inputs, hidden)``.
        epochs (int): number of passes over the data.
        batch_size (int): rows per batch; also the batch size of the hidden state.
        lr (float): learning rate for the Adam optimiser.
        clip (float): maximum gradient norm passed to ``clip_grad_norm_``.
        print_every (int): accepted for backward compatibility; currently
            unused because the loss is reported once per epoch.
    """
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    net.to(device)
    net.train()

    for e in range(epochs):
        # Per-epoch accumulators for the mean loss.
        total_loss = 0.0
        counter = 0
        hidden = net.init_hidden(batch_size)

        for x, y in get_batches(x_int, y_int, batch_size):
            counter += 1

            inputs = torch.from_numpy(x).to(device)
            targets = torch.from_numpy(y).to(device)

            # Detach the carried-over state so backprop stops at the batch
            # boundary instead of reaching into earlier batches.
            hidden = tuple(each.detach() for each in hidden)

            net.zero_grad()

            output, hidden = net(inputs, hidden)
            loss = criterion(output, targets.view(-1).long())
            total_loss += loss.item()
            loss.backward()

            # Clip before the optimiser step to guard against exploding gradients.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

        # No full batch (dataset smaller than batch_size) would otherwise
        # raise ZeroDivisionError; report NaN instead.
        mean_loss = total_loss / counter if counter else float("nan")
        print("Epoch: {}/ {} ".format(e+1, epochs), "Loss: {:.4f}".format(mean_loss))

In [113]:
# Train with train()'s defaults: 10 epochs, batch size 32, lr 0.001, gradient clip 1.
train(net)

Epoch: 1/ 10  Loss: 7.0018
Epoch: 2/ 10  Loss: 6.2330
Epoch: 3/ 10  Loss: 5.7888
Epoch: 4/ 10  Loss: 5.4962
Epoch: 5/ 10  Loss: 5.2948
Epoch: 6/ 10  Loss: 5.1359
Epoch: 7/ 10  Loss: 5.0043
Epoch: 8/ 10  Loss: 4.8875
Epoch: 9/ 10  Loss: 4.7953
Epoch: 10/ 10  Loss: 4.7203


In [132]:
def predict(net, tkn, h=None):
    """Feed one token through ``net`` and pick the next token from its top 3.

    Args:
        net: model exposing ``init_hidden(batch_size)`` and returning
            ``(output, hidden)`` when called with ``(inputs, hidden)``.
        tkn (str): input token; must be a key of ``token2int``.
        h: hidden state to continue from. When ``None`` a fresh zero state
            for a batch of one is created via ``net.init_hidden(1)``.

    Returns:
        tuple: ``(next_token, hidden)`` -- the sampled token string and the
        updated hidden state.

    Raises:
        KeyError: if ``tkn`` is not in ``token2int``.
    """
    # The signature advertises h=None, so make that default actually usable.
    if h is None:
        h = net.init_hidden(1)

    x = np.array([[token2int[tkn]]])
    inputs = torch.from_numpy(x).to(device)

    h = tuple(each.detach() for each in h)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        out, h = net(inputs, h)
        p = F.softmax(out, dim=1)

    p = p.cpu().numpy()
    p = p.reshape(p.shape[1],)

    # Indices of the (up to) three most probable tokens, most probable first.
    top_n_idx = p.argsort()[-3:][::-1]

    # Pick uniformly among the candidates; choosing from the list itself also
    # works when the vocabulary has fewer than three tokens.
    sampled_token_index = random.choice(top_n_idx.tolist())

    # Return the predicted token string and the hidden state.
    return int2token[sampled_token_index], h

def sample(net, size, prime=''):
    """Generate text by repeatedly sampling the next token from ``net``.

    Args:
        net: trained model accepted by ``predict``.
        size (int): number of tokens to generate after the prime; at least
            one token is always generated.
        prime (str): whitespace-separated seed text; must contain at least
            one token, and every token must be in the vocabulary.

    Returns:
        str: the prime tokens followed by the generated tokens, joined by spaces.

    Raises:
        ValueError: if ``prime`` contains no tokens.
    """
    toks = prime.split()
    if not toks:
        # Without a prime there is no token to feed the model first.
        raise ValueError("prime must contain at least one token")

    # Use the notebook-wide device instead of forcing CUDA, so sampling also
    # works on CPU-only machines.
    net.to(device)
    net.eval()

    # Batch size is 1.
    h = net.init_hidden(1)

    # Warm up the hidden state on the prime; only the prediction that follows
    # the last prime token is kept.
    for t in toks:
        token, h = predict(net, t, h)

    toks.append(token)

    # Predict the subsequent tokens, feeding each prediction back in.
    for _ in range(size - 1):
        token, h = predict(net, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

In [144]:
# Generate 20 tokens continuing from the prime word 'i'.
sample(net, 20, 'i')

'i not be a vampire but is a cheat but she is aware of his grade and the us monster and'

: 