In [None]:
# Game of Thrones NLG Model

In [1]:
# install java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

# unzip the spark file to the current folder
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"


# install findspark using pip
!pip install -q findspark

In [2]:
import pickle
import re
import numpy as np
import torch.nn as nn
import torch
import torch.nn.functional as F
import random

In [4]:
# read pickle file
pickle_in = open("GameOfThrones_list_2","rb")
movie_plots = pickle.load(pickle_in)

# count of movie plot summaries
len(movie_plots)

1

In [5]:
movie_plots



In [6]:
movie_plots[0]



In [7]:
def get_fixed_sequence(text, seq_len = 5):
  sequences = []
  words = text.split()
  if len(words) > seq_len:
    for i in range(seq_len, len(words)):
      seq_list = words[i-seq_len: i]
      sequences.append(" ".join(seq_list))
  else:
    sequences = words
  return sequences

In [8]:
seqs = [get_fixed_sequence(plot) for plot in movie_plots]

In [9]:
len(seqs)

1

In [10]:
seqs = sum(seqs, [])

In [11]:
seqs[1]

'comets tail spread across the'

In [12]:
x = []
y = []
for seq in seqs:
  words = seq.split()
  x.append(" ".join(words[:-1]))
  y.append(" ".join(words[1:]))

In [13]:
x[0], y[0]

('The comets tail spread', 'comets tail spread across')

In [14]:
# create integer-to-token mapping
int2token = {}
cnt = 0

for w in set(" ".join(movie_plots).split()):
  int2token[cnt] = w
  cnt+= 1

# create token-to-integer mapping
token2int = {t: i for i, t in int2token.items()}

token2int["the"], int2token[71]

(2216, 'swordshort')

In [15]:
# token2int["Murderer"], int2token[71]

In [None]:
token2int

In [16]:
# set vocabulary size
vocab_size = len(int2token)
vocab_size

16043

In [17]:
def get_integer_seq(seq):
  return [token2int[w] for w in seq.split()]

# convert text sequences to integer sequences
x_int = [get_integer_seq(i) for i in x]
y_int = [get_integer_seq(i) for i in y]

# convert lists to numpy arrays
x_int = np.array(x_int)
y_int = np.array(y_int)

In [18]:
x_int[0]

array([ 6891, 14597,  9107, 14294])

In [19]:
def get_batches(arr_x, arr_y, batch_size):
  prev = 0
  for n in range(batch_size, arr_x.shape[0], batch_size):
    x = arr_x[prev:n]
    y = arr_y[prev:n]
    prev = n
    yield x,y

In [20]:
class WordLSTM(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=4, drop_prob=0.3, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding(vocab_size, 200)

        ## define the LSTM
        self.lstm = nn.LSTM(200, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''

        ## pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        ## pass through a dropout layer
        out = self.dropout(lstm_output)
        
        #out = out.contiguous().view(-1, self.n_hidden) 
        out = out.reshape(-1, self.n_hidden) 

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        # if GPU is available
        if (torch.cuda.is_available()):
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        
        # if GPU is not available
        else:
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

In [21]:
# instantiate the model
net = WordLSTM()

# push the model to GPU (avoid it if you are not using the GPU)
net.cuda()

print(net)


WordLSTM(
  (emb_layer): Embedding(16043, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=16043, bias=True)
)


In [22]:
# Initialize optimizer
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Print model's state_dict
print("Model's state_dict:")
for param_tensor in net.state_dict():
    print(param_tensor, "\t", net.state_dict()[param_tensor].size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

Model's state_dict:
emb_layer.weight 	 torch.Size([16043, 200])
lstm.weight_ih_l0 	 torch.Size([1024, 200])
lstm.weight_hh_l0 	 torch.Size([1024, 256])
lstm.bias_ih_l0 	 torch.Size([1024])
lstm.bias_hh_l0 	 torch.Size([1024])
lstm.weight_ih_l1 	 torch.Size([1024, 256])
lstm.weight_hh_l1 	 torch.Size([1024, 256])
lstm.bias_ih_l1 	 torch.Size([1024])
lstm.bias_hh_l1 	 torch.Size([1024])
lstm.weight_ih_l2 	 torch.Size([1024, 256])
lstm.weight_hh_l2 	 torch.Size([1024, 256])
lstm.bias_ih_l2 	 torch.Size([1024])
lstm.bias_hh_l2 	 torch.Size([1024])
lstm.weight_ih_l3 	 torch.Size([1024, 256])
lstm.weight_hh_l3 	 torch.Size([1024, 256])
lstm.bias_ih_l3 	 torch.Size([1024])
lstm.bias_hh_l3 	 torch.Size([1024])
fc.weight 	 torch.Size([16043, 256])
fc.bias 	 torch.Size([16043])
Optimizer's state_dict:
state 	 {}
param_groups 	 [{'lr': 0.001, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]}]


In [23]:
def train(net, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    
    # optimizer
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    
    # loss
    criterion = nn.CrossEntropyLoss()
    
    # push model to GPU
    net.cuda()
    
    counter = 0

    net.train()

    for e in range(epochs):

        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(x_int, y_int, batch_size):
            counter+= 1
            
            # convert numpy arrays to PyTorch arrays
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            # push tensors to GPU
            inputs, targets = inputs.cuda(), targets.cuda()

            # detach hidden states
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(-1))

            # back-propagate error
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)

            # update weigths
            opt.step()            
            
            if counter % print_every == 0:
            
              print("Epoch: {}/{}...".format(e+1, epochs),
                    "Step: {}...".format(counter))

In [24]:
train(net, batch_size = 32, epochs=20, print_every=256)

Epoch: 1/20... Step: 256...
Epoch: 1/20... Step: 512...
Epoch: 1/20... Step: 768...
Epoch: 1/20... Step: 1024...
Epoch: 1/20... Step: 1280...
Epoch: 1/20... Step: 1536...
Epoch: 1/20... Step: 1792...
Epoch: 1/20... Step: 2048...
Epoch: 1/20... Step: 2304...
Epoch: 1/20... Step: 2560...
Epoch: 1/20... Step: 2816...
Epoch: 1/20... Step: 3072...
Epoch: 1/20... Step: 3328...
Epoch: 1/20... Step: 3584...
Epoch: 1/20... Step: 3840...
Epoch: 1/20... Step: 4096...
Epoch: 1/20... Step: 4352...
Epoch: 1/20... Step: 4608...
Epoch: 1/20... Step: 4864...
Epoch: 1/20... Step: 5120...
Epoch: 1/20... Step: 5376...
Epoch: 1/20... Step: 5632...
Epoch: 1/20... Step: 5888...
Epoch: 1/20... Step: 6144...
Epoch: 1/20... Step: 6400...
Epoch: 1/20... Step: 6656...
Epoch: 1/20... Step: 6912...
Epoch: 1/20... Step: 7168...
Epoch: 1/20... Step: 7424...
Epoch: 1/20... Step: 7680...
Epoch: 1/20... Step: 7936...
Epoch: 1/20... Step: 8192...
Epoch: 1/20... Step: 8448...
Epoch: 1/20... Step: 8704...
Epoch: 1/20... St

In [25]:
def predict(net, tkn, h=None):
         
  # tensor inputs
  x = np.array([[token2int[tkn]]])
  inputs = torch.from_numpy(x)
  
  # push to GPU
  inputs = inputs.cuda()

  # detach hidden state from history
  h = tuple([each.data for each in h])

  # get the output of the model
  out, h = net(inputs, h)

  # get the token probabilities
  p = F.softmax(out, dim=1).data

  p = p.cpu()

  p = p.numpy()
  p = p.reshape(p.shape[1],)

  # get indices of top 3 values
  top_n_idx = p.argsort()[-3:][::-1]

  # randomly select one of the three indices
  sampled_token_index = top_n_idx[random.sample([0,1,2],1)[0]]

  # return the encoded value of the predicted char and the hidden state
  return int2token[sampled_token_index], h



In [28]:

# function to generate text
def sample(net, size, prime='The'):
        
    # push to GPU
    net.cuda()
    
    net.eval()

    # batch size is 1
    h = net.init_hidden(1)

    toks = prime.split()

    # predict next token
    for t in prime.split():
      token, h = predict(net, t, h)
    
    toks.append(token)

    # predict subsequent tokens
    for i in range(size-1):
        token, h = predict(net, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

In [29]:
sample(net, 10)

'The queen had been taken a boy of thirteen and a'

In [None]:
sample(net, 5, prime = "they")

'they are a man to the'

In [30]:
# Specify a path
PATH = "GTO_model.pt"

# Save the model
torch.save(net, PATH)

In [None]:

# # save the model to disk
# # filename = 'top30_genre_model.pickle'
# filename = 'GOT_model.pickle'
# pickle.dump(net, open(filename, 'wb'))


In [None]:
# **** RANDOM WORDS ****

In [None]:
# ***** Remove common words from word bank *****

In [None]:
# read pickle file - most common english words
pickle_in = open("common_words.pkl","rb")
common_words = pickle.load(pickle_in)

# count of movie plot summaries
len(common_words)

3000

In [None]:
common_words

In [None]:
from collections import Counter

In [None]:
# Create list of all the words in the string
movie_list = movie_plots[0].split()

In [None]:
movie_list

In [None]:
# new_movie_list = filter(lambda i: i not in common_words, movie_list)

In [None]:
new_movie_list = [i for i in movie_list if i not in common_words]

In [None]:
new_movie_list

In [None]:
# Get the count of each word.
word_count = Counter(new_movie_list)

# Use most_common() method from Counter subclass
print(word_count.most_common(20))

[('was', 3930), ('had', 2821), ('The', 2576), ('He', 1668), ('said', 1454), ('is', 1414), ('were', 1209), ('Ser', 973), ('Lord', 945), ('She', 826), ('are', 757), ('A', 745), ('been', 688), ('It', 672), ('You', 653), ('did', 648), ('men', 645), ('Tyrion', 633), ('And', 582), ('They', 526)]


In [None]:
# **** Option 1: Get random words for word bank without removing common English words ****
random_tokens = random.sample(list(token2int), 50)
random_tokens

['displeasure',
 'RHYMER',
 'Gasps',
 'punished',
 'tremulous',
 'balled',
 'watered',
 'Graces',
 'aft',
 'afaronce',
 'Throw',
 'strengthened',
 'bridegrooms',
 'villainy',
 'braid',
 'likeness',
 'foamed',
 'childrens',
 'Musicians',
 'protects',
 'Shaggys',
 'pirates',
 'goodwill',
 'bloodiest',
 'housed',
 'leeches',
 'results',
 'sausage',
 'Below',
 'dish',
 'whiteeyed',
 'commands',
 'escape',
 'accents',
 'fireflies',
 'gutchurning',
 'closer',
 'aswarm',
 'leavings',
 'Always',
 'dissemble',
 'entry',
 'whoosh',
 'Towers',
 'measure',
 'follower',
 'Choose',
 'Boros',
 'throng',
 'ser']

In [None]:
# *** Option2: Get random words from list of words that doesn't include most common English words ***

In [None]:
new_movie_list2 = [i for i in token2int if i not in common_words]

In [None]:
# Get random words for word bank
random_tokens2 = random.sample(list(new_movie_list2), 50)
random_tokens2

['litter',
 'monsters',
 'Understanding',
 'byrnie',
 'wines',
 'Eye',
 'Vermillion',
 'swordfight',
 'Eleven',
 'targets',
 'Camps',
 'Qalen',
 'cartwheel',
 'poacher',
 'rote',
 'worshipers',
 'revenues',
 'DONELLA',
 'invincible',
 'throbbing',
 'complaints',
 'older',
 'concur',
 'assuredly',
 'tattered',
 'envelop',
 'feeble',
 'lesser',
 'Roots',
 'Todrics',
 'elm',
 'inks',
 'accustomed',
 'BRUNE',
 'dancing',
 'whitefish',
 'merwives',
 'vassals',
 'knots',
 'squalid',
 'IN',
 'Artos',
 'fettered',
 'quiescent',
 'drifted',
 'echoing',
 'baggage',
 'gibbering',
 'beguile',
 'Fewer']