In [1]:
import pandas as pd
import numpy as np
import nltk.tokenize
import itertools
import datetime
import torch

from pathlib import Path

from fastai import *
from fastai.text import *

from copy import copy, deepcopy
from enum import Enum

# Deep Lyrics Generator - ULMFiT

## Set up instructions

### Create VM Instance

- Go to cloud.google.com, and create a new VM instance
- Disk size: 100GB or more
- CPUs + Memory: 2vCPUs, 7.5 GB Memory
- GPU: K80 (cheaper, less power) or P100 (2.5x more expensive, more power)
- Enable http, https traffic
- Boot: Deep learning pytorch instance

### Network configuration

In Google cloud platform:

- Go to Networking -> VPC Network, External IP addresses
- Select your VM instance and change the external address type from Ephemeral to Static
- Go to Networking -> VPC Network, Firewall Rules
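- Add a new rule called Jupyter with IP ranges 0.0.0.0/0, protocols and ports tcp:8888, applied to all targets. Note that 0.0.0.0/0 exposes the port to the whole internet; restrict the range to your own IP address if you can.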

### VM + Jupyter Setup

- SSH to VM
- Clone the GitHub repo
- Run src/setup.sh
- Run jupyter notebook
- Open a google cloud shell
- Run gcloud init and answer the questions
- To set up a tunnel and run jupyter locally, run ```gcloud compute --project "<your project>" ssh --zone "<your zone>" "<your instance name>" -- -L 8888:localhost:8888```
- Open the Jupyter notebook in a browser on your local computer (http://localhost:8888) and have fun

### Notebook first run
Here are some steps to run the first time you use the notebook.

#### Tokens
To create the model's tokens with the correct train-test split, run ```src/data_collection/lm_data_lyrics.py -o path/to/save```. 
We recommend saving in data/models/{MODEL_NAME}. Alternatively, run the magic command below, replacing the model name with your own.

In [2]:
# Create the model's tokens (train/valid split) and save them under the model directory given by -o.
%run ../src/data_collection/lm_data_lyrics.py -o ../data/models/3.1-ULMFiT-108k

Numericalizing train.
Numericalizing valid.


## Load Data

Now that we've created the tokens, let's load them into a `DataBunch` to train our LM further or generate text with a pre-trained LM.

In [3]:
# Name of this model run; it is reused below as the directory name and as the
# prefix of the saved/loaded weight files.
model_name = '3.1-ULMFiT-108k'
MODEL_PATH = Path(f'../data/models/{model_name}')
# parents=True: a fresh checkout may not have ../data/models yet, and a plain
# mkdir would then fail with FileNotFoundError.
MODEL_PATH.mkdir(parents=True, exist_ok=True)

In [4]:
# Build the language-model DataBunch from the tokens stored under MODEL_PATH.
# NOTE(review): bs is presumably the batch size and max_vocab the vocabulary cap — confirm
# against the installed fastai version.
data_lm = TextLMDataBunch.from_tokens(MODEL_PATH,
                                      bs=128,
                                      max_vocab=10000)

# Sanity check: show the vocabulary size of the training dataset.
print(data_lm.train_ds.vocab_size)

10002


## Model setup

In [5]:
# Hardware switch: when False, the weights are restored through `cpu_load`
# (which remaps the storage location) instead of `learn.load`.
GPU = True

In [8]:
# Language-model learner built on the DataBunch, starting from the weights
# referenced by URLs.IMDB.
learn = RNNLearner.language_model(data_lm,
                                  pretrained_model=URLs.IMDB,
                                  drop_mult=0.5)

# The SaveModel callback is intentionally NOT created here: the class is only
# defined in the Training section below, so instantiating it in this cell raised
# NameError on a fresh kernel (Restart & Run All). `save_callback` is created
# right after the class definition, before its first use.

In [9]:
# Toggle: download the published weights before loading them in the cells below.
DOWNLOAD_MODEL_WEIGHTS = True
weights_url = 'https://storage.googleapis.com/w210-capstone/models/ULMFiT_3.0-108k_best.pth'

if DOWNLOAD_MODEL_WEIGHTS:
    # The file is stored as MODEL_PATH/models/<model_name>_best.pth, i.e. renamed
    # from the remote file name to the name used by the load cell.
    Path(MODEL_PATH/'models').mkdir(exist_ok=True)
    # NOTE(review): overwrite=True presumably re-downloads even if the file exists — confirm.
    download_url(weights_url, MODEL_PATH/f'models/{model_name}_best.pth', overwrite=True)

In [10]:
def cpu_load(self, name:PathOrStr):
    """Load the weights `name` from `self.model_dir` into `self.model`, keeping them on the CPU.

    Intended for checkpoints that were trained on a GPU: the fastai `load`
    function does not allow remapping the storage location, so the state dict
    is read here with an explicit `map_location`.
    """
    weights_file = self.path/self.model_dir/f'{name}.pth'
    # Returning the storage unchanged leaves every tensor where it was deserialized.
    keep_storage = lambda storage, loc: storage
    state_dict = torch.load(weights_file, map_location=keep_storage)
    self.model.load_state_dict(state_dict)

# Monkey patch the loader onto RNNLearner so it can be called as `learn.cpu_load(...)`.
RNNLearner.cpu_load = cpu_load

In [11]:
# Restore the best checkpoint; without a GPU the weights need the CPU remapping loader.
if GPU:
    learn.load(f'{model_name}_best')
else:
    learn.cpu_load(f'{model_name}_best')

## Training

In [12]:
# Set to True to fine-tune the model; while False, the fit and loss-plot cells below do nothing.
TRAIN = False

In [13]:
@dataclass
class SaveModel(LearnerCallback):
    """Callback that checkpoints the learner during training.

    After every epoch the first entry of `last_metrics` is treated as the loss:
    its exponential is appended to `self.perplexity`, and whenever it improves on
    `self.best_loss` the learner is saved as '<model_name>_best'. When training
    ends the learner is saved once more as '<model_name>_last'.
    """
    def __init__(self, learn:Learner, model_name='saved_model'):
        super().__init__(learn)
        self.model_name = model_name
        # Timestamp of when the callback was created.
        self.model_date = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        # Lowest loss seen so far; None until the first epoch with metrics has finished.
        self.best_loss = None
        # One perplexity value (exp of the loss) per finished epoch.
        self.perplexity = []

    def on_epoch_end(self, epoch:int, metrics, last_metrics, **kwargs):
        # Without any reported metrics there is nothing to compare; keep training
        # instead of failing on the unpacking below.
        if last_metrics is None or len(last_metrics) == 0:
            return False

        loss, *_ = last_metrics
        perp = np.exp(loss)
        self.perplexity.append(perp)
        # `is None` (not `== None`): the identity test cannot be hijacked by
        # tensor/array losses that overload `==`.
        if self.best_loss is None or loss < self.best_loss:
            self.best_loss = loss
            self.learn.save(f'{self.model_name}_best')
        # Always returns False, as the original did.
        return False

    def on_train_end(self, epoch:int, **kwargs):
        self.learn.save(f'{self.model_name}_last')

In [14]:
# Checkpointing callback; it saves the learner as '<model_name>_best' and '<model_name>_last'.
save_callback = SaveModel(learn, model_name=f'{model_name}')

In [15]:
# First training stage, run before the model is unfrozen. Note that the save
# callback is not passed here, so this stage is not checkpointed.
# NOTE(review): arguments are presumably (epochs, learning rate) — confirm.
if TRAIN:
    learn.fit_one_cycle(1, 1e-2)

In [16]:
# Second training stage: unfreeze the model and train with the checkpointing callback attached.
# NOTE(review): arguments are presumably (epochs, learning rate) — confirm.
if TRAIN:
    learn.unfreeze()
    learn.fit(10, 1e-3, callbacks=[save_callback])

In [17]:
# best_loss is initialised to None and only updated in SaveModel.on_epoch_end,
# so it prints None when no training epoch has run (e.g. TRAIN is False).
# NOTE(review): `learn.save_model` is presumably attached by the LearnerCallback base class — confirm.
print("best validation loss: ", learn.save_model.best_loss)

best validation loss:  None


#### Learning Loss

In [18]:
# Only meaningful after training in this session, hence the TRAIN guard.
if TRAIN:
    learn.recorder.plot_losses()

## Model Load

## Text Generation

In [58]:
def generate_step(learner, context, context_length):
    """Return the model's probability distribution over the next token.

    Parameters
    ----------
    learner : object with a ``model`` attribute
        The model must provide ``reset()`` and ``eval()`` and be callable on a
        LongTensor of shape (sequence length, 1).

    context : sequence of int
        Token indices generated so far. Must not be empty.

    context_length : int
        Number of trailing tokens of ``context`` fed to the model. 0 feeds the
        whole context (``context[-0:]`` is the full sequence).

    Returns
    -------
    numpy.ndarray
        1-D float64 array that sums to 1; entries 0 and 1 are always 0.

    Raises
    ------
    ValueError
        If ``context`` is empty.
    """
    window = list(context[-context_length:])
    if not window:
        raise ValueError('context must contain at least one token index')

    model = learner.model

    # Put the input on the device the model actually lives on instead of relying
    # on a module-level GPU flag that can disagree with the model's placement.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device('cpu')

    # One column, one row per token: shape (len(window), 1).
    batch = torch.tensor(window, dtype=torch.long, device=device).view(-1, 1)

    model.reset()
    model.eval()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # forward pass the "context" into the model
        result, *_ = model(batch)

    # Keep the last row of the first output; clone so the masking below does not
    # write into the model's own output tensor.
    # NOTE(review): assumes that row holds the logits for the final time step — confirm.
    logits = result[-1].detach().clone()

    # set unk and pad to 0 prob, i.e. never pick unknown or pad
    logits[0] = -np.inf
    logits[1] = -np.inf

    # softmax, then renormalize in float64 so the array is a valid input for
    # numpy's sampling functions (np.float was removed from NumPy; use np.float64).
    probabilities = torch.softmax(logits, dim=0).cpu().numpy().astype(np.float64)
    probabilities /= np.sum(probabilities)
    return probabilities

def print_words(sequence):
    """Print the token indices stored in ``sequence[0]`` as text.

    Each index is turned into its token with the validation vocabulary of the
    global ``data_lm``. 'xeol' is printed as a line break, tokens containing
    'xbol' are skipped, and 'xeos' is printed on its own before printing stops.
    Every other token is printed followed by a space.
    """
    for token_id in sequence[0]:
        token = data_lm.valid_ds.vocab.textify([token_id])

        if token == 'xeos':
            # Print the marker itself, then stop.
            print(token)
            return

        if token != 'xeol' and 'xbol' in token:
            # Tokens containing 'xbol' produce no output.
            continue

        print('\n' if token == 'xeol' else token, end=' ')

def generate_text(learner, seed_text=['xbos'], max_len=500, GPU=False, context_length=20, beam_width=5):
    """Generates text with a given learner and prints string to console.

    Decoding is a stochastic beam search: at every step each sequence kept in
    the beam is extended with up to `beam_width` distinct next words sampled
    from the model's distribution, and the `beam_width` extensions with the
    lowest accumulated negative log-probability are kept.

    Parameters
    ----------
    learner : RNNLearner Language Model (RNNLearner.language_model())
        Fastai RNNLearner with tokenized language model data already loaded

    seed_text : list or str
        List of tokens (e.g. ['the', 'cat']), list of token indices, or a string
        that is split on single spaces. Tokens are converted to indices with the
        training vocabulary of the global `data_lm`. The argument is never
        modified.

    max_len : int
        Number of words appended to the seed

    GPU : bool
        Not used by this function; kept so existing calls keep working. Device
        placement is decided inside `generate_step`.

    context_length : int
        Amount of words that get input as "context" into the model. Set to 0 for no limit

    beam_width : int
        Number of sequences kept after every step. 1 gives plain sampling of a
        single sequence.

    Returns
    -------
    None : NoneType
        Doesn't return anything, prints string to console

    Raises
    ------
    ValueError
        If `beam_width` is smaller than 1 or the seed is empty.
    """
    if beam_width < 1:
        raise ValueError('beam_width must be at least 1')

    if isinstance(seed_text, str):
        seed_text = seed_text.split(' ')

    # Work on a copy: appending to the argument itself would leak generated words
    # into the caller's list and into the shared default list across calls.
    seed_tokens = list(seed_text)
    if not seed_tokens:
        raise ValueError('seed_text must contain at least one token')

    if all(isinstance(token, str) for token in seed_tokens):
        seed_ids = list(data_lm.train_ds.vocab.numericalize(seed_tokens))
    else:
        # Already numericalized.
        seed_ids = [int(token) for token in seed_tokens]

    # Candidate sequences as [token indices, score]. The score is the accumulated
    # negative log-probability of the generated words, so lower is better.
    sequences = [[seed_ids, 0.0]]

    # Loop over max number of words
    for step in range(max_len):

        print ('Generating word: ', step, '/', max_len)
        candidates = []

        # For each kept sequence, sample distinct next words and build one
        # candidate per sampled word.
        for words, score in sequences:

            # Obtain probabilities for next word given the context
            probabilities = generate_step(learner, words, context_length)

            # Sampling without replacement needs at least as many non-zero
            # entries as draws.
            n_draws = min(beam_width, int(np.count_nonzero(probabilities)))
            drawn = np.random.choice(len(probabilities), size=n_draws,
                                     replace=False, p=probabilities)

            for next_word_idx in drawn:
                # `words + [...]` builds a new list, so candidates never share state.
                candidates.append([words + [int(next_word_idx)],
                                   score - np.log(probabilities[next_word_idx])])

        # Greedy pruning: keep the beam_width candidates with the lowest score to
        # limit the breadth of the search tree.
        candidates.sort(key=lambda candidate: candidate[1])
        sequences = candidates[:beam_width]

    print_words(sequences[0])


In [59]:
# Seed the generator with verse markup plus a few words; 200 generation steps,
# feeding the last 50 tokens to the model as context.
generate_text(learn, GPU=GPU, seed_text='xbos xbol [verse-1] xeol xbol the answer is', max_len=200, context_length=50)

Generating word:  0 / 200
candidates 1
Generating word:  1 / 200
candidates 1
Generating word:  2 / 200
candidates 1
Generating word:  3 / 200
candidates 1
Generating word:  4 / 200
candidates 1
Generating word:  5 / 200
candidates 1
Generating word:  6 / 200
candidates 1
Generating word:  7 / 200
candidates 1
Generating word:  8 / 200
candidates 1
Generating word:  9 / 200
candidates 1
Generating word:  10 / 200
candidates 1
Generating word:  11 / 200
candidates 1
Generating word:  12 / 200
candidates 1
Generating word:  13 / 200
candidates 1
Generating word:  14 / 200
candidates 1
Generating word:  15 / 200
candidates 1
Generating word:  16 / 200
candidates 1
Generating word:  17 / 200
candidates 1
Generating word:  18 / 200
candidates 1
Generating word:  19 / 200
candidates 1
Generating word:  20 / 200
candidates 1
Generating word:  21 / 200
candidates 1
Generating word:  22 / 200
candidates 1
Generating word:  23 / 200
candidates 1
Generating word:  24 / 200
candidates 1
Generating

In [60]:
# Scratch: argsort returns the indices that sort the values in ascending order,
# so slicing its head gives the positions of the smallest values.
x = [1, 2, 3, 4, 0]
order = np.argsort(x)
print(order)
print(order[:3])

# Scratch: a single multinomial draw is a count vector with exactly one 1;
# the last index of its argsort is the position of that 1.
x2 = [0.3, 0.6, 0.1]
draw = np.random.multinomial(1, x2)
print (draw)
print(np.argsort(draw)[-1])
#torch.multinomial(torch.tensor(x2), 1)[0]


[4 0 1 2 3]
[4 0 1]
[1 0 0]
0


In [23]:
# Scratch check of exp(). NOTE(review): `exp` is not imported explicitly in this
# notebook — presumably it arrives through the fastai star imports; confirm.
exp(0.7)

2.0137527074704766

In [21]:
# Scratch: a set keeps each value once, so the duplicate 1 does not count twice.
x = set([1, 1, 2])
print (len(x))

2
