In [1]:
import pandas as pd
import numpy as np
import nltk.tokenize
import itertools

from pathlib import Path

from fastai import *
from fastai.text import *

## Read in Data

In [2]:
# Load the validation lyrics: a header-less CSV with one (msd_id, lyrics) row per song.
df = pd.read_csv(
    'https://storage.googleapis.com/w210-capstone/data/lyrics-valid.csv',
    header=None,
    escapechar='\\',
    names=['msd_id', 'lyrics'],
)
# Keep only songs whose lyrics are shorter than 5000 characters.
# .copy() makes the filtered result an independent frame, so adding columns to
# it later does not raise pandas' SettingWithCopyWarning.
df = df[df.lyrics.str.len() < 5000].copy()
df.head()

Unnamed: 0,msd_id,lyrics
0,TRAADJU128F92F58E1,I hear you praying with your hands clasped ove...
1,TRAADQX128F422B4CF,If you ever make it back to Nashville\nRemembe...
2,TRAAFTE128F429545F,Just when I thought I was safe\nYou found me i...
3,TRAAKAG128F4275D2A,Paroles de la chanson Sultao Das Matas :\nSult...
4,TRAAMRO128F92F20D7,From What You Whispered\n........................


## Tokenization

1. First consider each line its own "sentence", keeping track of blanklines
2. Regexp Tokenizer with the following:  
 - Bracket enclosed texts (usually song part header)
 - All words
 - Any numeric -- keep commas and periods together
 - All other non-whitespace characters
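3. Wrap each line with `xBOL` and `xEOL` tokens
4. Wrap each song with `xBOS` and `xEOS` tokens (documents)
5. Lower-case every token and replace spaces inside a token with `-` (so the markers end up as `xbol`, `xeol`, `xbos`, `xeos`)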

In [3]:
def tokenize_lyrics(lyrics):
    """Tokenize one song's lyrics into a flat list of lower-cased tokens.

    The lyrics are split into lines (blank lines are kept), each line is split
    with a regular expression, and marker tokens are added around every line
    and around the whole song.

    Parameters
    ----------
    lyrics : str
        Raw lyrics of a single song, lines separated by newlines.

    Returns
    -------
    list of str
        ``['xbos', 'xbol', <line tokens>, 'xeol', ..., 'xeos']``. Every token
        is lower-cased and any space inside a token is replaced by ``-``.
    """
    line_tokenizer = nltk.tokenize.LineTokenizer(blanklines='keep')
    # Alternatives are tried left to right:
    #   bracketed text | word characters | digits/periods/commas | any other non-space run
    # NOTE(review): `\w+` also matches digits and is tried before `[\d\.,]+`,
    # so "1,000" comes out as "1" and ",000". Left unchanged on purpose:
    # changing it would alter the tokens that existing vocabularies were built from.
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\[[^\]]+\]|\w+|[\d\.,]+|\S+',
                                                   discard_empty=False)
    tokenized_lines = word_tokenizer.tokenize_sents(line_tokenizer.tokenize(lyrics))

    # Build the flat sequence with an explicit loop instead of list
    # comprehensions that were only run for their side effects.
    flat = ['xBOS']
    for line_tokens in tokenized_lines:
        flat.append('xBOL')
        flat.extend(line_tokens)
        flat.append('xEOL')
    flat.append('xEOS')

    # lower case and de-space (the markers are lower-cased here as well)
    return [w.lower().replace(' ', '-') for w in flat]

In [4]:
# Tokenize every song, then record how many tokens each song produced.
df['tokd'] = df['lyrics'].map(tokenize_lyrics)
df['tokd_len'] = df['tokd'].map(len)
df.head()

Unnamed: 0,msd_id,lyrics,tokd,tokd_len
0,TRAADJU128F92F58E1,I hear you praying with your hands clasped ove...,"[xbos, xbol, i, hear, you, praying, with, your...",215
1,TRAADQX128F422B4CF,If you ever make it back to Nashville\nRemembe...,"[xbos, xbol, if, you, ever, make, it, back, to...",196
2,TRAAFTE128F429545F,Just when I thought I was safe\nYou found me i...,"[xbos, xbol, just, when, i, thought, i, was, s...",186
3,TRAAKAG128F4275D2A,Paroles de la chanson Sultao Das Matas :\nSult...,"[xbos, xbol, paroles, de, la, chanson, sultao,...",58
4,TRAAMRO128F92F20D7,From What You Whispered\n........................,"[xbos, xbol, from, what, you, whispered, xeol,...",310


For now, the same tokens are saved as both the training set and the validation set.

## Model file prep

In general, you need train tokens, validation tokens, an int-to-string (itos) mapping, and your state dict (i.e. weights, parameters, .pth file; these three are all the same thing)

In [6]:
# Set to True to write the tokenized lyrics to MODEL_PATH as train_tok.npy and valid_tok.npy.
SAVE_TOKENS = False

In [5]:
# Name of this model; also used as the checkpoint name when saving and loading weights.
model_title = '2.0-ULMFiT'
# Everything belonging to this model lives under ../data/models/<model_title>.
MODEL_PATH = Path(f'../data/models/{model_title}')
# parents=True: on a fresh checkout ../data/models may not exist yet, and
# mkdir() without it raises FileNotFoundError in that case.
MODEL_PATH.mkdir(parents=True, exist_ok=True)

In [7]:
if SAVE_TOKENS:
    # One array entry per song, each entry being that song's token list.
    tokens = np.array(df.tokd)

    # The same tokens are written twice: once as the training split, once as the validation split.
    for fname in ('train_tok.npy', 'valid_tok.npy'):
        np.save(MODEL_PATH/fname, tokens)

## ULMFiT Language Model

In [8]:
# Fetch the WikiText-103 weights (lstm_wt103.pth) and vocabulary (itos_wt103.pkl).
DOWNLOAD_WIKITEXT = False
# Fetch the lyrics model weights (2.0-ULMFiT.pth) and vocabulary (itos.pkl).
DOWNLOAD_PRETRAINED_LYRICS = True

In [9]:
# Checkpoints are fetched into MODEL_PATH/'models'. Create that directory up
# front: download_url is handed a full file path and is not guaranteed to
# create missing parent directories itself.
if DOWNLOAD_WIKITEXT or DOWNLOAD_PRETRAINED_LYRICS:
    (MODEL_PATH/'models').mkdir(parents=True, exist_ok=True)

if DOWNLOAD_WIKITEXT:
    # WikiText-103 language-model weights and the matching vocabulary.
    url = 'http://files.fast.ai/models/wt103_v1/'
    download_url(f'{url}lstm_wt103.pth', MODEL_PATH/'models/lstm_wt103.pth')
    download_url(f'{url}itos_wt103.pkl', MODEL_PATH/'models/itos_wt103.pkl')

if DOWNLOAD_PRETRAINED_LYRICS:
    # Lyrics language-model weights and the matching vocabulary.
    url = 'https://storage.googleapis.com/capstone-deep-lyrics/'
    download_url(f'{url}2.0-ULMFiT.pth', MODEL_PATH/'models/2.0-ULMFiT.pth')
    download_url(f'{url}itos.pkl', MODEL_PATH/'models/itos.pkl')

In [10]:
# Build the language-model data bunch from token files found under MODEL_PATH
# (presumably the train_tok.npy / valid_tok.npy arrays written when SAVE_TOKENS
# is on — confirm against fastai's from_tokens), with a batch size of 128.
data_lm = TextLMDataBunch.from_tokens(MODEL_PATH,
                                      bs=128,
                                      max_vocab=10000)

# The recorded output is 10002, i.e. two entries more than max_vocab
# (presumably special tokens added by fastai — confirm).
print(data_lm.train_ds.vocab_size)

10002


In [11]:
# Grab a single training batch and decode its top-left corner back into tokens.
x, y = next(iter(data_lm.train_dl))
print(x.shape, y.shape)

# First 20 rows and first 10 columns of the batch, moved to the CPU.
example = x[:20, :10].cpu()
texts = pd.DataFrame([
    data_lm.train_ds.vocab.textify(row).split(' ')
    for row in example
])
texts

torch.Size([95, 128]) torch.Size([12160])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,xbos,xeol,'s,[outro],will,xeol,so,xbol,won,my
1,xbol,xbol,sad,xeol,destroy,xbol,i,i,'t,soul
2,i,i,but,xbol,her,i,fight,said,deal,xeol
3,hear,'ll,you,i,xeol,'ve,xeol,",",xeol,xeos
4,you,do,seem,'ll,xbol,been,xbol,it,xbol,xbos
5,praying,anything,better,never,so,takin,i,'s,i,xbol
6,with,for,when,know,i,',cannot,my,won,xxunk
7,your,you,i,xeol,'m,all,shake,life,'t,xxunk
8,hands,",",'m,xbol,giving,the,from,xeol,change,em
9,xxunk,show,gone,i,up,blame,my,xbol,",",busca


## Learner instantiation

Set a few basic training/saving parameters here:

In [12]:
# Run the fit_one_cycle training cells below.
TRAIN = False
# Together with TRAIN: write the trained weights to disk under model_title.
SAVE = False
# True: load weights with the stock loader and put generation tensors on CUDA.
# False: load weights through the CPU-remapping loader and keep tensors on the CPU.
GPU = False

Load the ULMFiT model architecture and create an embedding matrix that includes the new words. The new words are initialized to the mean value of all prior vocab...

In [13]:
# Create the language-model learner on data_lm, starting from the pretrained
# '2.0-ULMFiT' weights and the 'itos' vocabulary.
# NOTE(review): these names match the files fetched when DOWNLOAD_PRETRAINED_LYRICS
# is on; presumably they are resolved under MODEL_PATH/'models' — confirm.
learn = RNNLearner.language_model(data_lm,
                                  pretrained_fnames=['2.0-ULMFiT', 'itos'],
                                  drop_mult=0.5)

# Display the architecture.
learn.model

SequentialRNN(
  (0): RNNCore(
    (encoder): Embedding(10002, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(10002, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1150)
      )
      (1): WeightDropout(
        (module): LSTM(1150, 1150)
      )
      (2): WeightDropout(
        (module): LSTM(1150, 400)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=10002, bias=True)
    (output_dp): RNNDropout()
  )
)

Fit one cycle, but keep all layers frozen except the linear encoder and decoder. Start with a relatively low learning rate.

In [14]:
if TRAIN:
    # First training stage: fit_one_cycle(1, 1e-2), run before learn.unfreeze() is called.
    learn.fit_one_cycle(1, 1e-2)

In [15]:
if TRAIN:
    # Second training stage: unfreeze the model, then fit_one_cycle(10, 1e-3),
    # i.e. with a learning rate ten times smaller than in the first stage.
    learn.unfreeze()
    learn.fit_one_cycle(10, 1e-3)

In [16]:
if TRAIN and SAVE:
    # Weights are only written when training actually ran AND saving was requested.
    # The checkpoint name is always model_title, so an existing checkpoint of that name is replaced.
    learn.save(f'{model_title}') #WARNING: STATIC TITLE...DONT OVERWRITE MODELS

## Model Load

In [17]:
def cpu_load(self, name:PathOrStr):
    """Load the weights stored as `name`.pth in `self.model_dir` into `self.model` on the CPU.

       Meant for checkpoints that were trained and saved on a GPU: the storage
       location is remapped while loading, which the fastai load function used
       here does not allow for."""
    checkpoint_file = self.path/self.model_dir/f'{name}.pth'
    # Returning `storage` unchanged keeps every tensor where it was deserialized (the CPU).
    state_dict = torch.load(checkpoint_file, map_location=lambda storage, loc: storage)
    self.model.load_state_dict(state_dict)

RNNLearner.cpu_load = cpu_load  # monkey patch onto our RNNLearner

In [18]:
# Pick the loader that matches the hardware: the stock loader on a GPU machine,
# the CPU-remapping loader everywhere else.
if GPU:
    learn.load(f'{model_title}')
else:
    learn.cpu_load(f'{model_title}')

## Text Generation

In [81]:
def generate_text(learner, seed_text=['xbos'], max_len=500, GPU=False, context_length=20):
    """Sample text from a language-model learner and print it to the console.

    Parameters
    ----------
    learner : RNNLearner Language Model (RNNLearner.language_model())
        Learner whose `model` is sampled from and whose `data.train_ds.vocab`
        converts between tokens and ids.

    seed_text : list or str
        List of tokens (e.g. ['the', 'cat']) or a string that is split on whitespace.
        It is only read, so the shared default list is never mutated.

    max_len : int
        Maximum number of tokens to sample; sampling stops early once `xeos` is drawn.

    GPU : bool
        Move the context tensors to CUDA when True, keep them on the CPU otherwise.

    context_length : int
        Number of most recent tokens fed to the model at every step. Set to 0 for no limit.

    Returns
    -------
    None : NoneType
        Doesn't return anything, prints the generated text to the console.

    Raises
    ------
    ValueError
        If `seed_text` contains no tokens.
    """
    model = learner.model
    # Use the learner's own vocabulary rather than a notebook-level global, so
    # the function works with whichever learner it is given.
    vocab = learner.data.train_ds.vocab

    if isinstance(seed_text, str):
        # split() without arguments splits on any whitespace run, so doubled,
        # leading or trailing spaces do not produce empty tokens.
        seed_text = seed_text.split()
    if len(seed_text) == 0:
        raise ValueError('seed_text must contain at least one token')

    def _to_device(tensor):
        # Single place that decides where the context tensors live.
        return tensor.cuda() if GPU else tensor.cpu()

    # Column vector of token ids: one row per seed token.
    context = _to_device(LongTensor(vocab.numericalize(seed_text)).view(-1, 1))

    # reset model's hidden state
    # we don't want to carry over old contexts
    model.reset()
    model.eval()
    # NOTE(review): the context window is re-fed on every step; if the model also
    # keeps its hidden state between calls, the context is seen repeatedly — confirm
    # this is intended.

    # Sampling needs no gradients.
    with torch.no_grad():
        for _step in range(max_len):
            # Feed the most recent tokens; context[-0:] is the whole tensor,
            # which is why context_length=0 means "no limit".
            logits, *_rest = model(context[-context_length:])
            # Scores for the token following the last context position.
            logits = logits[-1]

            # set unk and pad to 0 prob
            # i.e. never pick unknown or pad
            logits[0] = -np.inf
            logits[1] = -np.inf

            # softmax, then renormalize in float64 so that rounding does not make
            # the multinomial draw reject the probability vector.
            # (np.float64 replaces the `np.float` alias removed in NumPy 1.24.)
            probabilities = F.softmax(logits, dim=0).detach().cpu().numpy().astype(np.float64)
            probabilities /= np.sum(probabilities)

            # draw one token and add it to the context
            token_index = int(np.argmax(np.random.multinomial(1, probabilities)))
            context = torch.cat((context, _to_device(LongTensor([token_index]).view(-1, 1))))

            word = vocab.textify([token_index])

            if word == 'xeol':
                # end of line: emit a line break instead of the marker
                word = '\n'
            elif 'xbol' in word:
                # beginning-of-line markers are not printed
                continue
            elif word == 'xeos':
                # end of song: print the marker and stop
                print(word)
                break

            print(word, end=' ')

In [85]:
# Sample up to 1200 tokens, seeded with the opening of a first verse and using a 70-token context window.
# Tokens are drawn at random without a fixed seed, so the printed lyrics differ from run to run.
generate_text(learn, GPU=GPU, seed_text='xbos xbol [verse-1]', max_len=1200, context_length=70)


 there 's a baby lust within my veins 
 of a man like me she takes a chasing ' flame 
 she 's got a better life that hell would bring 
 she 'd take me to evil place 
 but gonna live with her mama 
 comin ' to hell , i went out for my wife 
 
 [chorus] 
 and time goes on like a mother 's love 
 god 's got a son but she ain 't afraid 
 time ain 't working on a mother 's son 
 well , she ain 't wanna wife but a shooting star 
 'cause a woman 's man ain 't an property man 
 and her ain 't gonna pay no mind 
 
 [outro] 
 what 's your heart 
 what 's your soul 
 what 's your life 
 what 's your life 
 xeos


## Export Embedding Matrix

In [21]:
# Pull the encoder's embedding weights out of the model's state dict as a NumPy array
# (the model summary above lists the encoder as Embedding(10002, 400)).
embed = learn.model.state_dict().get('0.encoder.weight').cpu().numpy()
embed

array([[-4.7749771e-05,  6.8505716e-01, -2.0456196e-01, ...,
         5.7345431e-02,  1.7560309e-01, -6.9935806e-02],
       [ 1.7684676e-01,  7.5929588e-01, -8.8490233e-02, ...,
        -3.9831889e-01, -4.3732692e-02, -2.3929070e-01],
       [ 2.9668090e-01,  9.6345782e-01,  5.6869864e-01, ...,
        -9.5396101e-01,  6.0601085e-01, -4.6033749e-01],
       ...,
       [-2.5833974e-02,  4.6223259e-01,  1.9239992e-01, ...,
        -5.3919351e-01, -3.0052951e-01, -4.8572969e-01],
       [-2.4416618e-01,  6.6907865e-01,  2.3509252e-01, ...,
        -1.8074957e-01,  1.2514262e-01, -3.6873725e-01],
       [-3.6761138e-01,  7.0088875e-01,  1.4120845e-01, ...,
        -2.5032681e-01,  2.5526062e-01,  2.9174709e-01]], dtype=float32)

In [22]:
# One row of embedding values per vocabulary entry, labelled with its token.
df_embed = pd.DataFrame(embed, index=data_lm.train_ds.vocab.itos)

# Tab-separated values only: neither the token index nor a header row is written to this file.
df_embed.to_csv(
    '../data/models/embeddings.csv',
    sep='\t',
    header=False,
    index=False,
)

In [23]:
# The vocabulary tokens, one per row, in the same order as the embedding rows.
df_meta = pd.DataFrame({'token': data_lm.train_ds.vocab.itos})

# Written without header or index, so the file holds exactly one token per line.
df_meta.to_csv(
    '../data/models/embeddings_meta.csv',
    sep='\t',
    index=False,
    header=False,
)

## Graph Embedding Space

The full semantic and contextual representation that the model learns cannot be visualized directly. To make this figure, we collapse the 400 dimensions of "information" into 3 dimensions so that we can view it in 3-dimensional space. In the process of reducing the dimensionality, we lose ~X% of our data's variance (TODO: fill in the measured value). Even with this loss of interpretability, very clear semantic structure learned by the language model emerges in the 3D space below.

In [24]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools

In [25]:
from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold
from sklearn.utils import check_random_state

In [26]:
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import

In [27]:
#tsne = manifold.TSNE(n_components=3, init='pca', random_state=0)
#trans_data = tsne.fit_transform(embed).T

In [28]:
# p1 = go.Scatter3d(x=trans_data[0], y=trans_data[1], z=trans_data[2],
#                   mode='markers', 
#                   marker=dict(color=x, 
#                               colorscale=cmap,
#                               showscale=False,
#                               line=dict(color='black', width=1)))

# layout=dict(margin=dict(l=10, r=10,
#                         t=30, b=10)
#            )

# fig = go.Figure(data=[p1], layout=layout)

# py.iplot(fig)