In [21]:
import pandas as pd
import numpy as np
import nltk.tokenize
import itertools

from pathlib import Path

from fastai import *
from fastai.text import *

## Training Data

In [22]:
# The source CSV has no header row, so the two column names are supplied here.
# Only songs whose lyrics are shorter than 5000 characters are kept.
df = (
    pd.read_csv(
        'https://storage.googleapis.com/w210-capstone/data/lyrics-valid.csv',
        header=None,
        escapechar='\\',
        names=['msd_id', 'lyrics'],
    )
    .loc[lambda frame: frame.lyrics.str.len() < 5000]
)
df.head()

Unnamed: 0,msd_id,lyrics
0,TRAADJU128F92F58E1,I hear you praying with your hands clasped ove...
1,TRAADQX128F422B4CF,If you ever make it back to Nashville\nRemembe...
2,TRAAFTE128F429545F,Just when I thought I was safe\nYou found me i...
3,TRAAKAG128F4275D2A,Paroles de la chanson Sultao Das Matas :\nSult...
4,TRAAMRO128F92F20D7,From What You Whispered\n........................


### Tokenization

1. First consider each line its own "sentence", keeping track of blanklines
2. Regexp Tokenizer with the following:  
 - Bracket enclosed texts (usually song part header)
 - All words
 - Any numeric -- keep commas and periods together
 - All other non-whitespace characters
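3. Wrap each line with `xBOL` and `xEOL` tokens (beginning / end of line)
4. Wrap each song with `xBOS` and `xEOS` tokens (beginning / end of song); every token is lower-cased afterwards, so they appear as `xbol`, `xeol`, `xbos` and `xeos` in the data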

In [24]:
def tokenize_lyrics(lyrics):
    """Tokenize one song's lyrics into a flat list of lower-cased tokens.

    Parameters
    ----------
    lyrics : str
        Raw lyrics of a single song, lines separated by newlines.

    Returns
    -------
    list of str
        ``['xbos', 'xbol', <tokens of line 1>, 'xeol', 'xbol', ..., 'xeol', 'xeos']``.
        Every token is lower-cased and any space inside a token (only possible
        inside a bracketed header such as ``[Verse 1]``) is replaced by ``-``.

    Example
    -------
    ``tokenize_lyrics("Hi there\\n[Verse 1]")`` gives
    ``['xbos', 'xbol', 'hi', 'there', 'xeol', 'xbol', '[verse-1]', 'xeol', 'xeos']``.
    """
    # One "sentence" per physical line; blanklines='keep' so that empty lines
    # still produce an (empty) xbol/xeol pair and stanza breaks are preserved.
    line_tokenizer = nltk.tokenize.LineTokenizer(blanklines='keep')
    lines = line_tokenizer.tokenize(lyrics)

    # Alternatives are tried left to right:
    #   bracketed header | word | digits/periods/commas | any other non-space run
    # NOTE(review): because `\w+` comes before `[\d\.,]+`, a number such as
    # "1,000" is split into "1" and ",000". The pattern is left untouched here
    # because changing it would change the vocabulary of already-saved tokens.
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\[[^\]]+\]|\w+|[\d\.,]+|\S+',
                                                   discard_empty=False)

    # Build the flat sequence directly instead of mutating each line's list
    # through throw-away list comprehensions.
    flat = ['xBOS']
    for line_tokens in word_tokenizer.tokenize_sents(lines):
        flat.append('xBOL')
        flat.extend(line_tokens)
        flat.append('xEOL')
    flat.append('xEOS')

    # lower case and de-space
    return [w.lower().replace(' ', '-') for w in flat]

In [25]:
# Tokenize every song, then record how many tokens each song produced.
df['tokd'] = df['lyrics'].map(tokenize_lyrics)
df['tokd_len'] = df['tokd'].map(len)
df.head()

Unnamed: 0,msd_id,lyrics,tokd,tokd_len
0,TRAADJU128F92F58E1,I hear you praying with your hands clasped ove...,"[xbos, xbol, i, hear, you, praying, with, your...",215
1,TRAADQX128F422B4CF,If you ever make it back to Nashville\nRemembe...,"[xbos, xbol, if, you, ever, make, it, back, to...",196
2,TRAAFTE128F429545F,Just when I thought I was safe\nYou found me i...,"[xbos, xbol, just, when, i, thought, i, was, s...",186
3,TRAAKAG128F4275D2A,Paroles de la chanson Sultao Das Matas :\nSult...,"[xbos, xbol, paroles, de, la, chanson, sultao,...",58
4,TRAAMRO128F92F20D7,From What You Whispered\n........................,"[xbos, xbol, from, what, you, whispered, xeol,...",310


For now, save the same tokenized songs as both the train and the validation split.

## Model file prep

In [26]:
# Name of this experiment; also used as the folder name and the saved-weights name.
model_title = '2.0-ULMFiT'
MODEL_PATH = Path(f'../data/models/{model_title}')
# parents=True: on a fresh checkout `../data/models` may not exist yet, and
# mkdir without it raises FileNotFoundError.
MODEL_PATH.mkdir(parents=True, exist_ok=True)

TOKEN_PATH = MODEL_PATH/'tokens'
TOKEN_PATH.mkdir(parents=True, exist_ok=True)

In [27]:
# When True, the tokenized songs are written to disk as the train/valid token files.
SAVE_TOKENS = True

In [28]:
if SAVE_TOKENS:
    # One object array holding a token list per song; the identical array is
    # written under both file names, so train and valid are the same data.
    tokens = np.array(df.tokd)

    for token_file in ('train_tok.npy', 'valid_tok.npy'):
        np.save(MODEL_PATH/token_file, tokens)

## ULMFiT Language Model

In [29]:
# When True, the pretrained WikiText-103 weights and vocabulary are fetched into MODEL_PATH/'models'.
DOWNLOAD_WIKITEXT = True

In [33]:
if DOWNLOAD_WIKITEXT:
    url = 'http://files.fast.ai/models/wt103_v1/'
    # Nothing else in this notebook creates the `models` sub-folder, so make
    # sure it exists before writing into it (do not rely on download_url to do so).
    pretrained_dir = MODEL_PATH/'models'
    pretrained_dir.mkdir(parents=True, exist_ok=True)

    # Pretrained LSTM weights and the matching index-to-string vocabulary.
    for fname in ('lstm_wt103.pth', 'itos_wt103.pkl'):
        download_url(f'{url}{fname}', pretrained_dir/fname)

HBox(children=(IntProgress(value=0, max=221972701), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1027972), HTML(value='')))

In [31]:
# Build the language-model data bunch from the token files under MODEL_PATH,
# with a batch size of 128 and `max_vocab=10000` (the printed vocab size below
# is 10002 — presumably the cap plus special tokens; TODO confirm).
data_lm = TextLMDataBunch.from_tokens(MODEL_PATH,
                                      bs=128,
                                      max_vocab=10000)

print(data_lm.train_ds.vocab_size)

Numericalizing train.
Numericalizing valid.
10002


In [12]:
# Pull a single batch from the training loader and decode a small corner of it
# (first 20 rows, first 10 columns) back into text for a visual sanity check.
x, y = next(iter(data_lm.train_dl))
print(x.shape, y.shape)

example = x[:20, :10].cpu()
decode = data_lm.train_ds.vocab.textify
texts = pd.DataFrame([decode(row).split(' ') for row in example])
texts

torch.Size([95, 128]) torch.Size([12160])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,xbos,xeol,'s,[outro],will,xeol,so,xbol,won,my
1,xbol,xbol,sad,xeol,destroy,xbol,i,i,'t,soul
2,i,i,but,xbol,her,i,fight,said,deal,xeol
3,hear,'ll,you,i,xeol,'ve,xeol,",",xeol,xeos
4,you,do,seem,'ll,xbol,been,xbol,it,xbol,xbos
5,praying,anything,better,never,so,takin,i,'s,i,xbol
6,with,for,when,know,i,',cannot,my,won,xxunk
7,your,you,i,xeol,'m,all,shake,life,'t,xxunk
8,hands,",",'m,xbol,giving,the,from,xeol,change,em
9,xxunk,show,gone,i,up,blame,my,xbol,",",busca


Load the ULMFiT model architecture and create an embedding matrix that includes the new words. The new words are initialized to the mean value of all prior vocab...

TODO: maybe update the initialization points to the mean value of prior vocab that we keep in this model. e.g. average of the words that are in the lyrics corpus.

In [34]:
# Create the language-model learner on top of `data_lm`, starting from the
# pretrained WikiText-103 weights/vocabulary downloaded earlier, then display
# the resulting architecture.
learn = RNNLearner.language_model(data_lm,
                                  pretrained_fnames=['lstm_wt103', 'itos_wt103'],
                                  drop_mult=0.5)

learn.model

SequentialRNN(
  (0): RNNCore(
    (encoder): Embedding(10002, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(10002, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1150)
      )
      (1): WeightDropout(
        (module): LSTM(1150, 1150)
      )
      (2): WeightDropout(
        (module): LSTM(1150, 400)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=10002, bias=True)
    (output_dp): RNNDropout()
  )
)

Fit one cycle, but keep all layers frozen except the linear encoder and decoder. Start with a relatively low learning rate.

In [None]:
# First pass before `unfreeze()` is called: arguments are (1, 1e-2) —
# presumably one epoch at a peak learning rate of 1e-2; TODO confirm against the fastai version in use.
learn.fit_one_cycle(1, 1e-2)

VBox(children=(HBox(children=(IntProgress(value=0, max=1), HTML(value='0.00% [0/1 00:00<00:00]'))), HTML(value…

In [50]:
# Unfreeze the model and fine-tune with arguments (10, 1e-3); the log below
# shows 10 epochs at roughly 19 minutes each (3:14:25 in total).
learn.unfreeze()
learn.fit_one_cycle(10, 1e-3)

VBox(children=(HBox(children=(IntProgress(value=0, max=10), HTML(value='0.00% [0/10 00:00<00:00]'))), HTML(val…

Total time: 3:14:25
epoch  train loss  valid loss  accuracy
0      3.221238    3.089552    0.416633  (19:27)
1      3.072464    2.944254    0.437782  (19:26)
2      2.959016    2.841134    0.452242  (19:25)
3      2.886021    2.753556    0.464477  (19:26)
4      2.822960    2.687229    0.474104  (19:30)
5      2.759549    2.625144    0.483686  (19:24)
6      2.715732    2.583514    0.490242  (19:26)
7      2.670489    2.546307    0.495953  (19:26)
8      2.656668    2.527033    0.499007  (19:24)
9      2.673330    2.524118    0.499475  (19:27)



In [51]:
# Persist the fine-tuned weights under the experiment name so they can be reloaded below.
learn.save(f'{model_title}')

In [52]:
# Selects the weight-loading path and the device used by generate_text below.
# NOTE(review): hard-coded; on a machine without CUDA this must be set to False
# by hand (consider deriving it from torch.cuda.is_available()).
GPU = True

In [53]:
def cpu_load(self, name:PathOrStr):
    "Load the weights stored as `name` in `self.model_dir` into `self.model`, keeping every tensor on the CPU."
    weights_file = self.path/self.model_dir/f'{name}.pth'
    # Returning the storage unchanged leaves it where deserialization put it,
    # instead of moving it back to the device recorded in the file.
    keep_storage = lambda storage, loc: storage
    state = torch.load(weights_file, map_location=keep_storage)
    self.model.load_state_dict(state)

# Monkey patch: expose the function as a method on every RNNLearner.
RNNLearner.cpu_load = cpu_load

In [54]:
# Reload the saved weights; without a GPU go through the CPU-mapping loader.
if GPU:
    learn.load(f'{model_title}')
else:
    learn.cpu_load(f'{model_title}')

## Text Generation

In [59]:
def generate_text(learner, seed_text=['xbos'], max_len=500, GPU=False):
    """Sample text from a language-model learner and print it to the console.

    The model is reset once, primed with ``seed_text``, and then fed only the
    most recently sampled token on each step: the recurrent hidden state
    already summarizes everything seen so far, so replaying the whole context
    would push earlier tokens through the state again (and cost quadratic time).

    Parameters
    ----------
    learner : RNNLearner Language Model (RNNLearner.language_model())
        Fastai RNNLearner with tokenized language model data already loaded.
        Its own ``learner.data.train_ds.vocab`` is used to map tokens to ids
        and back, so the function does not depend on a notebook-level variable.

    seed_text : list
        List of strings where each item is a token. (e.g. ['the', 'cat']).
        Must not be empty. The default list is never mutated.

    max_len : int
        Maximum number of tokens to sample. Generation stops earlier when the
        end-of-song token ``xeos`` is drawn.

    GPU : bool
        Run on CUDA when True, on the CPU otherwise. Must match the device the
        model currently lives on.

    Returns
    -------
    None : NoneType
        Doesn't return anything, prints string to console

    Raises
    ------
    ValueError
        If ``seed_text`` is empty.

    Notes
    -----
    Sampling uses torch's random generator and is not seeded here; call
    ``torch.manual_seed`` beforehand for reproducible output.
    """
    if not seed_text:
        raise ValueError('seed_text must contain at least one token')

    # Vocabulary ids that must never be sampled (unknown and padding).
    UNK_IDX, PAD_IDX = 0, 1

    model = learner.model
    vocab = learner.data.train_ds.vocab
    device = torch.device('cuda' if GPU else 'cpu')

    # Shape (seq_len, 1): a single sequence, one token per row.
    context = torch.tensor(vocab.numericalize(seed_text),
                           dtype=torch.long, device=device).view(-1, 1)

    # reset model's hidden state
    # we don't want to carry over old contexts
    model.reset()
    model.eval()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for _ in range(max_len):
            # First element of the model output holds the decoder scores;
            # the last row belongs to the last token that was fed in.
            logits = model(context)[0][-1]

            # set unk and pad to 0 prob
            # i.e. never pick unknown or pad
            logits[UNK_IDX] = float('-inf')
            logits[PAD_IDX] = float('-inf')

            probabilities = F.softmax(logits, dim=0)
            token_index = torch.multinomial(probabilities, 1).item()

            # Only the new token is fed next; the hidden state carries the rest.
            context = torch.tensor([[token_index]], dtype=torch.long, device=device)

            word = vocab.textify([token_index])

            if word == 'xeol':
                word = '\n'
            elif 'xbol' in word:
                # line-start markers are not printed
                continue
            elif word == 'xeos':
                break

            print(word, end=' ')

In [None]:
# Generate up to 2000 tokens, starting a new song at the beginning of its first line.
generate_text(learn, GPU=GPU, seed_text=['xbos', 'xbol'], max_len=2000)

## Export Embedding Matrix

In [25]:
# Extract the encoder's embedding weights from the model's state dict and
# convert them to a NumPy array on the CPU.
encoder_weight = learn.model.state_dict().get('0.encoder.weight')
embed = encoder_weight.cpu().numpy()
embed

array([[ 0.49085233, -0.25270373, -0.18936859, ..., -0.10786957,
        -0.15040247,  0.08028701],
       [ 0.38108253,  0.46519557, -0.04423388, ..., -0.32877025,
         0.544967  , -0.6305809 ],
       [ 0.2315303 ,  0.01689303,  0.5893272 , ..., -0.02504184,
        -0.28303975,  0.18284719],
       ...,
       [ 0.21922407,  0.32938698,  0.04211523, ..., -0.2696808 ,
         1.0012609 , -0.22526287],
       [ 0.69269246,  0.845887  , -0.0607295 , ..., -0.33333674,
         1.2671981 , -0.8235619 ],
       [ 0.54437697,  1.2663454 , -0.09743147, ..., -0.84379476,
         1.0860044 , -0.31181222]], dtype=float32)

In [26]:
# Embedding vectors, one row per vocabulary entry. The token index is attached
# to the frame but not written out: the file holds only tab-separated numbers.
vocab_tokens = data_lm.train_ds.vocab.itos
df_embed = pd.DataFrame(embed, index=vocab_tokens)

embed_export_opts = dict(sep='\t', index=False, header=False)
df_embed.to_csv('../data/models/embeddings.csv', **embed_export_opts)

In [27]:
# Companion file to the embeddings export: one token per line, in the same
# row order as the embedding matrix, without header or index column.
df_meta = pd.DataFrame({'token': data_lm.train_ds.vocab.itos})

meta_export_opts = dict(sep='\t', header=False, index=False)
df_meta.to_csv('../data/models/embeddings_meta.csv', **meta_export_opts)

## Graph Embedding Space

The full semantic and contextual representation that the model learns cannot be visualized directly, however much it might deserve it. To make this figure, we collapse the 400 dimensions of "information" into 3 so that we can view the tokens in 3-dimensional space. In the process of reducing the dimensionality, we lose ~X% of our data's variance (TODO: fill in the measured value). Even with this loss of interpretability, very clear semantic understandings of the language model emerge in the 3D space below.

In [13]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools

In [14]:
from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold
from sklearn.utils import check_random_state

In [15]:
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import

In [28]:
# Project the embedding matrix down to 3 components with t-SNE (PCA
# initialization, fixed random_state for repeatability). The result is
# transposed so that trans_data[0], trans_data[1], trans_data[2] are the
# x/y/z coordinate arrays used by the plot below.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# `trans_data` was not produced in that session — expect this step to be slow.
tsne = manifold.TSNE(n_components=3, init='pca', random_state=0)
trans_data = tsne.fit_transform(embed).T

KeyboardInterrupt: 

In [None]:
# One colour value per plotted point: the token's row index in the embedding
# matrix (presumably its frequency rank in the vocabulary — TODO confirm).
# The previous version passed `x` (a training batch tensor) as the colour and
# an undefined `cmap` as the colour scale.
token_rank = np.arange(trans_data.shape[1])

p1 = go.Scatter3d(x=trans_data[0], y=trans_data[1], z=trans_data[2],
                  mode='markers',
                  # hover label: the token each point stands for
                  text=data_lm.train_ds.vocab.itos,
                  marker=dict(color=token_rank,
                              colorscale='Viridis',
                              showscale=False,
                              line=dict(color='black', width=1)))

layout = dict(margin=dict(l=10, r=10,
                          t=30, b=10)
             )

fig = go.Figure(data=[p1], layout=layout)

py.iplot(fig)