In [3]:
import itertools
import re
from pathlib import Path

import nltk.tokenize
import numpy as np
import pandas as pd

## Training Data

In [2]:
# Load the scraped lyrics for the 10k-song subset.
df = pd.read_csv('../data/interim/subset-10k/genius_lyrics.csv')
# Keep only songs whose lyrics are shorter than 5000 characters (rows with
# missing lyrics fail the comparison and are dropped too).  The explicit
# .copy() makes `df` an independent frame, so adding columns to it in later
# cells does not trigger pandas' SettingWithCopyWarning.
df = df[df.lyrics.str.len() < 5000].copy()
df.head()

Unnamed: 0,msd_id,lyrics
0,TRAAAAW128F429D538,"\n\n[Verse 1]\nAlright, I might\nHave had a li..."
1,TRAAABD128F429CF47,"\n\nDarling, I don't know much\nBut I know I l..."
2,TRAAADZ128F9348C2E,\n\nSiento una pena muy honda dentro del alma\...
3,TRAAAEF128F4273421,\n\nAdam Ant/Marco Pirroni\nEvery girl is a so...
4,TRAAAFD128F92F423A,"\n\nI've just erased it's been a while, I've g..."


### Tokenization

1. First consider each line its own "sentence", keeping track of blanklines
2. Regexp Tokenizer with the following:  
 - Bracket enclosed texts (usually song part header)
 - All words
 - Any numeric -- keep commas and periods together
 - All other non-whitespace characters
3. Wrap each line with `<s>` and `</s>` tokens
4. Wrap each song with `<d>` and `</d>` tokens (documents)

In [22]:
# One alternative per token class, tried left to right at each position:
#   1. bracket-enclosed text (usually a song-part header such as "[Verse 1]")
#   2. numbers with internal separators ("1,000", "3.5") -- this has to come
#      before \w+, which would otherwise consume the leading digits and split
#      the number into "1" and ",000"
#   3. runs of word characters
#   4. runs of digits, periods and commas (e.g. "...")
#   5. any other run of non-whitespace characters
_TOKEN_RE = re.compile(r'\[[^\]]+\]|\d+(?:[.,]\d+)+|\w+|[\d\.,]+|\S+')


def tokenize_lyrics(lyrics):
    """Tokenize one song's lyrics into a flat list of lower-cased tokens.

    Every line of the input (blank lines included) is treated as its own
    "sentence": it is split with ``_TOKEN_RE`` and wrapped in ``<s>`` /
    ``</s>`` markers.  The whole song is wrapped in ``<d>`` / ``</d>``.
    Tokens are lower-cased and any space inside a token (only possible in
    bracketed headers) is replaced by ``-``, e.g. ``[Verse 1]`` becomes
    ``[verse-1]``.

    Parameters
    ----------
    lyrics : str
        Raw lyrics text of a single song.

    Returns
    -------
    list of str
        ``['<d>', '<s>', ..., '</s>', ..., '</d>']``; an empty string yields
        ``['<d>', '</d>']``.
    """
    tokens = ['<d>']
    for line in lyrics.splitlines():
        tokens.append('<s>')  # start-of-line marker
        tokens.extend(tok.lower().replace(' ', '-')
                      for tok in _TOKEN_RE.findall(line))
        tokens.append('</s>')  # end-of-line marker
    tokens.append('</d>')
    return tokens

In [21]:
# Tokenize every song, then record how many tokens each one produced.
df['tokd'] = df['lyrics'].map(tokenize_lyrics)
df['tokd_len'] = df['tokd'].map(len)
df.head()

Unnamed: 0,msd_id,lyrics,tokd,tokd_len
0,TRAAAAW128F429D538,"\n\n[Verse 1]\nAlright, I might\nHave had a li...","[<d>, <s>, </s>, <s>, </s>, <s>, [verse-1], </...",651
1,TRAAABD128F429CF47,"\n\nDarling, I don't know much\nBut I know I l...","[<d>, <s>, </s>, <s>, </s>, <s>, darling, ,, i...",361
2,TRAAADZ128F9348C2E,\n\nSiento una pena muy honda dentro del alma\...,"[<d>, <s>, </s>, <s>, </s>, <s>, siento, una, ...",127
3,TRAAAEF128F4273421,\n\nAdam Ant/Marco Pirroni\nEvery girl is a so...,"[<d>, <s>, </s>, <s>, </s>, <s>, adam, ant, /m...",322
4,TRAAAFD128F92F423A,"\n\nI've just erased it's been a while, I've g...","[<d>, <s>, </s>, <s>, </s>, <s>, i, 've, just,...",181


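For now, save the same tokenized songs as both the training set and the validation set. Note that the validation metrics reported below are therefore not measured on held-out data.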

In [48]:
token_path = '../data/interim/subset-10k/tok_test'
# np.save does not create missing directories, so make sure the target exists.
Path(token_path).mkdir(parents=True, exist_ok=True)
# Array built from the `tokd` column (each song's token list).
tok_test = np.array(df.tokd)
# Placeholder split: the same array is written as both the training and the
# validation file.
np.save(token_path+'/train_tok.npy', tok_test)
np.save(token_path+'/valid_tok.npy', tok_test)

In [24]:
# Show the token sequence of the first song to sanity-check the tokenizer.
tok_test[0]

['<d>',
 '<s>',
 '</s>',
 '<s>',
 '</s>',
 '<s>',
 '[verse-1]',
 '</s>',
 '<s>',
 'alright',
 ',',
 'i',
 'might',
 '</s>',
 '<s>',
 'have',
 'had',
 'a',
 'little',
 'glare',
 'when',
 'i',
 'stared',
 'at',
 'ya',
 'ho',
 '</s>',
 '<s>',
 'but',
 'i',
 'didn',
 "'t",
 'know',
 'she',
 'was',
 'like',
 'that',
 '</s>',
 '<s>',
 'she',
 'stared',
 'right',
 'back',
 '</s>',
 '<s>',
 'my',
 'niggas',
 'warnin',
 'me',
 'that',
 'she',
 'was',
 'comin',
 'on',
 'to',
 'me',
 '</s>',
 '<s>',
 'i',
 'react',
 'like',
 'a',
 'mack',
 'do',
 ',',
 'i',
 'act',
 'cool',
 '</s>',
 '<s>',
 'just',
 'to',
 'test',
 'her',
 ',',
 "'cause",
 'i',
 "'m",
 'no',
 'jester',
 '</s>',
 '<s>',
 'i',
 'suggest',
 'her',
 'and',
 'her',
 'friend',
 'be',
 'outtie',
 '</s>',
 '<s>',
 'because',
 'i',
 'don',
 "'t",
 'want',
 'to',
 'make',
 'my',
 'pal',
 'get',
 'rowdy',
 '</s>',
 '<s>',
 'and',
 'doubt',
 'me',
 ',',
 'our',
 'friendship',
 '</s>',
 '<s>',
 'but',
 'when',
 'lips',
 'touch',
 ',',
 'i',


## ULMFiT Language Model

In [10]:
from pathlib import Path

from fastai import *
from fastai.text import *

In [93]:
# Build the language-model data bunch from the token files under token_path.
lm_data_kwargs = dict(bs=128, max_vocab=10000)
data_lm = TextLMDataBunch.from_tokens(token_path, **lm_data_kwargs)
print(data_lm.train_ds.vocab_size)

10002


In [94]:
# Grab one training batch and decode its top-left 20x10 corner back to text.
x, y = next(iter(data_lm.train_dl))
example = x[:20, :10].cpu()
train_vocab = data_lm.train_ds.vocab
decoded_rows = []
for l in example:
    decoded_rows.append(train_vocab.textify(l).split(' '))
texts = pd.DataFrame(decoded_rows)
print(x.shape, y.shape)
texts

torch.Size([95, 128]) torch.Size([12160])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,<d>,la,</s>,ticking,<s>,us,<s>,</s>,it,bumper
1,<s>,vie,<s>,and,</s>,</s>,if,<s>,shines,inna
2,</s>,des,i,the,<s>,<s>,my,</s>,like,di
3,<s>,xxunk,know,fuse,talking,on,echoes,<s>,laser,air
4,</s>,",",what,is,off,some,can,jose,light,</s>
5,<s>,xxunk,we,burning,about,xxunk,reach,:,</s>,<s>
6,[verse-1],",",have,</s>,bro,shit,your,oh,<s>,and
7,</s>,j,is,<s>,</s>,(we,ears,listen,(on,jook
8,<s>,’suis,a,i,<s>,in,</s>,i,the,gal
9,alright,dans,lie,need,you,the,<s>,'m,run,(jook


Load the ULMFiT model architecture and create an embedding matrix that includes the new words. The new words are initialized to the mean value of all prior vocab...

TODO: maybe update the initialization points to the mean value of prior vocab that we keep in this model. e.g. average of the words that are in the lyrics corpus.

In [95]:
# Create the language-model learner, starting from the pretrained files named
# below, and display the resulting network.
pretrained_files = ['lstm_wt103', 'itos_wt103']
learn = RNNLearner.language_model(data_lm,
                                  pretrained_fnames=pretrained_files,
                                  drop_mult=0.5)
learn.model

SequentialRNN(
  (0): RNNCore(
    (encoder): Embedding(10002, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(10002, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1150)
      )
      (1): WeightDropout(
        (module): LSTM(1150, 1150)
      )
      (2): WeightDropout(
        (module): LSTM(1150, 400)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=10002, bias=True)
    (output_dp): RNNDropout()
  )
)

Take a glance at the new words that we've added to our vocabulary -- we add quite a bit. This is expected because this is such a specialized corpus.

In [96]:
# Compare the pretrained vocabulary with the lyrics vocabulary and list the
# tokens that only occur in the latter (in lyrics-vocabulary order).
with open('../data/imdb_sample/models/itos_wt103.pkl', 'rb') as f:
    pretrained_vocab = set(pickle.load(f))
lyrics_vocab = data_lm.train_ds.vocab.itos
new_words = [tok for tok in lyrics_vocab if tok not in pretrained_vocab]
new_words_id = data_lm.train_ds.vocab.numericalize(new_words)
print("New Vocab: ", len(new_words))
print(new_words[:20])

New Vocab:  3200
['<s>', '</s>', "'t", "'m", '<d>', '</d>', "'ve", 'gonna', '[chorus]', 'nigga', 'gotta', "'cause", '’t', '’s', '[hook]', '[verse-1]', '[verse-2]', '’m', "'all", "',"]


In [97]:
# Display the encoder embedding weights so they can be compared with the
# values shown after training.
learn.model.state_dict().get('0.encoder.weight')

tensor([[ 0.1810,  0.5646, -0.1060,  ...,  0.4994,  0.0770, -0.2302],
        [ 0.1604,  0.4524,  0.0713,  ..., -0.0967, -0.1500, -0.2420],
        [ 0.1753,  0.5136,  0.0942,  ..., -0.1172, -0.1426, -0.2452],
        ...,
        [ 0.1253,  0.3070, -0.0439,  ..., -0.2400, -0.1736, -0.2471],
        [ 0.3863,  0.1121, -0.0464,  ...,  0.1902, -0.1707, -0.1216],
        [ 0.6130,  0.5969, -0.0734,  ..., -0.0203, -0.0900, -0.2581]],
       device='cuda:0')

Fit one cycle, but keep all layers frozen except the linear encoder and decoder. Start with a relatively low learning rate.

In [98]:
# One epoch of one-cycle training with learning-rate argument 1e-2.
learn.fit_one_cycle(1, 1e-2)

VBox(children=(HBox(children=(IntProgress(value=0, max=1), HTML(value='0.00% [0/1 00:00<00:00]'))), HTML(value…

Total time: 04:01
epoch  train loss  valid loss  accuracy
0      4.169597    3.902785    0.330811  (04:01)



In [144]:
# Display the encoder embedding weights again to see how training changed them.
learn.model.state_dict().get('0.encoder.weight')

tensor([[ 0.2691,  0.6207, -0.3944,  ...,  0.5027,  0.2028, -0.2300],
        [ 0.2803,  0.9418, -0.1011,  ..., -0.3540, -0.0756, -0.4386],
        [-0.3792,  0.4529,  0.2847,  ..., -0.3854, -0.2198, -0.2500],
        ...,
        [ 0.2301,  0.5036, -0.1625,  ..., -0.2492, -0.0224, -0.2389],
        [ 0.5351,  0.1089, -0.0427,  ...,  0.1091, -0.1642, -0.0938],
        [ 0.5106,  0.5838, -0.0701,  ..., -0.0717,  0.0637, -0.2155]],
       device='cuda:0')

In [100]:
# Unfreeze the model, then run five more epochs of one-cycle training with
# learning-rate argument 1e-3.
learn.unfreeze()
learn.fit_one_cycle(5, 1e-3)

VBox(children=(HBox(children=(IntProgress(value=0, max=5), HTML(value='0.00% [0/5 00:00<00:00]'))), HTML(value…

Total time: 24:05
epoch  train loss  valid loss  accuracy
0      3.842185    3.609371    0.372844  (04:50)
1      3.633488    3.416342    0.394314  (04:48)
2      3.493614    3.282526    0.409824  (04:48)
3      3.388098    3.205011    0.418733  (04:47)
4      3.360687    3.182761    0.421562  (04:51)



In [147]:
# Checkpoint the learner under this name, then load that checkpoint back.
learn.save('10k-subset-custom-tok')
learn.load('10k-subset-custom-tok')

## Text Generation

In [101]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /home/syang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [401]:
m = learn.model
# Seed sequence: document start, two empty lines, then a "[verse]" header line.
# (Use s = ['<d>'] instead to let the model start the song from scratch.)
s = ['<d>', '<s>', '</s>', '<s>', '</s>',
     '<s>', '[verse]', '</s>']
# Numericalize the seed and shape it as a (seq_len, 1) column on the GPU.
# The tensor is used directly: torch.autograd.Variable is a deprecated no-op
# wrapper in the PyTorch versions fastai v1 runs on.
t = LongTensor(data_lm.train_ds.vocab.numericalize(s)).view(-1,1).cuda()

### Multinomial

In [402]:
# Generation by sampling each next token from the model's softmax distribution.
# The draw is done with numpy instead of torch.multinomial, which raised an
# exception here and forced a kernel restart.
N_TOKENS = 250  # maximum number of tokens to generate

m.reset()
m.eval()

# print starting seed (a dedicated loop name keeps the seed list `s` intact)
for seed_tok in t:
    to_print = data_lm.valid_ds.vocab.textify(seed_tok)
    if to_print == '</s>':
        to_print = '\n'
    print(to_print, end=' ')

# The seed is fed once; afterwards only the newly sampled token is fed.
# m.reset() above clears the model's hidden state and nothing clears it inside
# the loop, so re-feeding the whole growing sequence on every step would
# replay the earlier context on top of a state that already encodes it (and
# cost quadratic time).
# NOTE(review): relies on the fastai RNN core carrying its hidden state between
# forward calls -- confirm for the installed fastai version.
model_input = t

with torch.no_grad():  # inference only: do not build an autograd graph
    for i in range(N_TOKENS):
        # forward pass; the last row holds the scores for the next token
        res, *_ = m(model_input)
        r = res[-1]

        # set unk and pad to 0 prob
        r[0] = -np.inf
        r[1] = -np.inf

        # softmax, promoted to float64 (np.float was removed from NumPy)
        r2 = F.softmax(r, dim=0).cpu().numpy().astype(np.float64)
        r2 /= np.sum(r2)  # solve rounding issues for multinom function

        # draw multinom
        n = np.argmax(np.random.multinomial(1, r2))

        word = data_lm.valid_ds.vocab.textify([n])
        n = LongTensor([n]).view(-1, 1).cuda()
        t = torch.cat((t, n))  # keep the full generated sequence in `t`
        model_input = n
        if word == '</s>':
            word = '\n'
        print(word, end=' ')
        if word == '</d>': break

<d> <s> 
 <s> 
 <s> [verse] 
 <s> everything looks good like town 
 <s> but maybe every night i 'll be your man 
 <s> 
 <s> [chorus] 
 <s> my little girl is in a love song 
 <s> she 's been at home after midnight night 
 <s> a night long ago , honey 
 <s> 
 <s> [verse-3] 
 <s> well , she was ten feet tall than her 
 <s> child 's dress , but she was was looking back 
 <s> she mean you were a bad man ; she been and then she was dancing 
 <s> had to get her friends somethin ' too hard and she had trouble 
 <s> ain 't no place for me to hold her smiling 
 <s> my temperature was so warm that every day you stopped song 
 <s> so you should put me down and i 'll do something 
 <s> but never be well home 
 <s> 
 <s> [pre-hook] 
 <s> (it 's all so alright 
 <s> this one is a grown man <s> 
 <s> [outro] 
 <s> well i 'm sure 
 <s> i 'll be climbing saturday night 
 <s> 
 <s> shining in high heels 
 <s> shining up in a cradle 
 <s> i 'm smile 
 <s> i know that you 'll find me 
 <s> there 's a key 


## Export Embedding Matrix

In [356]:
# Pull the encoder's embedding weights to the CPU as a NumPy array.
encoder_weight = learn.model.state_dict().get('0.encoder.weight')
embed = encoder_weight.cpu().numpy()
embed

array([[ 0.26910752,  0.6206992 , -0.39436367, ...,  0.5027352 ,
         0.202811  , -0.2299545 ],
       [ 0.28033113,  0.9418297 , -0.10114889, ..., -0.35397708,
        -0.07556843, -0.43856415],
       [-0.37922502,  0.4529314 ,  0.2846954 , ..., -0.3854074 ,
        -0.2197569 , -0.24996242],
       ...,
       [ 0.23007387,  0.5036217 , -0.1624992 , ..., -0.24915165,
        -0.02237413, -0.2388532 ],
       [ 0.535128  ,  0.10889628, -0.04272555, ...,  0.10906475,
        -0.16416293, -0.09379104],
       [ 0.5106017 ,  0.5837977 , -0.07005332, ..., -0.07169558,
         0.06370288, -0.21551183]], dtype=float32)

In [375]:
# Write the embedding vectors as tab-separated values, one row per vocabulary
# entry, without the index labels and without a header row.
embeddings_file = '../data/models/embeddings.csv'
df_embed = pd.DataFrame(data=embed, index=data_lm.train_ds.vocab.itos)
df_embed.to_csv(embeddings_file, sep='\t', header=False, index=False)

In [378]:
# Write the vocabulary tokens, one per line and in vocabulary order, so each
# line labels the corresponding row of the embeddings file.
tokens_file = '../data/models/embeddings_meta.csv'
df_meta = pd.DataFrame({'token': data_lm.train_ds.vocab.itos})
df_meta.to_csv(tokens_file, sep='\t', index=False, header=False)

In [4]:
# Scratch cell: a hand-copied 6x6 array of embedding values (the corner values
# visible in the truncated `embed` printout), displayed but not assigned to
# any name.
np.array([[ 0.26910752,  0.6206992 , -0.39436367, 0.5027352 ,
         0.202811  , -0.2299545 ],
       [ 0.28033113,  0.9418297 , -0.10114889, -0.35397708,
        -0.07556843, -0.43856415],
       [-0.37922502,  0.4529314 ,  0.2846954 , -0.3854074 ,
        -0.2197569 , -0.24996242],
       [ 0.23007387,  0.5036217 , -0.1624992 , -0.24915165,
        -0.02237413, -0.2388532 ],
       [ 0.535128  ,  0.10889628, -0.04272555,  0.10906475,
        -0.16416293, -0.09379104],
       [ 0.5106017 ,  0.5837977 , -0.07005332, -0.07169558,
         0.06370288, -0.21551183]])

array([[ 0.26910752,  0.6206992 , -0.39436367,  0.5027352 ,  0.202811  ,
        -0.2299545 ],
       [ 0.28033113,  0.9418297 , -0.10114889, -0.35397708, -0.07556843,
        -0.43856415],
       [-0.37922502,  0.4529314 ,  0.2846954 , -0.3854074 , -0.2197569 ,
        -0.24996242],
       [ 0.23007387,  0.5036217 , -0.1624992 , -0.24915165, -0.02237413,
        -0.2388532 ],
       [ 0.535128  ,  0.10889628, -0.04272555,  0.10906475, -0.16416293,
        -0.09379104],
       [ 0.5106017 ,  0.5837977 , -0.07005332, -0.07169558,  0.06370288,
        -0.21551183]])

In [5]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools

from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold
from sklearn.utils import check_random_state

ModuleNotFoundError: No module named 'plotly'

In [None]:
# Project the embedding matrix to three dimensions with t-SNE; the transpose
# gives one row per output dimension (x, y, z).
tsne = manifold.TSNE(n_components=3, init='pca', random_state=0)
embed_3d = tsne.fit_transform(embed)
trans_data = embed_3d.T

In [None]:
# 3-D scatter of the t-SNE projection, one marker per embedding row.
# Markers are coloured by row index using a built-in plotly colour scale.
# The previous `color=x, colorscale=cmap` could not work: `cmap` is never
# defined in this notebook and `x` is the training batch fetched earlier, not
# a per-point value.
point_index = np.arange(trans_data.shape[1])
p1 = go.Scatter3d(x=trans_data[0], y=trans_data[1], z=trans_data[2],
                  mode='markers',
                  marker=dict(color=point_index,
                              colorscale='Viridis',
                              showscale=False,
                              line=dict(color='black', width=1)))
# Tight margins so the plot fills the output area.
layout=dict(margin=dict(l=10, r=10,
                        t=30, b=10)
           )

fig = go.Figure(data=[p1], layout=layout)

py.iplot(fig)

In [None]:
# Display the figure again (same call as at the end of the previous cell).
py.iplot(fig)