In [1]:
import pandas as pd
import numpy as np
import nltk.tokenize
import itertools

## Training Data

In [2]:
# Load (msd_id, lyrics) pairs and keep only the songs whose lyrics are
# shorter than 5000 characters.
df = (
    pd.read_csv('https://storage.googleapis.com/w210-capstone/data/lyrics-valid.csv',
                header=None,
                escapechar='\\',
                names=['msd_id', 'lyrics'])
    .loc[lambda songs: songs.lyrics.str.len() < 5000]
)
df.head()

Unnamed: 0,msd_id,lyrics
0,TRAADJU128F92F58E1,I hear you praying with your hands clasped ove...
1,TRAADQX128F422B4CF,If you ever make it back to Nashville\nRemembe...
2,TRAAFTE128F429545F,Just when I thought I was safe\nYou found me i...
3,TRAAKAG128F4275D2A,Paroles de la chanson Sultao Das Matas :\nSult...
4,TRAAMRO128F92F20D7,From What You Whispered\n........................


### Tokenization

1. First consider each line its own "sentence", keeping track of blanklines
2. Regexp Tokenizer with the following:  
 - Bracket enclosed texts (usually song part header)
 - All words
 - Any numeric -- keep commas and periods together
 - All other non-whitespace characters
3. Wrap each line with `<s>` and `</s>` tokens
4. Wrap each song with `<d>` and `</d>` tokens (documents)

In [4]:
def tokenize_lyrics(lyrics):
    """Tokenize one song's lyrics into a flat list of lower-cased tokens.

    Each physical line of ``lyrics`` is treated as a sentence (blank lines are
    kept, so they show up as an empty ``<s> </s>`` pair). Every line is wrapped
    in ``<s>`` / ``</s>`` and the whole song in ``<d>`` / ``</d>``.

    Within a line, tokens are matched in this priority order:
      1. bracket-enclosed text such as ``[Verse 1]`` (becomes ``[verse-1]``),
      2. numbers with internal ``,`` / ``.`` separators such as ``1,000.50``,
      3. runs of word characters,
      4. runs of digits, commas and periods (e.g. ``...``),
      5. any other run of non-whitespace characters.

    Args:
        lyrics: the raw lyrics of a single song as one string.

    Returns:
        list of str: the tokens, lower-cased, with spaces inside a token
        replaced by ``-``.
    """
    lines = nltk.tokenize.LineTokenizer(blanklines='keep').tokenize(lyrics)

    # Regex alternatives are tried left to right and \w also matches digits,
    # so the separated-number pattern must come BEFORE \w+; otherwise "1,000"
    # is split into "1" and ",000" instead of being kept together.
    # NOTE(review): the trailing \S+ still glues a leading quote to the next
    # word (e.g. '"keep'); left as is because contraction tokens such as "'t"
    # rely on the same alternative.
    re_tk = nltk.tokenize.RegexpTokenizer(
        r'\[[^\]]+\]|\d+(?:[.,]\d+)+|\w+|[\d\.,]+|\S+',
        discard_empty=False)

    # Build the flat token stream explicitly instead of mutating the per-line
    # lists through throw-away list comprehensions.
    flat = ['<d>']
    for line_tokens in re_tk.tokenize_sents(lines):
        flat.append('<s>')
        flat.extend(line_tokens)
        flat.append('</s>')
    flat.append('</d>')

    # lower case and de-space
    return [w.lower().replace(' ', '-') for w in flat]

In [5]:
# Tokenize every song and record how many tokens each one produced.
# `assign` returns a new frame instead of writing a column into `df` in place:
# `df` is a filtered slice of the loaded frame, and in-place column assignment
# on such a slice triggers pandas' SettingWithCopyWarning.
df = df.assign(tokd=df.lyrics.apply(tokenize_lyrics))
df = df.assign(tokd_len=df.tokd.apply(len))
df.head()

Unnamed: 0,msd_id,lyrics,tokd,tokd_len
0,TRAADJU128F92F58E1,I hear you praying with your hands clasped ove...,"[<d>, <s>, i, hear, you, praying, with, your, ...",215
1,TRAADQX128F422B4CF,If you ever make it back to Nashville\nRemembe...,"[<d>, <s>, if, you, ever, make, it, back, to, ...",196
2,TRAAFTE128F429545F,Just when I thought I was safe\nYou found me i...,"[<d>, <s>, just, when, i, thought, i, was, saf...",186
3,TRAAKAG128F4275D2A,Paroles de la chanson Sultao Das Matas :\nSult...,"[<d>, <s>, paroles, de, la, chanson, sultao, d...",58
4,TRAAMRO128F92F20D7,From What You Whispered\n........................,"[<d>, <s>, from, what, you, whispered, </s>, <...",310


For now, save the same tokens as both the train and the validation set.

In [3]:
# Directory (relative to the notebook's working directory) where the token
# arrays are written and from which the language-model data is built.
token_path = '../data/interim/'

In [None]:
# Write the same token array under both the train and the valid file names.
tok_test = np.array(df['tokd'])
for fname in ('/train_tok.npy', '/valid_tok.npy'):
    np.save(token_path + fname, tok_test)

In [7]:
# Peek at the first 35 tokens of the first song.
tok_test[0][0:35]

['<d>',
 '<s>',
 'i',
 'hear',
 'you',
 'praying',
 'with',
 'your',
 'hands',
 'clasped',
 'over',
 'your',
 'chest',
 '</s>',
 '<s>',
 'i',
 'hear',
 'men',
 'slaying',
 'while',
 'they',
 'say',
 '"keep',
 'doing',
 'your',
 'best',
 '"',
 '</s>',
 '<s>',
 'i',
 'hear',
 'the',
 'laughter',
 'of',
 'someone']

## ULMFiT Language Model

In [1]:
from pathlib import Path

from fastai import *
from fastai.text import *

ValueError: cymem.cymem.Pool has the wrong size, try recompiling. Expected 64, got 48

In [None]:
# Build the language-model data bunch from the token files under token_path,
# using batch size 128 and a vocabulary capped via max_vocab=10000.
# NOTE(review): presumably this picks up the train_tok.npy / valid_tok.npy
# files saved earlier — confirm against the installed fastai version.
data_lm = TextLMDataBunch.from_tokens(token_path,
                                      bs=128,
                                      max_vocab=10000)
print(data_lm.train_ds.vocab_size)

In [4]:
# Pull one batch from the training loader and decode a small corner of it.
x, y = next(iter(data_lm.train_dl))
print(x.shape, y.shape)

example = x[:20, :10].cpu()
texts = pd.DataFrame([
    data_lm.train_ds.vocab.textify(row).split(' ')
    for row in example
])
texts

torch.Size([95, 128]) torch.Size([12160])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,<d>,</s>,'s,[outro],will,</s>,so,<s>,won,my
1,<s>,<s>,sad,</s>,destroy,<s>,i,i,'t,soul
2,i,i,but,<s>,her,i,fight,said,deal,</s>
3,hear,'ll,you,i,</s>,'ve,</s>,",",</s>,</d>
4,you,do,seem,'ll,<s>,been,<s>,it,<s>,<d>
5,praying,anything,better,never,so,takin,i,'s,i,<s>
6,with,for,when,know,i,',cannot,my,won,xxunk
7,your,you,i,</s>,'m,all,shake,life,'t,xxunk
8,hands,",",'m,<s>,giving,the,from,</s>,change,em
9,xxunk,show,gone,i,up,blame,my,<s>,",",busca


Load the ULMFiT model architecture and create an embedding matrix that includes the new words. The new words are initialized to the mean value of all prior vocab...

TODO: maybe update the initialization points to the mean value of prior vocab that we keep in this model. e.g. average of the words that are in the lyrics corpus.

In [4]:
# Create the language-model learner on data_lm with drop_mult=0.5.
# NOTE(review): the pretrained_fnames argument is commented out, so no
# pretrained weights/vocab are passed here — presumably the model starts from
# a fresh initialisation; confirm this is intended.
learn = RNNLearner.language_model(data_lm,
                                  #pretrained_fnames=['lstm_wt103', 'itos_wt103'],
                                  drop_mult=0.5)
learn.model

SequentialRNN(
  (0): RNNCore(
    (encoder): Embedding(10002, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(10002, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1150)
      )
      (1): WeightDropout(
        (module): LSTM(1150, 1150)
      )
      (2): WeightDropout(
        (module): LSTM(1150, 400)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=10002, bias=True)
    (output_dp): RNNDropout()
  )
)

Take a glance at the new words that we've added to our vocabulary -- we add quite a bit. This is expected because this is such a specialized corpus.

In [11]:
# Compare our vocabulary against the pickled reference vocabulary
# (presumably the pretrained WikiText-103 word list, judging by the file name)
# and list the tokens that only exist in ours.
# Only unpickle files from a trusted source: pickle can execute arbitrary code.
with open('../data/imdb_sample/models/itos_wt103.pkl', 'rb') as f:
    aa = set(pickle.load(f))
bb = data_lm.train_ds.vocab.itos
new_words = list(filter(lambda w: w not in aa, bb))
new_words_id = data_lm.train_ds.vocab.numericalize(new_words)
print("New Vocab: ", len(new_words))
print(new_words[:20])

FileNotFoundError: [Errno 2] No such file or directory: '../data/imdb_sample/models/itos_wt103.pkl'

In [5]:
# Show the encoder embedding weights as they are prior to the fit call below.
learn.model.state_dict().get('0.encoder.weight')

tensor([[-0.0314, -0.0846, -0.0673,  ..., -0.0922, -0.0721, -0.0338],
        [ 0.0595, -0.0232,  0.0071,  ..., -0.0663, -0.0444,  0.0865],
        [-0.0644,  0.0233, -0.0846,  ..., -0.0684, -0.0425,  0.0267],
        ...,
        [-0.0347,  0.0805, -0.0097,  ...,  0.0473,  0.0058, -0.0504],
        [ 0.0009,  0.0383, -0.0344,  ...,  0.0696, -0.0689,  0.0592],
        [ 0.0415,  0.0318, -0.0593,  ...,  0.0845, -0.0466, -0.0619]],
       device='cuda:0')

Fit one cycle, but keep all layers frozen except the linear encoder and decoder. Start with a relatively low learning rate.

In [13]:
# One-cycle fit: cycle length 1, learning-rate argument 1e-2.
learn.fit_one_cycle(1, 1e-2)

VBox(children=(HBox(children=(IntProgress(value=0, max=1), HTML(value='0.00% [0/1 00:00<00:00]'))), HTML(value…

Total time: 19:12
epoch  train loss  valid loss  accuracy
0      3.142067    3.021067    0.429976  (19:12)



In [14]:
# Show the encoder embedding weights again, now after the first fit.
learn.model.state_dict().get('0.encoder.weight')

tensor([[ 0.2809,  0.0228,  0.0205,  ...,  0.0531, -0.2897,  0.5122],
        [ 0.3141,  0.1556, -1.2372,  ...,  0.8679, -0.3641,  0.7379],
        [-1.3342,  0.1629, -0.1386,  ...,  0.7200, -0.1327,  0.5287],
        ...,
        [-0.0839, -0.2861, -1.0583,  ...,  0.1260, -0.8269,  1.5154],
        [ 0.4406, -0.5391, -1.1914,  ..., -0.2362, -2.1606,  1.1331],
        [ 0.9451,  0.8559, -2.0430,  ..., -0.0076, -1.3519,  1.7898]],
       device='cuda:0')

In [None]:
# Unfreeze the learner, then run a longer one-cycle fit
# (cycle length 10, learning-rate argument 1e-3).
learn.unfreeze()
learn.fit_one_cycle(10, 1e-3)

VBox(children=(HBox(children=(IntProgress(value=0, max=10), HTML(value='0.00% [0/10 00:00<00:00]'))), HTML(val…

In [27]:
# Checkpoint the learner under the name '100k-custom-tok'.
learn.save('100k-custom-tok')

In [8]:
# Restore the checkpoint saved as '100k-custom-tok'.
learn.load('100k-custom-tok')

## Text Generation

In [17]:
# Fetch NLTK's 'punkt' tokenizer data and import word_tokenize.
# NOTE(review): word_tokenize is not used in any cell visible here — confirm
# whether this cell is still needed.
import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /home/j_rosen_1392/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
def generate_text(learner, seed_text=['<d>', '<s>', '</s>', '<s>', '</s>','<s>', '[verse]', '</s>'], max_len=2500):
    """Sample text from the learner's language model and print it token by token.

    The model is primed with ``seed_text``; after that one token at a time is
    drawn from the predicted distribution and printed, until ``</d>`` is drawn
    or ``max_len`` tokens have been produced.

    Args:
        learner: learner exposing ``.model`` (with ``reset()``) and ``.data``
            (whose ``train_ds.vocab`` offers ``numericalize`` / ``textify``).
        seed_text: list of tokens used to prime the model. It is only read,
            never mutated.
        max_len: maximum number of tokens to sample.

    Returns:
        None. The sampled tokens are printed separated by spaces, with
        ``</s>`` rendered as a newline.
    """
    model = learner.model
    # Use the learner's own vocabulary instead of the notebook-global data_lm,
    # and the same vocab for encoding and decoding.
    vocab = learner.data.train_ds.vocab

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # reset hidden state
    model.reset()
    model.eval()

    # First forward pass consumes the whole seed.
    next_input = LongTensor(vocab.numericalize(seed_text)).view(-1, 1).to(device)

    # Sampling only: no autograd graph is needed.
    with torch.no_grad():
        for _step in range(max_len):
            # forward pass
            result, *_rest = model(next_input)
            logits = result[-1]

            # never pick indices 0 and 1 (unknown / padding tokens)
            logits[0] = -np.inf
            logits[1] = -np.inf

            # softmax; np.float64 replaces the removed np.float alias
            probs = F.softmax(logits, dim=0).cpu().numpy().astype(np.float64)
            probs /= np.sum(probs)  # solve rounding issues for multinomial

            # draw one token from the multinomial distribution
            token_index = int(np.argmax(np.random.multinomial(1, probs)))
            word = vocab.textify([token_index])

            # The model keeps its hidden state between calls (that is what
            # reset() clears), so only the newly sampled token is fed next.
            # Re-feeding the whole sequence would process the context twice.
            next_input = LongTensor([token_index]).view(-1, 1).to(device)

            if word == '</s>':
                word = '\n'
            print(word, end=' ')
            if word == '</d>':
                break
    

In [10]:
# Sample a song with the default seed tokens and length limit.
generate_text(learn)

<s> and i will move 
 <s> far from left and giving up 
 <s> it was that for sure 
 <s> and i , that you believe 
 <s> we 're like a bird 
 <s> and there 's something 
 <s> you 've seen that 
 <s> the best things 
 <s> are true 
 <s> if i could 
 <s> i would have listened 
 <s> erase the regret 
 <s> 
 <s> i had some ways 
 <s> for the oh , the man 
 <s> that left 're through 
 <s> so , let 's run 
 <s> 
 <s> back again 
 <s> pushed away 
 <s> i 'm out of time 
 <s> out of my thoughts 
 <s> you said i told you 
 <s> still saying 
 <s> 
 <s> [chorus] 
 <s> never fall so down 
 <s> i 'll never be away to losing you 
 <s> so glad that i found you 
 <s> all you ever wanted was you 
 <s> only in my dreams through the miles 
 <s> could it be what you wanted ? 
 <s> all i love was tenderly 
 <s> not now , now , now 
 <s> 
 <s> i do ’d like to know 
 <s> if i called it 
 <s> i tell that you would 
 <s> i wish that i wasn 't meant to 
 <s> yeah , the feelings and the days i 'd change 
 <s> 
 <s>

## Export Embedding Matrix

In [26]:
# Copy the encoder embedding weights to host memory as a NumPy array.
encoder_weights = learn.model.state_dict().get('0.encoder.weight')
embed = encoder_weights.cpu().numpy()
embed

array([[ 3.27236950e-01,  7.88899288e-02,  1.41409770e-01, ...,
         1.18825608e-03, -2.78300911e-01,  2.75513470e-01],
       [ 3.01054031e-01,  2.12108001e-01, -1.18636227e+00, ...,
         8.79542947e-01, -3.23059350e-01,  7.22483337e-01],
       [-1.08073759e+00,  1.05993815e-01,  3.97628546e-02, ...,
         7.56273389e-01, -1.93531737e-01,  4.99946654e-01],
       ...,
       [ 1.91946849e-02,  8.63057375e-03, -5.99980175e-01, ...,
        -1.33894552e-02, -6.13913774e-01,  8.61503780e-01],
       [ 2.98022062e-01, -1.85456827e-01, -8.70638669e-01, ...,
         7.70014673e-02, -1.51546502e+00,  8.48747432e-01],
       [ 6.35387242e-01,  5.96060991e-01, -1.86128807e+00, ...,
        -1.65299833e-01, -1.06414533e+00,  1.37599111e+00]], dtype=float32)

In [375]:
# One row of embedding values per vocabulary token, written tab-separated
# with neither the index nor a header row.
df_embed = pd.DataFrame(embed, index=data_lm.train_ds.vocab.itos)
df_embed.to_csv('../data/models/embeddings.csv', sep='\t', header=False, index=False)

In [378]:
# Matching metadata file: one vocabulary token per line, in the same order as
# the rows of the embedding matrix.
df_meta = pd.DataFrame({'token': data_lm.train_ds.vocab.itos})
df_meta.to_csv('../data/models/embeddings_meta.csv', sep='\t', index=False, header=False)