# Language Model with ELMo Embeddings

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import keras
import keras.layers as layers
from keras.models import Model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import pandas as pd
import numpy as np

## Training Data

In [3]:
df = pd.read_csv('../data/interim/subset-10k/genius_lyrics.csv')
# drop lyrics >5000
df = df[df.lyrics.str.len() < 5000]
df.head()

Unnamed: 0,msd_id,lyrics
0,TRAAAAW128F429D538,"\n\n[Verse 1]\nAlright, I might\nHave had a li..."
1,TRAAABD128F429CF47,"\n\nDarling, I don't know much\nBut I know I l..."
2,TRAAADZ128F9348C2E,\n\nSiento una pena muy honda dentro del alma\...
3,TRAAAEF128F4273421,\n\nAdam Ant/Marco Pirroni\nEvery girl is a so...
4,TRAAAFD128F92F423A,"\n\nI've just erased it's been a while, I've g..."


In [4]:
test_song = df.lyrics[0]
test_songs = df.lyrics[:2]

### Tokenization

1. First consider each line its own "sentence", keeping track of blanklines
2. Regexp Tokenizer with the following:  
 - Bracket enclosed texts (usually song part header)
 - All words
 - Any numeric -- keep commas and periods together
 - All other non-whitespace characters
3. Wrap each line with `<s>` and `</s>` tokens
4. Wrap each song with `<d>` and `</d>` tokens (documents)

In [5]:
import nltk.tokenize
import itertools

In [6]:
def tokenize_lyrics(lyrics):
    tk = nltk.tokenize.LineTokenizer(blanklines='keep')
    tokd = tk.tokenize(lyrics)
    
    re_tk = nltk.tokenize.RegexpTokenizer(r'\[[^\]]+\]|\w+|[\d\.,]+|\S+',
                                          discard_empty=False)
    re_tokd = re_tk.tokenize_sents(tokd)
    
    [s.insert(0, '<s>') for s in re_tokd] # insert start token for each line
    [s.append('</s>') for s in re_tokd] # append end token for each line
    
    flat = list(itertools.chain(*re_tokd))
    flat.insert(0, '<d>')
    flat.append('</d>')
    flat = [w.lower() for w in flat]
    return flat

In [7]:
df['tokd'] = df.lyrics.apply(tokenize_lyrics)
df['tokd_len'] = df.tokd.apply(len)
df.head()

Unnamed: 0,msd_id,lyrics,tokd,tokd_len
0,TRAAAAW128F429D538,"\n\n[Verse 1]\nAlright, I might\nHave had a li...","[<d>, <s>, </s>, <s>, </s>, <s>, [verse 1], </...",651
1,TRAAABD128F429CF47,"\n\nDarling, I don't know much\nBut I know I l...","[<d>, <s>, </s>, <s>, </s>, <s>, darling, ,, i...",361
2,TRAAADZ128F9348C2E,\n\nSiento una pena muy honda dentro del alma\...,"[<d>, <s>, </s>, <s>, </s>, <s>, siento, una, ...",127
3,TRAAAEF128F4273421,\n\nAdam Ant/Marco Pirroni\nEvery girl is a so...,"[<d>, <s>, </s>, <s>, </s>, <s>, adam, ant, /m...",322
4,TRAAAFD128F92F423A,"\n\nI've just erased it's been a while, I've g...","[<d>, <s>, </s>, <s>, </s>, <s>, i, 've, just,...",181


In [31]:
tok_test = list(df.tokd[:5])
tok_len_test = list(df.tokd_len[:5])
np.save('../data/interim/subset-10k/tok_test/train_tok.npy', np.asarray(tok_test))
np.save('../data/interim/subset-10k/tok_test/valid_tok.npy', np.asarray(tok_test))
tok_len_test

[651, 361, 127, 322, 181]

In [25]:
# tok_test2 = [s[:100] for s in tok_test]
# np.save('../data/interim/subset-10k/tok_test/train_tok.npy', np.asarray(tok_test2))
# np.save('../data/interim/subset-10k/tok_test/valid_tok.npy', np.asarray(tok_test2))
# print(tok_test2[1])

['<d>', '<s>', '</s>', '<s>', '</s>', '<s>', 'darling', ',', 'i', 'don', "'t", 'know', 'much', '</s>', '<s>', 'but', 'i', 'know', 'i', 'love', 'you', 'so', 'much', '</s>', '<s>', 'my', 'life', 'depends', 'on', 'your', 'touch', '</s>', '<s>', '</s>', '<s>', 'and', 'my', 'love', 'is', 'a', 'river', 'running', 'soul', 'deep', '</s>', '<s>', 'way', 'down', 'inside', 'me', ',', 'it', "'s-a", 'soul', 'deep', '</s>', '<s>', 'it', "'s", 'too', 'big', 'to', 'hide', '</s>', '<s>', 'and', 'it', 'can', "'t", 'be', 'denied', '</s>', '<s>', 'my', 'love', 'is', 'a', 'river', 'running', 'soul', 'deep', '</s>', '<s>', '</s>', '<s>', 'i', "'ll", 'work', 'myself', 'to', 'death', 'for', 'you', '</s>', '<s>', 'just', 'to', 'show', 'i', 'adore']


## ULMFiT Language Model

In [5]:
from pathlib import Path

from fastai import *
from fastai.docs import *
from fastai.text import *

In [32]:
# db = text_data_from_tokens(Path('../data/interim/subset-10k/tok_test'), max_vocab=1000, min_freq=0)
db = text_data_from_tokens(Path('../data/interim/subset-10k/tok_test'), data_func=lm_data, max_vocab=1000, min_freq=0)
# db.train_ds.clear() # this is buggy
# # not sure where this should be checked but sometimes re-creating doesn't do anything because it auto caches

Numericalizing train.
Numericalizing valid.


In [33]:
list(iter(db.train_dl))

[]

In [20]:
x,y = next(iter(db.train_dl))
example = x[:20,:10].cpu()
texts = pd.DataFrame([db.train_ds.vocab.textify(l).split(' ') for l in example])
texts

StopIteration: 

In [334]:
learn = RNNLearner.language_model(db, pretrained_fnames=['lstm_wt103', 'itos_wt103'], drop_mult=0.5)
learn.fit_one_cycle(1, 1e-2)

VBox(children=(HBox(children=(IntProgress(value=0, max=1), HTML(value='0.00% [0/1 00:00<00:00]'))), HTML(value…

  warn("You generator is empty.")


ValueError: not enough values to unpack (expected at least 1, got 0)

<b>The problem:</b> `text_data_from_tokens` fails to create a DataLoader when used in conjuction with `lm_data` as `data_func`. When the default `data_func` is used, DataLoaders are properly created. Also, `lm_data` works properly when used with `text_data_from_csv`.

Temporary solution is to use the full fastai pipeline for tokenizing and numericalizing... Will need to work out how to fix the above for custom tokenization or spend time seeing if we can do our custom tokenization using fastai pipeline.

In [94]:
asdf = pd.read_csv('../data/interim/subset-10k/csv_test/train.csv', header=None)
asdf[asdf.columns[0]] = len(asdf) * [0]
asdf.head()
asdf.to_csv('../data/interim/subset-10k/csv_test/train.csv', index=False, header=None)

In [6]:
# db_csv = text_data_from_csv(Path('../data/interim/subset-10k/csv_test'),
#                             data_func=lm_data, bs=256,
#                             max_vocab=1000,
#                             min_freq=2)
db_csv = text_data_from_csv(Path('../data/interim/subset-10k/csv_test'),
                            data_func=lm_data, bs=512,
                            max_vocab=1000,
                            min_freq=0)

In [28]:
x,y = next(iter(db_csv.train_dl))
x.shape

torch.Size([72, 256])

In [150]:
# # download_wt103_model() ## this doesn't work.. directory is kind of arbitrary
# model_path = db.path/'models'
# os.makedirs(model_path, exist_ok=True)
# download_url('http://files.fast.ai/models/wt103_v1/lstm_wt103.pth', model_path/'lstm_wt103.pth')
# download_url('http://files.fast.ai/models/wt103_v1/itos_wt103.pkl', model_path/'itos_wt103.pkl')

HBox(children=(IntProgress(value=0, max=221972701), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1027972), HTML(value='')))

In [7]:
learn = RNNLearner.language_model(db_csv, pretrained_fnames=['lstm_wt103', 'itos_wt103'], drop_mult=0.5)
# learn.load('10k-subset-fit-once')

In [29]:
# learn.model

In [8]:
learn.fit(1, 1e-2)

VBox(children=(HBox(children=(IntProgress(value=0, max=1), HTML(value='0.00% [0/1 00:00<00:00]'))), HTML(value…

Total time: 12:49
epoch  train loss  valid loss  accuracy
0      3.222739    3.059252    0.338371  (12:49)



In [13]:
learn.unfreeze()
learn.fit_one_cycle(4, 1e-3)

VBox(children=(HBox(children=(IntProgress(value=0, max=4), HTML(value='0.00% [0/4 00:00<00:00]'))), HTML(value…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [16]:
learn.save('10k-subset-5-epoch')

In [14]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /home/syang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [42]:
m = learn.model
s = word_tokenize("love")
t = LongTensor(db_csv.train_ds.vocab.numericalize(s)).view(-1,1).cuda()
t = torch.autograd.Variable(t)

In [43]:
m.eval()
for i in range(200):
    res, *_ = m(t)
    n = res[-1].topk(2)[1]
    n = n[1] if n.data[0]==0 else n[0]
    word = db_csv.valid_ds.vocab.textify([n.item()])
    print(word, end=' ')
    if word == '<eos>': break
    t = torch.cat((t, n.unsqueeze(0).unsqueeze(0)))

us , and we are the first to be so different , the first to be a part of us , the first to be a part of us , the first to be a part of us , the first to be a part of us , the first to be a part of us , the first to be a part of us , then the first to be done , the first to be done , the first to come , the first , that is to say , the first to come , the first , that is to say , the first , that is to say , the first , that is to say , the first , that is to say , the person to whom we have been so familiar to us , and who is the first to know us , the first to be so different from the first , who , in the first place , have not the same name , as in the first , because we have not been able to say that we are not in the same state of mind as in our own 

In [44]:
# generation with multinomial
# there is some bug with torch library with torch.multinomial
# that throws and exception and make you restart the kernel
# reverting to numpy

m.eval()
for i in range(100):
    res, *_ = m(t)
    # r = np.asarray(res[-1].detach().cpu())
    r = res[-1]
    sm = torch.nn.Softmax(dim=0)
    r = sm(r)
    r = np.asarray(r.detach().cpu())
    n = np.argmax(np.random.multinomial(1, r))
    word = db_csv.valid_ds.vocab.textify([n])
    n = LongTensor([n]).view(-1, 1).cuda()
    print(word, end=' ')
    if word == '<eos>': break
    t = torch.cat((t, n))

, for that matter , which makes a xxunk impression , you must at least xxunk it . now one can think of him only of one ’s xxunk , make him come to a certain xxunk , yet he must not be xxunk by us or even in his xxunk for having always followed him , for he is not worth the trouble , but never one ’s eyes , when one does hand him over , as always to have a certain xxunk of xxunk over one ’s face , of a single small xxunk , xxunk for 

In [27]:
learn.pred_batch()

AttributeError: 'tuple' object has no attribute 'detach'

In [81]:
# learn.save('10k-subset-fit-once')

In [9]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /home/syang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
from nltk import word_tokenize
m = learn.model
s = word_tokenize('what')
t = LongTensor(db_csv.train_ds.vocab.numericalize(s)).view(-1,1).cuda()

In [11]:
# db_csv.train_ds.vocab.textify(t)
res,*_ = m(t)

AttributeError: 'RNNCore' object has no attribute 'hidden'

In [180]:
from nltk import word_tokenize
m = learn.model
s = word_tokenize('<s> what the hell is happening </s> <s> should i put in longer seeds </s>')
t = LongTensor(db.train_ds.vocab.numericalize(s)).view(-1,1).cuda()

In [181]:
for i in range(50):
    res,*_ = m(t)
    n=res[-1].topk(5)[1]
    n = n[1] if n.data[0]==0 else n[0]
    word = db.train_ds.vocab.itos[n.item()]
    print(word, end=' ')
    if word == '<eos>': break
    t = torch.cat((t, n.unsqueeze(0).unsqueeze(0)))

i 't hide girl why schitzo <s> 't hide girl why schitzo <s> 't hide girl why schitzo <s> 't hide girl why schitzo <s> 't hide girl why schitzo <s> 't hide girl why schitzo <s> 't hide girl why schitzo <s> 't hide girl why schitzo <s> 't 

In [179]:
res,*_ = m(t)
n=res[-1].topk(5)[1]
n = n[1] if n.data[0]==0 else n[0]
n

tensor(830, device='cuda:0')

## ELMo Embeddings (TensorFlow Hub)

In [15]:
sess = tf.Session()


In [18]:
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

writer = tf.summary.FileWriter('../../tmp', sess.graph)

In [46]:
embeddings = elmo(
inputs={
    'tokens': tok_test2,
    'sequence_len': tok_len_test
},
signature="tokens",
as_dict=True)["elmo"]

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [47]:
embeddings

<tf.Tensor 'module_2_apply_tokens_1/aggregation/mul_3:0' shape=(5, 100, 1024) dtype=float32>

In [52]:
def ElmoEmbedding(tokens, sequence_length):
    return elmo(inputs={'tokens': tokens,'sequence_len': sequence_length},
                signature="tokens",
                as_dict=True)["default"]

In [60]:
input_tokens = layers.Input(shape=(100,), dtype=tf.string)
input_length = layers.Input(shape=(1,), dtype=tf.int32)
embedding = layers.Lambda(ElmoEmbedding, output_shape=(1024,),
                          arguments={'tokens': input_tokens, 'sequence_length': input_length})
dense = layers.Dense(256, activation='relu')(embedding)
pred = layers.Dense(100, activation='softmax')(dense)

model = Model(inputs=[input_text], outputs=pred)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[keras.metrics.categorical_accuracy])
model.summary()

ValueError: Layer dense_4 was called with an input that isn't a symbolic tensor. Received type: <class 'keras.layers.core.Lambda'>. Full input: [<keras.layers.core.Lambda object at 0x7f4e3ac66780>]. All inputs to the layer should be tensors.

In [43]:
nlp = spacy.load('en')

In [77]:
from spacy.tokenizer import Tokenizer

tokenizer = Tokenizer(nlp.vocab)

tks = tokenizer('this is a sentence')

In [76]:
tks.to_array(spacy.attrs.value)

array([14626626061804382878, 14626626061804382878, 14626626061804382878,
       14626626061804382878], dtype=uint64)

<tf.Tensor 'strided_slice:0' shape=(6, 1024) dtype=float32>