In [1]:
## Download songlyrics.zip from Kaggle (requires a Kaggle login) and save it under ./data/kaggle_lyrics/
## https://www.kaggle.com/mousehead/songlyrics/downloads/songlyrics.zip

In [2]:
import logging
import matplotlib.pyplot as plt
import pandas as pd

from gensim.models.word2vec import Word2Vec
from nltk import sent_tokenize, word_tokenize
from sklearn.manifold import TSNE

In [3]:
## show INFO-level log records (timestamp : level : message) in the notebook output
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [4]:
## load the lyrics table straight from the zipped CSV and discard the 'link' column
data = pd.read_csv('./data/kaggle_lyrics/songlyrics.zip').drop(columns=['link'])
data.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [5]:
## use kaggle data set on 57,650 songs (check if it has artists for text generation)

artist_names = ['Eric Clapton', 'Jimi Hendrix', 'Bob Dylan', 'Muddy Waters', 'Eagles']
for artist_name in artist_names:
    # Literal substring match: regex=False stops characters in a name from being
    # read as a pattern, and na=False makes rows with a missing artist count as
    # "no match" instead of putting NaN into the boolean mask.
    is_match = data['artist'].str.contains(artist_name, regex=False, na=False)
    # Summing the boolean mask counts the matching rows without building a filtered frame.
    print('There are {:03d} songs set for {}.'.format(int(is_match.sum()), artist_name))

There are 152 songs set for Eric Clapton.
There are 127 songs set for Jimi Hendrix.
There are 188 songs set for Bob Dylan.
There are 000 songs set for Muddy Waters.
There are 041 songs set for Eagles.


In [6]:
## preview the lyric stored at row label 0 with each line break turned into '. '
data.loc[0, 'text'].replace('\n', '. ')

"Look at her face, it's a wonderful face  . And it means something special to me  . Look at the way that she smiles when she sees me  . How lucky can one fellow be?  .   . She's just my kind of girl, she makes me feel fine  . Who could ever believe that she could be mine?  . She's just my kind of girl, without her I'm blue  . And if she ever leaves me what could I do, what could I do?  .   . And when we go for a walk in the park  . And she holds me and squeezes my hand  . We'll go on walking for hours and talking  . About all the things that we plan  .   . She's just my kind of girl, she makes me feel fine  . Who could ever believe that she could be mine?  . She's just my kind of girl, without her I'm blue  . And if she ever leaves me what could I do, what could I do?. . "

In [7]:
%%time
data['text'] = data['text'].apply(lambda text: word_tokenize(text.replace('\n', '. ')))

CPU times: user 4min 21s, sys: 1.9 s, total: 4min 23s
Wall time: 4min 24s


In [8]:
%%time
## generate embeddings
## build vocabulary and train model
model = Word2Vec(data['text'].tolist(), size=128, window=10, min_count=50, workers=6, sg=0, hs=0)
model.train(data['text'].tolist(), total_examples=len(data['text']), epochs=10)
# note: sg=0,1 (skip gram or cbow by default)
# note: hs=0,1 (hierarchical softmax or negative sampling by default)

2018-09-04 22:05:00,165 : INFO : collecting all words and their counts
2018-09-04 22:05:00,166 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-09-04 22:05:00,668 : INFO : PROGRESS: at sentence #10000, processed 2843150 words, keeping 46342 word types
2018-09-04 22:05:01,194 : INFO : PROGRESS: at sentence #20000, processed 5778998 words, keeping 69244 word types
2018-09-04 22:05:01,710 : INFO : PROGRESS: at sentence #30000, processed 8605139 words, keeping 86151 word types
2018-09-04 22:05:02,257 : INFO : PROGRESS: at sentence #40000, processed 11562789 words, keeping 101802 word types
2018-09-04 22:05:02,807 : INFO : PROGRESS: at sentence #50000, processed 14573663 words, keeping 117756 word types
2018-09-04 22:05:03,222 : INFO : collected 127646 word types from a corpus of 16830688 raw words and 57650 sentences
2018-09-04 22:05:03,223 : INFO : Loading a fresh vocabulary
2018-09-04 22:05:03,322 : INFO : effective_min_count=50 retains 8573 unique words (6

2018-09-04 22:05:34,877 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-04 22:05:34,878 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-04 22:05:34,879 : INFO : EPOCH - 5 : training on 16830688 raw words (10571042 effective words) took 6.5s, 1618055 effective words/s
2018-09-04 22:05:34,880 : INFO : training on a 84153440 raw words (52855671 effective words) took 31.4s, 1684167 effective words/s
2018-09-04 22:05:34,884 : INFO : training model with 6 workers on 8573 vocabulary and 128 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2018-09-04 22:05:35,893 : INFO : EPOCH 1 - PROGRESS: at 15.77% examples, 1618266 words/s, in_qsize 11, out_qsize 0
2018-09-04 22:05:36,895 : INFO : EPOCH 1 - PROGRESS: at 31.02% examples, 1627012 words/s, in_qsize 12, out_qsize 1
2018-09-04 22:05:37,896 : INFO : EPOCH 1 - PROGRESS: at 46.86% examples, 1628294 words/s, in_qsize 10, out_qsize 1
2018-09-04 22:05:38,905 : INFO : EPOCH 1 - PROGRESS:

2018-09-04 22:06:14,167 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-09-04 22:06:14,169 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-09-04 22:06:14,172 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-04 22:06:14,174 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-04 22:06:14,180 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-04 22:06:14,181 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-04 22:06:14,182 : INFO : EPOCH - 6 : training on 16830688 raw words (10570578 effective words) took 6.6s, 1605537 effective words/s
2018-09-04 22:06:15,187 : INFO : EPOCH 7 - PROGRESS: at 15.43% examples, 1594693 words/s, in_qsize 10, out_qsize 1
2018-09-04 22:06:16,191 : INFO : EPOCH 7 - PROGRESS: at 30.91% examples, 1622623 words/s, in_qsize 10, out_qsize 1
2018-09-04 22:06:17,198 : INFO : EPOCH 7 - PROGRESS: at 47.07% examples, 1632700 wor

CPU times: user 6min 44s, sys: 4.83 s, total: 6min 49s
Wall time: 1min 39s


In [9]:
## five vocabulary words closest to the combination of 'man' and 'cold'
model.wv.most_similar(
    topn=5,
    positive=['man', 'cold'],
    negative=[],
)

2018-09-04 22:06:40,079 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('woman', 0.5074859857559204),
 ('Cold', 0.5048136115074158),
 ('blooded', 0.4648822247982025),
 ('politician', 0.46068206429481506),
 ('Dom', 0.4527609944343567)]

In [12]:
## build vocab and embedding matrix
# collect the words kept by the trained model, in the order the vocab mapping yields them
vocab = [word for word in model.wv.vocab]
# look up all word vectors at once; row i is the vector of vocab[i]
embedding_matrix = model.wv[vocab]  # shape = [vocab_size, embedding_size]
print(len(vocab))

8573


In [11]:
%%time
## tsne and visualize
tsne = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, verbose=1).fit_transform(embedding_matrix)

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 8573 samples in 0.041s...
[t-SNE] Computed neighbors for 8573 samples in 18.372s...
[t-SNE] Computed conditional probabilities for sample 1000 / 8573
[t-SNE] Computed conditional probabilities for sample 2000 / 8573
[t-SNE] Computed conditional probabilities for sample 3000 / 8573
[t-SNE] Computed conditional probabilities for sample 4000 / 8573
[t-SNE] Computed conditional probabilities for sample 5000 / 8573
[t-SNE] Computed conditional probabilities for sample 6000 / 8573
[t-SNE] Computed conditional probabilities for sample 7000 / 8573
[t-SNE] Computed conditional probabilities for sample 8000 / 8573
[t-SNE] Computed conditional probabilities for sample 8573 / 8573
[t-SNE] Mean sigma: 2.945211
[t-SNE] KL divergence after 250 iterations with early exaggeration: 88.022102
[t-SNE] Error after 2500 iterations: 3.014133
CPU times: user 13min 29s, sys: 49.3 s, total: 14min 18s
Wall time: 14min 20s
