In [1]:
## download songlyrics.zip from Kaggle, requires log in
## https://www.kaggle.com/mousehead/songlyrics/downloads/songlyrics.zip

In [14]:
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from gensim.models.word2vec import Word2Vec
from nltk import sent_tokenize, word_tokenize
from sklearn.manifold import TSNE

In [3]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
data = pd.read_csv('./data/kaggle_lyrics/songlyrics.zip')
data.drop(columns=['link'], inplace=True)
data.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [5]:
## use kaggle data set on 57,650 songs (check if it has artists for text generation)

artist_names = ['Eric Clapton', 'Jimi Hendrix', 'Bob Dylan', 'Muddy Waters', 'Eagles']
for artist_name in artist_names:
    print('There are {:03d} songs set for {}.'.format(
        len(data[data['artist'].str.contains(artist_name)]), artist_name))

There are 152 songs set for Eric Clapton.
There are 127 songs set for Jimi Hendrix.
There are 188 songs set for Bob Dylan.
There are 000 songs set for Muddy Waters.
There are 041 songs set for Eagles.


In [6]:
data['text'][0].replace('\n', '. ')

"Look at her face, it's a wonderful face  . And it means something special to me  . Look at the way that she smiles when she sees me  . How lucky can one fellow be?  .   . She's just my kind of girl, she makes me feel fine  . Who could ever believe that she could be mine?  . She's just my kind of girl, without her I'm blue  . And if she ever leaves me what could I do, what could I do?  .   . And when we go for a walk in the park  . And she holds me and squeezes my hand  . We'll go on walking for hours and talking  . About all the things that we plan  .   . She's just my kind of girl, she makes me feel fine  . Who could ever believe that she could be mine?  . She's just my kind of girl, without her I'm blue  . And if she ever leaves me what could I do, what could I do?. . "

In [7]:
%%time
data['text'] = data['text'].apply(lambda text: word_tokenize(text.replace('\n', '. ')))

CPU times: user 4min 16s, sys: 1.65 s, total: 4min 18s
Wall time: 4min 18s


In [8]:
%%time
## generate embeddings
## build vocabulary and train model
model = Word2Vec(data['text'].tolist(), size=128, window=10, min_count=50, workers=6, sg=0, hs=0)
model.train(data['text'].tolist(), total_examples=len(data['text']), epochs=10)
# note: sg=0,1 (skip gram or cbow by default)
# note: hs=0,1 (hierarchical softmax or negative sampling by default)

2018-09-05 08:00:39,131 : INFO : collecting all words and their counts
2018-09-05 08:00:39,132 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-09-05 08:00:39,648 : INFO : PROGRESS: at sentence #10000, processed 2843150 words, keeping 46342 word types
2018-09-05 08:00:40,179 : INFO : PROGRESS: at sentence #20000, processed 5778998 words, keeping 69244 word types
2018-09-05 08:00:40,692 : INFO : PROGRESS: at sentence #30000, processed 8605139 words, keeping 86151 word types
2018-09-05 08:00:41,250 : INFO : PROGRESS: at sentence #40000, processed 11562789 words, keeping 101802 word types
2018-09-05 08:00:41,841 : INFO : PROGRESS: at sentence #50000, processed 14573663 words, keeping 117756 word types
2018-09-05 08:00:42,273 : INFO : collected 127646 word types from a corpus of 16830688 raw words and 57650 sentences
2018-09-05 08:00:42,274 : INFO : Loading a fresh vocabulary
2018-09-05 08:00:42,371 : INFO : effective_min_count=50 retains 8573 unique words (6

2018-09-05 08:01:16,086 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-05 08:01:16,087 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-05 08:01:16,092 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-05 08:01:16,095 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-05 08:01:16,096 : INFO : EPOCH - 5 : training on 16830688 raw words (10571750 effective words) took 6.8s, 1558537 effective words/s
2018-09-05 08:01:16,096 : INFO : training on a 84153440 raw words (52853404 effective words) took 33.6s, 1575003 effective words/s
2018-09-05 08:01:16,100 : INFO : training model with 6 workers on 8573 vocabulary and 128 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2018-09-05 08:01:17,106 : INFO : EPOCH 1 - PROGRESS: at 14.81% examples, 1513575 words/s, in_qsize 11, out_qsize 0
2018-09-05 08:01:18,107 : INFO : EPOCH 1 - PROGRESS: at 28.75% examples, 1509852 words/s, in_qsize 1

2018-09-05 08:01:55,098 : INFO : EPOCH 6 - PROGRESS: at 74.77% examples, 1573314 words/s, in_qsize 11, out_qsize 0
2018-09-05 08:01:56,105 : INFO : EPOCH 6 - PROGRESS: at 90.29% examples, 1574928 words/s, in_qsize 10, out_qsize 1
2018-09-05 08:01:56,740 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-09-05 08:01:56,742 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-09-05 08:01:56,745 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-05 08:01:56,746 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-05 08:01:56,749 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-05 08:01:56,753 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-05 08:01:56,753 : INFO : EPOCH - 6 : training on 16830688 raw words (10568488 effective words) took 6.7s, 1580695 effective words/s
2018-09-05 08:01:57,761 : INFO : EPOCH 7 - PROGRESS: at 15.58% examples, 1601750 wor

CPU times: user 6min 48s, sys: 4.73 s, total: 6min 53s
Wall time: 1min 43s


In [9]:
model.wv.most_similar(positive=['man', 'cold'], negative=[], topn=5)

2018-09-05 08:02:22,790 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('woman', 0.5012178421020508),
 ('Cold', 0.48644325137138367),
 ('hunter', 0.4433850944042206),
 ('heartless', 0.43375563621520996),
 ('Dom', 0.433209091424942)]

In [10]:
## build vocab and embedding matrix
vocab = list(model.wv.vocab)
embedding_matrix = model.wv[vocab] # shape = [vocab_size, embedding_size]
print(len(vocab))

8573


In [11]:
def find_clustered_embeddings(embeddings, distance_threshold, sample_threshold):
    ''' 
    Find only the closely clustered embeddings. 
    This gets rid of more sparsly distributed word embeddings and make the visualization clearer
    This is useful for t-SNE visualization
    
    distance_threshold: maximum distance between two points to qualify as neighbors
    sample_threshold: number of neighbors required to be considered a cluster
    '''
    
    # calculate cosine similarity
    cosine_sim = np.dot(embeddings,np.transpose(embeddings))
    norm = np.dot(np.sum(embeddings**2,axis=1).reshape(-1,1),np.sum(np.transpose(embeddings)**2,axis=0).reshape(1,-1))
    assert cosine_sim.shape == norm.shape
    cosine_sim /= norm
    
    # make all the diagonal entries zero otherwise this will be picked as highest
    np.fill_diagonal(cosine_sim, -1.0)
    
    argmax_cos_sim = np.argmax(cosine_sim, axis=1)
    mod_cos_sim = cosine_sim
    # find the maximums in a loop to count if there are more than n items above threshold
    for _ in range(sample_threshold-1):
        argmax_cos_sim = np.argmax(cosine_sim, axis=1)
        mod_cos_sim[np.arange(mod_cos_sim.shape[0]),argmax_cos_sim] = -1
    
    max_cosine_sim = np.max(mod_cos_sim,axis=1)

    return np.where(max_cosine_sim>distance_threshold)[0]

In [40]:
%%time
## tsne and visualize
num_embeddings = len(vocab)
selected_embeddings = embedding_matrix[:num_embeddings,:]

tsne = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, verbose=1).fit_transform(selected_embeddings)

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 8573 samples in 0.038s...
[t-SNE] Computed neighbors for 8573 samples in 18.252s...
[t-SNE] Computed conditional probabilities for sample 1000 / 8573
[t-SNE] Computed conditional probabilities for sample 2000 / 8573
[t-SNE] Computed conditional probabilities for sample 3000 / 8573
[t-SNE] Computed conditional probabilities for sample 4000 / 8573
[t-SNE] Computed conditional probabilities for sample 5000 / 8573
[t-SNE] Computed conditional probabilities for sample 6000 / 8573
[t-SNE] Computed conditional probabilities for sample 7000 / 8573
[t-SNE] Computed conditional probabilities for sample 8000 / 8573
[t-SNE] Computed conditional probabilities for sample 8573 / 8573
[t-SNE] Mean sigma: 2.946026
[t-SNE] KL divergence after 250 iterations with early exaggeration: 88.207901
[t-SNE] Error after 2500 iterations: 3.016892
CPU times: user 13min 49s, sys: 52.5 s, total: 14min 41s
Wall time: 14min 47s


In [45]:
print('Pruning the T-SNE embeddings')
# prune the embeddings by getting ones only more than n-many sample above the similarity threshold
# this unclutters the visualization
selected_ids = find_clustered_embeddings(selected_embeddings, 0.01, 1)
tsne_plot = tsne[selected_ids,:]

print('Out of ', num_embeddings, ' samples, ', selected_ids.shape[0],' samples were selected by pruning')

Pruning the T-SNE embeddings
Out of  8573  samples,  372  samples were selected by pruning


In [None]:
# see plotting code here: 
# https://github.com/PacktPublishing/Natural-Language-Processing-with-TensorFlow/blob/master/ch3/ch3_word2vec.ipynb