In [2]:
import pandas as pd
import random
from gensim import corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short
from gensim.models.ldamodel import LdaModel
from collections import Counter

In [3]:
song_df=pd.read_csv("../Data/songdata_clean.csv",sep=',', encoding='utf-8', usecols=['artist','song','lyrics'])
song_df.dropna(inplace=True)
print(len(song_df))
song_df.head()

56148


Unnamed: 0,artist,song,lyrics
0,ABBA,She's My Kind Of Girl,"Look at her face, it's a wonderful face \r\r\..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\r\r\r\r\r\r\r..."
2,ABBA,As Good As New,I'll never know why I had to go \r\r\r\r\r\r\...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


# Latent-Dirichlet Allocation (LDA)
LDA is common topic modelling technique that embed word-counts and document term matrices to identify a specified number of topics in a corpus. This notebook implements LDA for the lyrics dataset.

## Clean documents
Lyrics are different than novels. Slang and the particular way an artist writes or sings is important. Because of this we will not stem (keeping only the root of the word) the words in our corpus.

In [4]:
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short]
doc_clean = [preprocess_string(doc, filters=CUSTOM_FILTERS) for doc in song_df.lyrics]

## Create document term matrix

In [5]:
dictionary = corpora.Dictionary(doc_clean)
%time doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
song_df['bow']=doc_term_matrix

Wall time: 2.81 s


## Create Training set
LDA takes a while to run so we will train on a random subset of the data.

In [6]:
train_corpus=random.sample(doc_clean,1000)
#train_corpus=doc_clean

## Train LDA model
We will assume 20 possible topics across the dataset (num_topics=20).

In [7]:
%time ldamodel = LdaModel(doc_term_matrix, num_topics=20, id2word = dictionary, passes=50)  

Wall time: 1h 8min 8s


In [8]:
ldamodel.save('../Data/lda.model')

# Topic Modelling

In [9]:
ldamodel=LdaModel.load('../Data/lda.model')

In [10]:
topics=ldamodel.top_topics(doc_term_matrix, topn=6)
#topn is the number of words used to describe a topic
[(i,[x[1] for x in t[0]]) for i,t in enumerate(topics)]

[(0, ['know', 'time', 'way', 'life', 'away', 'feel']),
 (1, ['nigga', 'like', 'ain', 'cause', 'shit', 'fuck']),
 (2, ['baby', 'know', 'gonna', 'girl', 'got', 'wanna']),
 (3, ['love', 'heart', 'need', 'hold', 'like', 'forever']),
 (4, ['said', 'old', 'man', 'got', 'new', 'boy']),
 (5, ['sun', 'sky', 'rain', 'blue', 'like', 'run']),
 (6, ['night', 'eyes', 'home', 'long', 'gone', 'dream']),
 (7, ['let', 'come', 'tonight', 'night', 'dance', 'everybody']),
 (8, ['like', 'look', 'going', 'people', 'better', 'round']),
 (9, ['god', 'lord', 'sing', 'heaven', 'song', 'jesus']),
 (10, ['got', 'bitch', 'rock', 'roll', 'ready', 'ride']),
 (11, ['yeah', 'hey', 'gotta', 'whoa', 'alright', 'stop']),
 (12, ['die', 'hell', 'blood', 'death', 'war', 'wit']),
 (13, ['bad', 'somebody', 'wish', 'yes', 'dead', 'save']),
 (14, ['happy', 'christmas', 'bring', 'bye', 'days', 'year']),
 (15, ['little', 'bit', 'miss', 'sister', 'wait', 'piece']),
 (16, ['ooh', 'mama', 'beautiful', 'blues', 'doo', 'chicken']),
 (1

From each set of topic words, we can identify a theme.

## Get topics for each song

In [13]:
def get_topic(doc, model):
    try:
        return sorted(model[doc], key=lambda x: x[1])[0][0]
    except:
        return None

For example, take a random song and identify the topic code. To see what the song is about, we then cross-reference this number with the topic words above.

In [14]:
i=random.randint(1, len(song_df))
print(song_df.lyrics[i])
get_topic(song_df.bow[i], ldamodel)

Dark night, there is no light  
In the realm of the black magic man  
Soul's flight into the cold blight  
Of the destroyer's magic land  
  
Poor man, whose spirits are stronger  
They're the ones who will reign  
You're struggles are in vain  
  
Blind man, you're suckin' your own blood  
Soon black magic's dying  
You'd better start crying  
  
Blind man, you're suckin' your own blood  
Soon black magic's dying  
You'd better start crying  
  
Throw out your evil desire  
The dark king's kingdom is  
Made out of mire  
  
Throw out your evil desire  
The dark king's kingdom is  
Made out of mire  
  
Keep on for the kingdom of light  
There is no darkness, there is no night




13

In [15]:
song_df['topic']=[[]]*len(song_df)

for i,doc in song_df.bow.iteritems():
    song_df.at[i,'topic']=get_topic(doc, ldamodel)
song_df.head()

Unnamed: 0,artist,song,lyrics,bow,topic
0,ABBA,She's My Kind Of Girl,"Look at her face, it's a wonderful face \r\r\...","[(0, 2), (1, 2), (2, 2), (3, 2), (4, 1), (5, 2...",15
1,ABBA,"Andante, Andante","Take it easy with me, please \r\r\r\r\r\r\r\r...","[(28, 20), (29, 1), (30, 1), (31, 1), (32, 1),...",10
2,ABBA,As Good As New,I'll never know why I had to go \r\r\r\r\r\r\...,"[(3, 1), (12, 1), (26, 2), (33, 2), (43, 2), (...",7
3,ABBA,Bang,Making somebody happy is a question of give an...,"[(3, 1), (12, 1), (29, 2), (37, 1), (43, 1), (...",8
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,"[(3, 1), (12, 1), (29, 2), (37, 1), (43, 1), (...",4


In [16]:
Counter(song_df.topic)

Counter({0: 4804,
         1: 2837,
         2: 2824,
         3: 5621,
         4: 1305,
         5: 1862,
         6: 1348,
         7: 3138,
         8: 615,
         9: 2521,
         10: 2130,
         11: 1988,
         12: 2542,
         13: 3017,
         14: 1586,
         15: 4017,
         16: 4194,
         17: 4609,
         18: 1632,
         19: 3558})

Let us take a look as songs about topic 3:

In [19]:
song_df.query('topic=={}'.format(3)).loc[:,['artist','song','lyrics']]

Unnamed: 0,artist,song,lyrics
6,ABBA,Cassandra,Down in the street they're all singing and sho...
31,ABBA,Hey Hey Helen,So at last you're free \r\r\r\r\r\r\r\r\r\r\r...
63,ABBA,Move On,They say a restless body can hide a peaceful s...
73,ABBA,Put On Your White Sombrero,Put on your white sombrero \r\r\r\r\r\r\r\r\r...
92,ABBA,That's Me,Are you sure you want to hear more \r\r\r\r\r...
122,Ace Of Base,Never Gonna Say I'm Sorry,"I'm never gonna say I'm sorry, \r\r\r\r\r\r\r..."
135,Adele,Crazy For You,"Found myself today singing out your name, \r\..."
136,Adele,Daydreamer,Daydreamer \r\r\r\r\r\r\r\r\r\r\r\r\r\nSittin...
139,Adele,Melt My Heart To Stone,Right under my feet there's air made of bricks...
156,Aerosmith,Lay It Down,Ruby red... her lips were on fire \r\r\r\r\r\...


One of the big differences between LDA and doc2vec approaches is that doc2vec allows us to choose our topics. LDA only allows the choice of number of topics.