In [36]:
import gensim
import csv
import re
from collections import defaultdict

In [37]:
hdrs = None
data = None

# parse data from song features
with open("billboard_history_lyrics.csv", "r") as f:
    reader = csv.reader(f)
    all_rows = list(reader)
    hdrs = all_rows[0]
    data = all_rows[1:]

# remove all non-alphanumeric characters
all_lyrics = [[re.sub(r'\W+', '', w)  for w in d[3].split()] for d in data]

In [38]:
# remove common words
stoplist = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than']
all_lyrics = [[word for word in lyric if word not in stoplist] for lyric in all_lyrics]

# remove words that only occur once
frequency = defaultdict(int)
for lyric in all_lyrics:
    for token in lyric:
        frequency[token] += 1

all_lyrics = [[word for word in lyric if frequency[word] > 1] for lyric in all_lyrics]

In [39]:
id2word = gensim.corpora.dictionary.Dictionary(all_lyrics)
corpus = [id2word.doc2bow(lyric) for lyric in all_lyrics]

In [40]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=10, update_every=1, chunksize=1, passes=1)

In [41]:
lda.print_topics(10)

[(0,
  '0.088*"down" + 0.038*"girl" + 0.037*"hit" + 0.034*"let" + 0.022*"cash" + 0.022*"we" + 0.021*"no" + 0.020*"hot" + 0.019*"while" + 0.019*"mmm"'),
 (1,
  '0.081*"you" + 0.080*"i" + 0.054*"it" + 0.038*"me" + 0.029*"dont" + 0.022*"that" + 0.020*"know" + 0.016*"tell" + 0.016*"even" + 0.015*"with"'),
 (2,
  '0.130*"up" + 0.047*"on" + 0.047*"my" + 0.030*"all" + 0.028*"need" + 0.027*"me" + 0.022*"ready" + 0.020*"name" + 0.020*"what" + 0.018*"got"'),
 (3,
  '0.079*"yeah" + 0.062*"i" + 0.052*"my" + 0.040*"on" + 0.037*"just" + 0.032*"they" + 0.028*"make" + 0.028*"got" + 0.027*"with" + 0.025*"wanna"'),
 (4,
  '0.094*"god" + 0.045*"bank" + 0.039*"woke" + 0.035*"one" + 0.028*"call" + 0.027*"drake" + 0.018*"dj" + 0.017*"big" + 0.017*"not" + 0.015*"friends"'),
 (5,
  '0.081*"loyalty" + 0.063*"dog" + 0.045*"ooh" + 0.034*"lot" + 0.024*"dope" + 0.015*"lee" + 0.015*"kendrick" + 0.014*"lamar" + 0.013*"rhythm" + 0.013*"kill"'),
 (6,
  '0.114*"em" + 0.080*"niggas" + 0.043*"side" + 0.023*"hook" + 0.021