# Latent Dirichlet Allocation

In [2]:
# Read our document-term matrix from its pickle file
import pandas as pd
import pickle

# The frame has one row per song and one column per term; each cell holds a count.
data = pd.read_pickle('dtm_stop.pkl')
data

Unnamed: 0,abu,ac,accent,active,adjust,age,ah,aight,aint,air,...,yeah,yeh,yell,yes,ymca,yo,young,youre,yup,zone
327,0,1,0,0,1,0,4,0,3,0,...,17,0,1,1,0,2,1,1,1,0
Block Party,0,0,0,0,0,0,2,0,1,0,...,0,0,0,1,0,1,0,0,0,0
Dangerookipawaa freestyle,1,0,0,0,0,0,0,0,4,1,...,21,0,0,0,1,1,0,0,0,0
I like it,0,0,1,1,0,0,1,0,0,0,...,2,6,0,0,0,0,0,0,0,0
Leader of Delinquents,0,0,0,0,0,1,0,1,0,0,...,2,0,0,0,0,0,1,0,0,1
Old town road,0,0,0,0,0,0,0,0,1,0,...,3,0,0,0,0,0,0,0,0,0
Shape of you,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Import the necessary modules for LDA with gensim

from gensim import matutils, models
import scipy.sparse


In [4]:
# gensim wants a term-document matrix (terms as rows, documents as columns),
# NOT a document-term matrix, so flip the axes of the frame loaded above.
tdm = data.T
tdm.head()

Unnamed: 0,327,Block Party,Dangerookipawaa freestyle,I like it,Leader of Delinquents,Old town road,Shape of you
abu,0,0,1,0,0,0,0
ac,1,0,0,0,0,0,0
accent,0,0,0,1,0,0,0
active,0,0,0,1,0,0,0
adjust,1,0,0,0,0,0,0


In [5]:
# Conversion pipeline: DataFrame --> SciPy CSR sparse matrix --> gensim corpus

# Wrap the term-document counts in a compressed-sparse-row matrix.
sparse_counts = scipy.sparse.csr_matrix(tdm)
# Hand the sparse matrix to gensim's Sparse2Corpus adapter.
corpus = matutils.Sparse2Corpus(sparse_counts)

In [6]:
# Gensim also requires a dictionary of all the terms and their respective locations
# in the term-document matrix.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only
# load cv_stop.pkl when it comes from a trusted source.
with open("cv_stop.pkl", "rb") as cv_file:  # context manager closes the handle
    cv = pickle.load(cv_file)

# Invert cv.vocabulary_ (swap keys and values) so that ids map to words.
id2word = {idx: term for term, idx in cv.vocabulary_.items()}

In [7]:
# LDA for num_topics = 2
# LDA training is stochastic; random_state is fixed so re-running the cell
# reproduces the same topics.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10, random_state=42)
lda.print_topics()

[(0,
  '0.018*"yeah" + 0.011*"delinquents" + 0.009*"shit" + 0.009*"gonna" + 0.008*"block" + 0.007*"party" + 0.007*"old" + 0.007*"remind" + 0.007*"horse" + 0.006*"tell"'),
 (1,
  '0.023*"come" + 0.019*"oh" + 0.016*"said" + 0.016*"love" + 0.012*"yeah" + 0.012*"body" + 0.011*"prayed" + 0.009*"ayy" + 0.008*"gang" + 0.008*"paris"')]

In [9]:
 #LDA for num_topics = 3
# LDA training is stochastic; random_state is fixed so re-running the cell
# reproduces the same topics.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10, random_state=42)
lda.print_topics()

[(0,
  '0.020*"yeah" + 0.020*"prayed" + 0.017*"ayy" + 0.014*"swear" + 0.014*"paris" + 0.011*"boom" + 0.010*"caseload" + 0.010*"face" + 0.010*"row" + 0.010*"ankles"'),
 (1,
  '0.018*"block" + 0.017*"party" + 0.015*"shit" + 0.013*"remind" + 0.007*"bout" + 0.007*"just" + 0.007*"cause" + 0.007*"dem" + 0.007*"gon" + 0.005*"high"'),
 (2,
  '0.024*"come" + 0.021*"oh" + 0.020*"love" + 0.018*"yeah" + 0.017*"said" + 0.012*"body" + 0.010*"delinquents" + 0.009*"gonna" + 0.009*"tell" + 0.008*"gang"')]

In [10]:
#LDA for num_topics = 4
# LDA training is stochastic; random_state is fixed so re-running the cell
# reproduces the same topics.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10, random_state=42)
lda.print_topics()

[(0,
  '0.036*"come" + 0.025*"love" + 0.024*"oh" + 0.018*"body" + 0.017*"yeah" + 0.017*"prayed" + 0.014*"ayy" + 0.012*"paris" + 0.012*"swear" + 0.009*"boom"'),
 (1,
  '0.030*"yeah" + 0.021*"delinquents" + 0.011*"leader" + 0.008*"mm" + 0.008*"uh" + 0.007*"love" + 0.006*"shit" + 0.006*"bitch" + 0.006*"bout" + 0.006*"nigga"'),
 (2,
  '0.021*"block" + 0.019*"party" + 0.017*"shit" + 0.015*"remind" + 0.008*"cause" + 0.008*"just" + 0.008*"bout" + 0.008*"dem" + 0.008*"gon" + 0.006*"wanna"'),
 (3,
  '0.036*"said" + 0.020*"gonna" + 0.018*"tell" + 0.017*"gang" + 0.013*"woo" + 0.011*"oh" + 0.011*"horse" + 0.011*"nothin" + 0.011*"town" + 0.010*"til"')]

In [11]:
#LDA for num_topics = 5
# LDA training is stochastic; random_state is fixed so re-running the cell
# reproduces the same topics.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=5, passes=10, random_state=42)
lda.print_topics()

[(0,
  '0.023*"yeah" + 0.020*"prayed" + 0.016*"ayy" + 0.014*"gonna" + 0.014*"paris" + 0.014*"swear" + 0.012*"nothin" + 0.011*"tell" + 0.011*"boom" + 0.010*"horse"'),
 (1,
  '0.049*"said" + 0.023*"gang" + 0.017*"woo" + 0.012*"oh" + 0.012*"enjoys" + 0.012*"yeh" + 0.010*"tell" + 0.010*"whats" + 0.008*"bitch" + 0.008*"thing"'),
 (2,
  '0.023*"block" + 0.021*"party" + 0.019*"shit" + 0.017*"remind" + 0.009*"bout" + 0.009*"gon" + 0.009*"cause" + 0.009*"dem" + 0.009*"just" + 0.007*"high"'),
 (3,
  '0.033*"yeah" + 0.023*"delinquents" + 0.011*"leader" + 0.009*"uh" + 0.009*"mm" + 0.007*"love" + 0.006*"aint" + 0.006*"nigga" + 0.006*"bitch" + 0.006*"shit"'),
 (4,
  '0.081*"come" + 0.055*"love" + 0.053*"oh" + 0.038*"body" + 0.014*"day" + 0.014*"new" + 0.014*"brand" + 0.014*"discovering" + 0.014*"shape" + 0.014*"follow"')]

These topics are not very coherent, so we will try another trick.

Let us look only at terms from a single part of speech (only nouns, only adjectives, etc.).
See the UPenn (Penn Treebank) tag set for the tag names: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [18]:
# Let's create a function to pull out verbs from a string of text
import nltk
from nltk import word_tokenize, pos_tag

def verbs(text):
    '''Given a string of text, tokenize the text and pull out only the verbs.

    Parameters:
        text: the string to filter.

    Returns:
        A single string containing the verb tokens, in their original order,
        joined by single spaces (empty string when no verb is found).
    '''
    # Every Penn Treebank verb tag (VB, VBD, VBG, VBN, VBP, VBZ) starts with 'VB',
    # so a single prefix check covers all of them. Chaining bare strings with
    # `or` would always be truthy and keep every token.
    is_verb = lambda pos: pos.startswith('VB')
    tokenized = word_tokenize(text)
    all_verbs = [word for (word, pos) in pos_tag(tokenized) if is_verb(pos)]
    return ' '.join(all_verbs)

In [19]:
# Read in the cleaned data, before the CountVectorizer step
# (one row per song; the text lives in the 'Lyrics' column)
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,Lyrics
327,\nlook i swear paris will be prayed for prayed...
Block Party,\nstatik selektah\nevery time i hear a dope ba...
Dangerookipawaa freestyle,\n\n\nyeah\nyeah\nyeah ayy man listen\n\n\nsou...
I like it,\nyeah baby i like it like that\nyou gotta bel...
Leader of Delinquents,\nmm yeah mm mm\nthis is not a test\nmm check\...
Old town road,\noh ohoh\noh\n\n\nyeah im gonna take my horse...
Shape of you,\nthe club isnt the best place to find a lover...


In [46]:
# NOTE(review): everything in this cell is wrapped in string literals, so nouns_adj
# is never actually defined. The LookupError recorded further down reports that the
# NLTK 'punkt' resource is missing; the traceback's own suggested remedy is
# nltk.download('punkt').
'''This is not working. I need to figure this one out.'''
'''def nouns_adj(text):
    ###Given a string of text, tokenize the text and pull out only the nouns and adjectives.###
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)'''

In [48]:
# NOTE(review): disabled (the code is a string literal). It calls nouns_adj, which is
# itself only present inside a string literal in the previous cell, so both cells
# have to be re-enabled together.
'''# Apply the nouns function to the lyrics to filter only on nouns
data_nouns_adj = pd.DataFrame(data_clean.Lyrics.apply(nouns_adj))
data_nouns_adj'''

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/prabeshkoirala/nltk_data'
    - '/Users/prabeshkoirala/opt/anaconda3/nltk_data'
    - '/Users/prabeshkoirala/opt/anaconda3/share/nltk_data'
    - '/Users/prabeshkoirala/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [51]:
# NOTE(review): disabled (the code is a string literal). Before re-enabling:
#   - CountVectorizer is not imported in the visible cells (see the NameError recorded below);
#   - stop_words is not defined in the visible cells;
#   - data_nouns_adj is built from data_clean.Lyrics, so `.transcript` looks like a
#     leftover column name and probably should be `.Lyrics` — TODO confirm;
#   - presumably this is scikit-learn's CountVectorizer, where newer releases use
#     get_feature_names_out() instead of get_feature_names() — verify installed version.
'''# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna'''

NameError: name 'CountVectorizer' is not defined

In [None]:
# NOTE(review): disabled (the code is a string literal). It needs data_dtmna and cvna,
# which only appear inside the disabled cell before this one.
'''# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())'''

In [None]:
# NOTE(review): disabled (the code is a string literal). It needs corpusna and
# id2wordna, which only appear inside a disabled cell.
'''# Let's start with 2 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()'''

In [None]:
# NOTE(review): disabled (the code is a string literal). It needs corpusna and
# id2wordna, which only appear inside a disabled cell.
'''# Let's try 3 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()'''

In [44]:
# LDA for num_topics = 3 again, but trained for 200 passes instead of 10
# LDA training is stochastic; random_state is fixed so re-running the cell
# reproduces the same topics.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=200, random_state=42)
lda.print_topics()

[(0,
  '0.030*"yeah" + 0.013*"prayed" + 0.013*"ayy" + 0.012*"delinquents" + 0.009*"paris" + 0.009*"swear" + 0.008*"face" + 0.007*"boom" + 0.007*"shit" + 0.006*"uh"'),
 (1,
  '0.035*"come" + 0.030*"oh" + 0.025*"love" + 0.025*"said" + 0.017*"body" + 0.014*"tell" + 0.014*"gonna" + 0.012*"gang" + 0.009*"woo" + 0.008*"horse"'),
 (2,
  '0.018*"block" + 0.017*"party" + 0.015*"shit" + 0.013*"remind" + 0.007*"just" + 0.007*"bout" + 0.007*"cause" + 0.007*"gon" + 0.007*"dem" + 0.005*"wanna"')]

In [53]:
# NOTE(review): disabled (the code is a string literal). ldana, corpusna and data_dtmna
# only appear inside disabled cells, which matches the NameError recorded below.
# Also note the `for [(a,b)] in ...` pattern unpacks each document as a list holding
# exactly one (topic, weight) pair; a document with two or more pairs would raise
# ValueError — confirm against what ldana[corpusna] yields before re-enabling.
'''# Let's take a look at which topics each lyrics contains
corpus_transformed = ldana[corpusna]
list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))'''

NameError: name 'corpusna' is not defined