# LDA Topic Model (billboard_lyrics)

In [19]:
import nltk
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer    # stemmer : 어간추출기
from gensim import corpora, models
import gensim  

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import re
import pandas as pd
import string

In [4]:
df = pd.read_csv('/Users/juhyeon/python-workspace/billboard_lyrics_1964-2015.csv', encoding="ISO-8859-1")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5100 entries, 0 to 5099
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rank    5100 non-null   int64  
 1   Song    5100 non-null   object 
 2   Artist  5100 non-null   object 
 3   Year    5100 non-null   int64  
 4   Lyrics  4913 non-null   object 
 5   Source  4913 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 239.2+ KB


In [6]:
df.head(5)

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Source
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0
2,3,i cant get no satisfaction,the rolling stones,1965,,1.0
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0


# Data Loading & Pre-processing

In [7]:
df_lyrics = df[['Lyrics']]
df_lyrics.head(5)

Unnamed: 0,Lyrics
0,sam the sham miscellaneous wooly bully wooly b...
1,sugar pie honey bunch you know that i love yo...
2,
3,when i woke up this morning you were on my mi...
4,you never close your eyes anymore when i kiss...


In [8]:
# 5100개의 Lyrics가 'docs'에 list로 묶여서 저장됨
docs = list(df_lyrics['Lyrics']) 

print(len(docs), type(docs))
print(docs[0:2])

5100 <class 'list'>
['sam the sham miscellaneous wooly bully wooly bully sam the sham  the pharaohs  domingo samudio uno dos one two tres quatro matty told hatty about a thing she saw had two big horns and a wooly jaw wooly bully wooly bully wooly bully wooly bully wooly bully hatty told matty lets dont take no chance lets not belseven come and learn to dance wooly bully wooly bully wooly bully wooly bully wooly bully matty told hatty thats the thing to do get you someone really to pull the wool with you wooly bully wooly bully wooly bully wooly bully wooly bully lseven  the letter l and the number 7 when typed they form a rough square l7 so the lyrics mean lets not be square', ' sugar pie honey bunch you know that i love you i cant help myself i love you and nobody elsein and out my life you come and you go leaving just your picture behind and i kissed it a thousand timeswhen you snap your finger or wink your eye i come arunning to you im tied to your apron strings and theres nothing 

In [9]:
# docs안의 문서를 읽어서 전처리 후 다시 저장
tokenizer = RegexpTokenizer(r'\w+')   # sentence -> word (space를 만나면 tokenize)
stop = get_stop_words('en')           # english stopwords list
stemmer = PorterStemmer()             # 전처리 후 문서저장
texts = []

for d in docs:
    if(d):
        txt = re.sub(r'\d', '', str(d))                             # 숫자제거
        raw = txt.lower()                                           # 소문자화
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if not i in stop]       # stopword가 아닌 token 골라내기
        stemmed_tokens = [stemmer.stem(i) for i in stopped_tokens]  # token의 어간만 추출 (ex. spends -> spend)
        real_tokens = [t for t in stemmed_tokens if len(t) > 2]     # 'longer than 2' token만 골라내기
        texts.append(real_tokens)                                   # 걸러진 token들을 'texts' list에 저장!

In [10]:
print(texts[0])
print(type(texts))
print(len(texts))

['sam', 'sham', 'miscellan', 'wooli', 'bulli', 'wooli', 'bulli', 'sam', 'sham', 'pharaoh', 'domingo', 'samudio', 'uno', 'one', 'two', 'tre', 'quatro', 'matti', 'told', 'hatti', 'thing', 'saw', 'two', 'big', 'horn', 'wooli', 'jaw', 'wooli', 'bulli', 'wooli', 'bulli', 'wooli', 'bulli', 'wooli', 'bulli', 'wooli', 'bulli', 'hatti', 'told', 'matti', 'let', 'dont', 'take', 'chanc', 'let', 'belseven', 'come', 'learn', 'danc', 'wooli', 'bulli', 'wooli', 'bulli', 'wooli', 'bulli', 'wooli', 'bulli', 'wooli', 'bulli', 'matti', 'told', 'hatti', 'that', 'thing', 'get', 'someon', 'realli', 'pull', 'wool', 'wooli', 'bulli', 'wooli', 'bulli', 'wooli', 'bulli', 'wooli', 'bulli', 'wooli', 'bulli', 'lseven', 'letter', 'number', 'type', 'form', 'rough', 'squar', 'lyric', 'mean', 'let', 'squar']
<class 'list'>
5100


In [11]:
# pre-processing을 완료한 lyrics들을 dataframe화. len(tokenized_doc)=5100개 확인!
tokenized_doc = pd.DataFrame({"tokened_Lyrics":texts})
tokenized_doc.head(5)

Unnamed: 0,tokened_Lyrics
0,"[sam, sham, miscellan, wooli, bulli, wooli, bu..."
1,"[sugar, pie, honey, bunch, know, love, cant, h..."
2,[]
3,"[woke, morn, mind, mind, got, troubl, whoaoh, ..."
4,"[never, close, eye, anymor, kiss, lip, there, ..."


In [12]:
len(tokenized_doc['tokened_Lyrics'][2])

0

In [13]:
len(tokenized_doc)

5100

In [15]:
# 역토큰화 (토큰화 작업을 되돌림)
detokenized_doc = []
empty_cnt = 0

for i in range(len(tokenized_doc)):
    if len(tokenized_doc['tokened_Lyrics'][i]) == 0:        # 공백이면, dataframe에 공백인 행으로 저장 
        emp = ' '
        detokenized_doc.append(emp)
        empty_cnt += 1       
    else:
        t = ' '.join(tokenized_doc['tokened_Lyrics'][i])    # 공백이 아니면, 개별 단어들을 공백으로 묶어서 저장 
        detokenized_doc.append(t)

# 다시 df_lyrics['Lyrics']에 저장
df_lyrics['Lyrics'] = detokenized_doc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_lyrics['Lyrics'] = detokenized_doc


In [16]:
df_lyrics.head(5)

Unnamed: 0,Lyrics
0,sam sham miscellan wooli bulli wooli bulli sam...
1,sugar pie honey bunch know love cant help love...
2,
3,woke morn mind mind got troubl whoaoh got worr...
4,never close eye anymor kiss lip there tender l...


# Vectorization

In [17]:
# 상위 1,000개의 단어를 보존 
vectorizer = TfidfVectorizer(stop_words='english', max_features= 1000)
X = vectorizer.fit_transform(df_lyrics['Lyrics'])

# TF-IDF 행렬의 크기 확인
## (5100 × 1000) 크기를 가진 가진 TF-IDF 행렬생성
print('TF-IDF 행렬의 크기 :',X.shape)
print(X)

TF-IDF 행렬의 크기 : (5100, 1000)
  (0, 524)	0.15757310169398842
  (0, 691)	0.26702217704462905
  (0, 909)	0.24319305072566616
  (0, 582)	0.2254431098690145
  (0, 479)	0.2588637519923059
  (0, 641)	0.19549008776758917
  (0, 663)	0.1419341377534707
  (0, 783)	0.17092190472995467
  (0, 193)	0.166231812545645
  (0, 472)	0.20065379980095668
  (0, 159)	0.09708651639577003
  (0, 133)	0.18469837996309232
  (0, 232)	0.07920177498662365
  (0, 478)	0.29561095124746734
  (0, 61)	0.17207981044603918
  (0, 705)	0.19741254796648006
  (0, 860)	0.22594928894018987
  (0, 881)	0.5031308921757852
  (0, 542)	0.24788314290997585
  (1, 924)	0.04333371655373739
  (1, 30)	0.07487943680940269
  (1, 26)	0.08585866364240094
  (1, 935)	0.11081269545336714
  (1, 522)	0.08454502024863672
  (1, 28)	0.09477835442163147
  :	:
  (5098, 159)	0.023996666446102485
  (5098, 232)	0.05872839958163574
  (5098, 478)	0.02435517535714706
  (5098, 881)	0.041452595226355256
  (5099, 10)	0.02489720260032561
  (5099, 381)	0.0615463845491

# LDA Model Training

In [20]:
lda_model = LatentDirichletAllocation(n_components = 10,          # 10개의 topic으로 분류
                                      learning_method = 'online',
                                      random_state = 1117,
                                      max_iter = 1)
lda_top = lda_model.fit_transform(X)

In [21]:
print(lda_model.components_)
print(lda_model.components_.shape) 

[[ 0.18644495  0.11181385  0.1120058  ...  0.11156686  0.13288954
   0.11084717]
 [ 0.10985984  0.22699679  0.11103949 ...  0.27802346  0.25258536
   0.11096661]
 [ 0.11119818  0.11415657  0.11229536 ...  0.11340262  0.11353133
   0.11318324]
 ...
 [ 0.10986612  0.50273111  0.1126017  ...  2.82509967  0.38749308
   0.11819752]
 [ 0.11003745  0.11385403  0.11203886 ...  0.38736504  0.43534741
   0.11385114]
 [ 1.67507603 21.47574682  8.17099598 ... 26.58500258 41.57170484
   7.12453453]]
(10, 1000)


In [22]:
# 단어 집합(1,000개의 단어가 저장), 각 topic별로 상위에 rank된 단어를 불러와보자. - 이것으로 topic 주제를 지정할 수 있을 것!
terms = vectorizer.get_feature_names_out()

def get_topics(components, feature_names, n=10):       # n=5 : 각 topic(10개)에서, 상위 10개의 단어씩을 가져온다
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])

get_topics(lda_model.components_,terms)

Topic 1: [('kick', 0.51), ('whoa', 0.5), ('sunday', 0.41), ('flower', 0.38), ('hair', 0.35), ('help', 0.33), ('motion', 0.3), ('come', 0.27), ('love', 0.26), ('like', 0.25)]
Topic 2: [('bang', 7.66), ('huh', 7.19), ('born', 3.21), ('woo', 3.03), ('come', 2.73), ('duh', 2.66), ('celebr', 2.31), ('child', 2.16), ('funki', 1.97), ('everybodi', 1.94)]
Topic 3: [('nan', 141.08), ('saturday', 0.51), ('wait', 0.32), ('woman', 0.32), ('work', 0.31), ('send', 0.31), ('tire', 0.25), ('love', 0.22), ('believ', 0.2), ('man', 0.2)]
Topic 4: [('lean', 5.53), ('gangsta', 5.19), ('nana', 4.86), ('diggin', 1.84), ('thoia', 1.67), ('round', 1.61), ('hmm', 1.53), ('homi', 1.24), ('readi', 0.75), ('rap', 0.67)]
Topic 5: [('shake', 19.84), ('boom', 7.6), ('booti', 5.76), ('gon', 5.46), ('hump', 4.94), ('stroke', 3.14), ('hot', 2.81), ('disco', 1.68), ('higher', 1.34), ('love', 0.93)]
Topic 6: [('que', 13.49), ('gimm', 6.3), ('dat', 5.19), ('poison', 5.04), ('whatcha', 3.88), ('cuerpo', 3.55), ('wild', 2.81

# Document별로 topic 할당

In [27]:
# perrcentage를 이용해서, 문서별로 가장 가까운 topic으로 할당해줌
doc_topic = lda_model.transform(X)

doc_per_topic_list = []
for n in range(doc_topic.shape[0]):
    topic_most_pr = doc_topic[n].argmax()
    topic_pr = doc_topic[n].max()
    doc_per_topic_list.append([n, topic_most_pr, topic_pr])
    
doc_topic_df = pd.DataFrame(doc_per_topic_list, columns=['Doc_Num', 'Topic', 'Percentage'])

Unnamed: 0,Doc_Num,Topic,Percentage
0,0,9,0.821073
1,1,9,0.830861
2,2,0,0.1
3,3,9,0.796378
4,4,9,0.815559


In [28]:
# 각 노래별로 토픽이 할당된 것을 확인 - 총 5100개의 노래(data)
len(doc_topic_df)

5100

In [29]:
# original data에 구한 내용(topic, percentage)을 join시킴
doc_topic_df = doc_topic_df.join(df)
doc_topic_df.head()

Unnamed: 0,Doc_Num,Topic,Percentage,Rank,Song,Artist,Year,Lyrics,Source
0,0,9,0.821073,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0
1,1,9,0.830861,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0
2,2,0,0.1,3,i cant get no satisfaction,the rolling stones,1965,,1.0
3,3,9,0.796378,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0
4,4,9,0.815559,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0


In [31]:
# 토픽별로(총 10개의 topic) 문서의 수를 계산
## Topic 10: [('love', 311.01), ('like', 252.67), ('know', 248.21), ('dont', 244.34), ('babi', 211.15), 
##                        ('just', 204.33), ('want', 194.69), ('got', 186.07), ('let', 169.49), ('wanna', 167.11)]

doc_topic_df.groupby('Topic')[['Doc_Num']].count()

Unnamed: 0_level_0,Doc_Num
Topic,Unnamed: 1_level_1
0,85
1,10
2,187
3,3
4,7
5,15
6,5
7,71
8,13
9,4704
