In [1]:
import pandas as pd
import joblib
import numpy as np
import os
import re
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from torchnlp.encoders.text import WhitespaceEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split

In [19]:
# Load the lyrics dataset from CSV into a DataFrame.
music = pd.read_csv('lyrics_data_clean.csv')

In [20]:
# Remove the 'Unnamed: 0' column (presumably a leftover saved index — confirm).
# Reassigning instead of mutating in place, and tolerating an already-missing
# column, keeps this cell safe to re-run without a KeyError.
music = music.drop(columns=['Unnamed: 0'], errors='ignore')

In [21]:
# Preview the first rows of the loaded data.
music.head()

Unnamed: 0,genre,lyrics
0,pop,hold time feel break feel untrue convince spea...
1,pop,believe drop rain fall grow believe darkest ni...
2,pop,sweetheart send letter goodbye secret feel bet...
3,pop,kiss lips want stroll charm mambo chacha merin...
4,pop,till darling till matter know till dream live ...


## split train/test data set

In [22]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# Features stay a one-column DataFrame ('lyrics'); the target is the 'genre' Series.
xtrain, xtest, ytrain, ytest = train_test_split(
    music[["lyrics"]],
    music["genre"],
    test_size=0.20,
    random_state=42,
)

In [23]:
# Report (rows, columns) for the train and test features, then show the training lyrics.
for split in (xtrain, xtest):
    print(split.shape)

xtrain

(22424, 1)
(5607, 1)


Unnamed: 0,lyrics
2252,think proud enemy hand worse thing possess yea...
20274,look away look away pass door lose good good w...
2789,anarchy kill shoot raise army rabid rat beat n...
3953,blame moonlit dream die eagle flight blame moo...
7671,think cause pretty women know doin nothin tear...
...,...
21575,time come test mankind live better reach cruci...
5390,away miss goodbye wanna kiss watch show drink ...
860,today face face know away knees pray away begi...
15795,catch run little hide maybe baby gonna turn fi...


In [24]:
# Put the genre labels back next to the training lyrics (column-wise concat on the shared index).
music_train = pd.concat((xtrain, ytrain), axis="columns")
music_train.head()

Unnamed: 0,lyrics,genre
2252,think proud enemy hand worse thing possess yea...,pop
20274,look away look away pass door lose good good w...,jazz
2789,anarchy kill shoot raise army rabid rat beat n...,pop
3953,blame moonlit dream die eagle flight blame moo...,pop
7671,think cause pretty women know doin nothin tear...,country


In [25]:
# Restore the previously saved whitespace encoder from disk.
with open("encoder.pickle", "rb") as encoder_file:
    encoder: WhitespaceEncoder = joblib.load(encoder_file)

# Count words using the encoder's token -> index table as the fixed vocabulary,
# so the count-matrix columns use the same indices that encoder.decode() is given later.
cv = CountVectorizer(vocabulary=encoder.token_to_index)
xtrain1 = cv.fit_transform(list(xtrain['lyrics']))

# Fit a 7-topic LDA model on the training counts (seeded, using all available cores).
lda = LatentDirichletAllocation(
    n_components=7,
    random_state=42,
    n_jobs=-1,
)
lda.fit(xtrain1)



In [26]:
# argsort is ascending, so the last 50 columns of each row hold the indices of
# that topic's 50 largest component weights.
top_k_per_topic = lda.components_.argsort(axis=1)[:, -50:]
for topic_no, word_ids in enumerate(top_k_per_topic):
    print(f"{'=' * 40}Genre {topic_no}{'=' * 40}")
    # Reverse so the highest-weight word is decoded first.
    print(encoder.decode(word_ids[::-1]))
    print()

world like bitch time real high smoke fuck people yeah know life need face live cause come hell roll money look place tell city stay sell think go watch teach right work shit fake young ghost deal line ride stick rule paper read straight race drug drink mind school feel

heart baby go hold know long night tonight leave right believe like time want kiss stay come feel need gonna break eye wait tear tell cause arm girl sweet hand close little yeah darling love start touch look promise apart say wrong wanna true think dream fall tight light line

away good break walk know lonely feel night go wish fool miss stand come dream leave goodbye gonna yeah tell morning heart cry sleep little look whoa cause fade rain think tear say like take baby girl time memory hurt want stay blue right place wonder home dear hide yesterday

fall feel come eye know like inside head cold black dead blood hand stand soul burn hear fear leave pain fight face save lose wall kill death devil lie turn tear light grin

In [27]:
# Visualize the fitted topics with pyLDAvis.
# Attention: the topic numbers shown in the visualization are not the same as the
# "Genre N" indices printed above; use the indices printed above when mapping
# topics to genres.
# NOTE(review): presumably because pyLDAvis renumbers topics (its sort_topics
# option defaults to True) — confirm against the installed pyLDAvis version.
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda, xtrain1, cv)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
  from imp import reload


In [28]:
# Infer the topic mixture of every training document, then label each document
# with its single most probable topic.
topic_dist = lda.transform(xtrain1)
topic_labels = np.argmax(topic_dist, axis=1)

# Store the dominant topic next to the lyrics and the true genre.
music_train['topic'] = topic_labels
music_train.head()


Unnamed: 0,lyrics,genre,topic
2252,think proud enemy hand worse thing possess yea...,pop,3
20274,look away look away pass door lose good good w...,jazz,2
2789,anarchy kill shoot raise army rabid rat beat n...,pop,4
3953,blame moonlit dream die eagle flight blame moo...,pop,3
7671,think cause pretty women know doin nothin tear...,country,4


## Genre Mapping --- manual mapping based on the number of overlapping top words

In [29]:
# For each topic, decode its 50 highest-weight words (heaviest first) and
# collect the decoded results in a list indexed by topic.
top_k_per_topic = lda.components_.argsort(axis=1)[:, -50:]
mapping_list = [encoder.decode(word_ids[::-1]) for word_ids in top_k_per_topic]

In [30]:
# Display the decoded top-50 words for each topic.
mapping_list

['world like bitch time real high smoke fuck people yeah know life need face live cause come hell roll money look place tell city stay sell think go watch teach right work shit fake young ghost deal line ride stick rule paper read straight race drug drink mind school feel',
 'heart baby go hold know long night tonight leave right believe like time want kiss stay come feel need gonna break eye wait tear tell cause arm girl sweet hand close little yeah darling love start touch look promise apart say wrong wanna true think dream fall tight light line',
 'away good break walk know lonely feel night go wish fool miss stand come dream leave goodbye gonna yeah tell morning heart cry sleep little look whoa cause fade rain think tear say like take baby girl time memory hurt want stay blue right place wonder home dear hide yesterday',
 'fall feel come eye know like inside head cold black dead blood hand stand soul burn hear fear leave pain fight face save lose wall kill death devil lie turn tear

In [31]:
# Split each of the seven topics' top-word strings into a list of words.
genre0, genre1, genre2, genre3, genre4, genre5, genre6 = (
    mapping_list[topic_no].split() for topic_no in range(7)
)


In [32]:
# One training subset per genre label.
pop, country, blues, jazz, reggae, rock, hiphop = (
    music_train[music_train['genre'] == genre_name]
    for genre_name in ('pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hip hop')
)

In [33]:
def top_words(df):
    """Return every distinct word in ``df['lyrics']``, most frequent first.

    Each lyric is split on whitespace and the words of all rows are pooled
    before counting. Rows with missing lyrics contribute nothing.

    Args:
        df: DataFrame with a string column named 'lyrics'.

    Returns:
        list: The distinct words ordered by descending occurrence count.
    """
    # explode() flattens the per-row word lists directly, avoiding the dense
    # rows x longest-lyric table that split(expand=True).stack() has to build.
    words = df['lyrics'].str.split().explode()
    return list(words.value_counts().index)

# Keep the 50 most frequent words of each genre subset.
(
    pop_words,
    country_words,
    blues_words,
    jazz_words,
    reggae_words,
    rock_words,
    hiphop_words,
) = (
    top_words(genre_df)[:50]
    for genre_df in (pop, country, blues, jazz, reggae, rock, hiphop)
)

In [17]:
def compare_words(genre, topic):
    """Count the distinct words that occur in both `genre` and `topic`.

    Both arguments are iterables of words; duplicates within either one are
    counted once.
    """
    topic_vocab = set(topic)
    return sum(1 for word in set(genre) if word in topic_vocab)

In [18]:
# One printed row per LDA topic; the entries follow the genre order
# pop, country, blues, jazz, reggae, rock, hiphop and give the number of words
# shared between that genre's top words and the topic's top words.
for i in range(7):
    number = [
        compare_words(genre_words, mapping_list[i].split())
        for genre_words in (
            pop_words,
            country_words,
            blues_words,
            jazz_words,
            reggae_words,
            rock_words,
            hiphop_words,
        )
    ]
    print(number)

[19, 17, 18, 20, 21, 20, 22]
[33, 32, 31, 32, 27, 30, 22]
[28, 31, 30, 28, 24, 28, 20]
[20, 21, 22, 19, 20, 24, 15]
[28, 23, 27, 26, 29, 23, 34]
[17, 19, 19, 19, 22, 16, 15]
[28, 26, 27, 26, 25, 28, 24]


> 0 ---> rock/jazz --- jazz

> 1 ---> pop

> 2 ---> country

> 3 ---> rock

> 4 ---> hiphop

> 5 ---> reggae

> 6 ---> pop/rock --->blues

In [34]:
# Attach the manually chosen topic index for each genre to the training frame.
# The position of a genre in this list is the topic index it was mapped to.
music_train['genre_manual'] = music_train['genre'].map(
    {
        genre_name: topic_no
        for topic_no, genre_name in enumerate(
            ['jazz', 'pop', 'country', 'rock', 'hip hop', 'reggae', 'blues']
        )
    }
)
music_train.head()

Unnamed: 0,lyrics,genre,topic,genre_manual
2252,think proud enemy hand worse thing possess yea...,pop,3,1
20274,look away look away pass door lose good good w...,jazz,2,0
2789,anarchy kill shoot raise army rabid rat beat n...,pop,4,1
3953,blame moonlit dream die eagle flight blame moo...,pop,3,1
7671,think cause pretty women know doin nothin tear...,country,4,2


In [35]:
def percentage(df):
    """Return the fraction of rows whose third and fourth columns are equal.

    The comparison is positional: the columns at integer positions 2 and 3 are
    compared element by element, so the caller is responsible for the column
    order of `df`.

    Args:
        df: DataFrame with at least four columns.

    Returns:
        float: Share of matching rows in [0, 1], or NaN when `df` has no rows
        (there is nothing to score, and dividing by zero would raise).
    """
    if len(df) == 0:
        return float("nan")
    # Vectorized comparison instead of a Python-level loop over iloc lookups.
    matches = df.iloc[:, 2] == df.iloc[:, 3]
    return float(matches.mean())

# Re-slice per genre so the subsets carry the columns added to music_train
# since the earlier slicing.
pop, country, blues, jazz, reggae, rock, hiphop = (
    music_train[music_train['genre'] == genre_name]
    for genre_name in ('pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hip hop')
)

# Report, per genre, the share of rows that percentage() counts as matching.
for caption, subset in (
    ("pop correct: ", pop),
    ("country correct: ", country),
    ("blues correct: ", blues),
    ("jazz correct: ", jazz),
    ("reggae correct: ", reggae),
    ("rock correct: ", rock),
    ("hiphop correct: ", hiphop),
):
    print(caption, percentage(subset))

pop correct:  0.18487091532767647
country correct:  0.1360877684407096
blues correct:  0.1926027397260274
jazz correct:  0.060170045781556575
reggae correct:  0.09773662551440329
rock correct:  0.33632148377125193
hiphop correct:  0.6736694677871149


## Generate Data

In [36]:
def sample_from_topic(topic_idx: int, n_samples: int, rng=None):
    """Draw `n_samples` words from one LDA topic and return them decoded.

    The topic's row of `lda.components_` is normalised into a probability
    distribution over vocabulary indices, indices are drawn from it, and the
    drawn indices are handed to `encoder.decode`.

    Args:
        topic_idx: Row of `lda.components_` to sample from.
        n_samples: Number of words to draw.
        rng: Optional random source exposing `choice(a, size=..., p=...)`,
            e.g. a seeded `np.random.default_rng(...)` or
            `np.random.RandomState(...)`, for reproducible samples. When
            omitted, NumPy's global random state is used as before.

    Returns:
        The result of `encoder.decode` for the drawn indices.
    """
    weights = lda.components_[topic_idx, :]
    probs = weights / weights.sum()

    sampler = np.random if rng is None else rng
    # Size the index range from the distribution itself so the candidates and
    # their probabilities always have matching lengths.
    token_ids = sampler.choice(probs.shape[0], p=probs, size=n_samples)
    return encoder.decode(token_ids)


# Preview seven independent 50-word samples, all drawn from topic 0.
# NOTE(review): range(7) matches the number of topics — if one sample per topic
# was intended, the topic index would need to vary; confirm.
[sample_from_topic(0,50) for _ in range(7)]

['mind medicine race youth tidal watch chronic bless death people trial watch comfy bitch trap fake ignore dollar ball saw hell sneak light sign people pose teach paradise cause bitch pass high drink people tales history book thing lie nerve world people fade crime ignorance turn identity fourth cake queen',
 'speak time acid thats world reality blow corner quick link book sell disease sign wire need kid control fuckin build sell innocent place laundry place ball assume empire get give reality real mustard like champagne try forest bible proof lord fall paper nooo couple circus cake slide line plan damn',
 'unite people smoke east streets mama keep virgin education know overwhelm smoke pace lighter smoke protect fuck office money join lean talk wheel gift federally machine dash capital nerve beautiful place world catch soul survive ghost bitch afford come school poor like basket think science like finger sink grass fight',
 'circle like devil need situations smoke world real soft milli

In [37]:
# Topic index -> genre name, following the manual mapping
# (the position in the list is the topic index).
genre_mapper = dict(
    enumerate(['jazz', 'pop', 'country', 'rock', 'hip hop', 'reggae', 'blues'])
)
genre_mapper

{0: 'jazz',
 1: 'pop',
 2: 'country',
 3: 'rock',
 4: 'hip hop',
 5: 'reggae',
 6: 'blues'}

In [38]:
# Word count of the first training lyric (first row of the 'lyrics' column).
len(music_train['lyrics'].iloc[0].split())

61

In [39]:
# Word count of every training lyric, as a NumPy array.
lyric_length_distribution = (
    music_train['lyrics'].map(lambda text: len(text.split())).to_numpy()
)
lyric_length_distribution

array([ 61,  35,  67, ...,  67, 105,  28])

In [40]:
def generate_data(n_samples_per_genre):
    """Build synthetic records by sampling words from each LDA topic.

    For every topic index 0..6, `n_samples_per_genre` documents are drawn with
    `sample_from_topic`. The length of each document is picked at random from
    the word counts of the real training lyrics.

    Args:
        n_samples_per_genre: Number of synthetic documents to draw per topic.

    Returns:
        list: Tuples of (sampled text, genre name from `genre_mapper`, topic
        index), grouped by topic in ascending index order.
    """
    observed_lengths = music_train['lyrics'].map(lambda text: len(text.split())).to_numpy()

    def one_record(topic_no):
        # Pick the length first, then sample that many words from the topic.
        n_words = np.random.choice(observed_lengths)
        return (sample_from_topic(topic_no, n_words), genre_mapper[topic_no], topic_no)

    return [
        one_record(topic_no)
        for topic_no in range(7)
        for _ in range(n_samples_per_genre)
    ]
        

> For each topic, generate 4000 synthetic samples (28,000 rows in total).

In [41]:
# Generate 4000 synthetic samples per topic and tabulate them.
# The third column name is spelled 'lable' exactly as written here.
synthetic_df = pd.DataFrame(generate_data(4000), columns=['text', 'genre', 'lable'])

In [42]:
# Save the synthetic samples to CSV without the DataFrame index.
# (An earlier, commented-out variant wrote to 'synthetic_data_final.csv' instead.)
synthetic_df.to_csv('synthetic_data_s.csv', index=False)

In [28]:
#synthetic_df.head()

In [43]:
# Reload the synthetic data from disk and inspect 10 randomly chosen rows
# (sample() is not seeded here, so the rows shown vary between runs).
syn = pd.read_csv('synthetic_data_s.csv')
syn.sample(10)

Unnamed: 0,text,genre,lable
21273,food whistle tell take send wrong kitchen come...,reggae,5
7879,face go heart till look dream sorry prove clos...,pop,1
3307,message yeah birth think decide world influenc...,jazz,0
267,decide blunt daddy check drug tint mean real b...,jazz,0
1623,dont decide expire yeah catch greatness envy t...,jazz,0
11372,get good like like people cry best wrong name ...,country,2
26689,chase bridge know help smile later feel defina...,blues,6
7455,go friend anytime star know like baby lover ti...,pop,1
263,poster nasty listen spend bitch drive producer...,jazz,0
25651,win heart dark cause waste place choice fight ...,blues,6


In [44]:
# 80/20 split of the synthetic data with a fixed seed.
syn_train, syn_test = train_test_split(
    syn,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Display the synthetic training split.
syn_train

Unnamed: 0,text,genre,lable
3953,cash yeah magazine oklahoma young pay cars jum...,jazz,0
7640,baby baby baby believe come kiss need heart ba...,pop,1
2085,like right public hear feel mean tell middle f...,jazz,0
14263,come shake leave come word eye think electrici...,rock,3
6134,till start long eye door leave sweet morning h...,pop,1
...,...,...,...
21575,yeah kill higher hotel strange sound home song...,reggae,5
5390,go take heart wrong leave heart hear look put ...,pop,1
860,youths like enjoy link ask defense violence ba...,jazz,0
15795,swear yeah strong alive hang cloud stranger qu...,rock,3


In [46]:
# `cv` was already fitted with the encoder's fixed vocabulary when the training
# lyrics were vectorized, so only transform here: a vectorizer should not be
# re-fitted on the data it is used to evaluate.
syn_train1 = cv.transform(syn_train['text'].tolist())



In [47]:
# Infer the topic mixture of each synthetic document and label it with its
# single most probable topic.
topic_dist = lda.transform(syn_train1)
topic_labels = topic_dist.argmax(axis=1)

# syn_train is a subset returned by train_test_split; build a new frame with
# assign() rather than writing a column into it, which pandas may flag with a
# SettingWithCopyWarning. The new 'topic' column is appended last.
syn_train = syn_train.assign(topic=topic_labels)
syn_train.head()


Unnamed: 0,text,genre,lable,topic
3953,cash yeah magazine oklahoma young pay cars jum...,jazz,0,0
7640,baby baby baby believe come kiss need heart ba...,pop,1,1
2085,like right public hear feel mean tell middle f...,jazz,0,0
14263,come shake leave come word eye think electrici...,rock,3,3
6134,till start long eye door leave sweet morning h...,pop,1,1


In [48]:
def percentage(df):
    """Return the fraction of rows whose third and fourth columns are equal.

    The comparison is positional: the columns at integer positions 2 and 3 are
    compared element by element, so the caller is responsible for the column
    order of `df`.

    Args:
        df: DataFrame with at least four columns.

    Returns:
        float: Share of matching rows in [0, 1], or NaN when `df` has no rows
        (there is nothing to score, and dividing by zero would raise).
    """
    if len(df) == 0:
        return float("nan")
    # Vectorized comparison instead of a Python-level loop over iloc lookups.
    matches = df.iloc[:, 2] == df.iloc[:, 3]
    return float(matches.mean())

# Slice the synthetic training data per genre.
pop, country, blues, jazz, reggae, rock, hiphop = (
    syn_train[syn_train['genre'] == genre_name]
    for genre_name in ('pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hip hop')
)

# Report, per genre, the share of rows that percentage() counts as matching.
for caption, subset in (
    ("pop correct: ", pop),
    ("country correct: ", country),
    ("blues correct: ", blues),
    ("jazz correct: ", jazz),
    ("reggae correct: ", reggae),
    ("rock correct: ", rock),
    ("hiphop correct: ", hiphop),
):
    print(caption, percentage(subset))

pop correct:  0.9993744135126681
country correct:  0.9990630855715178
blues correct:  0.9990571967316153
jazz correct:  0.9984418822062948
reggae correct:  0.9990536277602524
rock correct:  0.9987456883035434
hiphop correct:  0.9993848046754845
