In [2]:
import pandas as pd
import joblib
import numpy as np
import os
import re
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from torchnlp.encoders.text import WhitespaceEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split

In [3]:
# Load the lyrics table; the relative path is resolved against the current working directory.
music = pd.read_csv('lyrics_data_clean.csv')

In [6]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier export).
# Rebinding instead of `inplace=True`, together with `errors='ignore'`, keeps
# this cell idempotent: re-running it once the column is gone no longer raises
# KeyError.
music = music.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Preview the first rows of the loaded table.
music.head()

Unnamed: 0,genre,lyrics
0,pop,hold time feel break feel untrue convince spea...
1,pop,believe drop rain fall grow believe darkest ni...
2,pop,sweetheart send letter goodbye secret feel bet...
3,pop,kiss lips want stroll charm mambo chacha merin...
4,pop,till darling till matter know till dream live ...


## split train/test data set

In [8]:
# Hold out 20% of the songs for testing; the fixed seed keeps the split repeatable.
lyrics_frame = music["lyrics"].to_frame()
genre_labels = music["genre"]
xtrain, xtest, ytrain, ytest = train_test_split(
    lyrics_frame,
    genre_labels,
    random_state=42,
    test_size=0.20,
)

In [9]:
# Print the (rows, columns) shape of the train and test features, one per line.
print(*(split.shape for split in (xtrain, xtest)), sep="\n")

# Rich display of the training features.
xtrain

(22697, 1)
(5675, 1)


Unnamed: 0,lyrics
12548,close nights morose beguine begin future look ...
8688,struttin stuff struttin stuff struttin stuff s...
17255,remember remember remember remember moon look ...
3737,russia wiser goodbye travel world learn return...
19343,good good good good good good good good good g...
...,...
21575,greatest punchin right punchin leave punchin f...
5390,lovely lovely vision stick inside memory forge...
860,teardrops heart cryin teardrops heart leave la...
15795,go buy monkey pawn shop broker teach monkey gu...


In [10]:
# Put the genre labels back next to the training lyrics (column-wise concat on the shared index).
music_train = pd.concat(objs=[xtrain, ytrain], axis="columns")
music_train.head()

Unnamed: 0,lyrics,genre
12548,close nights morose beguine begin future look ...,blues
8688,struttin stuff struttin stuff struttin stuff s...,country
17255,remember remember remember remember moon look ...,jazz
3737,russia wiser goodbye travel world learn return...,pop
19343,good good good good good good good good good g...,jazz


In [11]:
# load encoder
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load "encoder.pickle" when it comes from a trusted source.
with open("encoder.pickle", "rb") as f:
    encoder: WhitespaceEncoder = joblib.load(f)

# Build the bag-of-words counts over the encoder's own vocabulary, so the
# count-matrix columns follow `encoder.token_to_index` and topic word ids can
# be decoded with `encoder.decode` later on.
cv = CountVectorizer(vocabulary = encoder.token_to_index)
xtrain1 = cv.fit_transform(xtrain['lyrics'].tolist())
# Fit a 7-topic LDA on the training counts (presumably one topic per genre --
# TODO confirm); random_state fixes the seed and n_jobs=-1 uses all cores.
lda = LatentDirichletAllocation(n_components=7, random_state=42, n_jobs=-1)
lda.fit(xtrain1)



In [14]:
# For every topic keep the ids of its 50 highest-weight words (ascending by
# weight), then print them most-important-first under a banner line.
ranked_word_ids = np.argsort(lda.components_, axis=1)
top_k_per_topic = ranked_word_ids[:, -50:]
banner = "=" * 40
for idx in range(len(top_k_per_topic)):
    topic = top_k_per_topic[idx]
    print(banner + f"Genre {idx}" + banner)
    print(encoder.decode(topic[::-1]))
    print()

like fuck bitch shit money know yeah real cause smoke need wanna time tell world high look come think want girl feel right people damn life talk woman gotta fuckin roll stay work sell check hard drink city bout go watch face pussy fake catch game hoe live ride straight

heart baby hold night know long tonight right leave kiss like believe feel want come sweet time stay need eye go dream arm wait tell girl hand cause love true touch break close darling tear yeah promise little woman start apart fall wanna look dear light tight lips think till

away go good break know walk lonely home gonna feel leave come heart miss fool tell tear yeah stand wish little goodbye cry baby morning rain look cause think say like want whoa fade stay night take sleep hurt girl place time dream days hide belong memory yesterday wonder pain

fall feel come inside eye like head know dead cold blood black stand burn soul face leave hand fear hear lose fight pain lie death wall turn hell hide live save kill grind 

In [19]:
# visualize the topics
# Attention: pyLDAvis numbers the topics in the plot on its own, so those
# numbers do not match the topic indices printed above; use the indices
# printed above when mapping topics to genres.
import pyLDAvis
# pyLDAvis 3.4 renamed the scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`. Try the new name first and fall back to the old one so
# the cell runs on either version.
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn
pyLDAvis.enable_notebook()
pyldavis_sklearn.prepare(lda, xtrain1, cv)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
  from imp import reload


In [16]:
# Topic probabilities for every training document, as returned by the fitted LDA.
topic_dist = lda.transform(xtrain1)
# Hard assignment: keep only the index of the most probable topic per document.
topic_labels = np.argmax(topic_dist, axis=1)
# Store the assigned topic in a new 'topic' column of the training frame.
music_train['topic'] = topic_labels
music_train.head(5)


Unnamed: 0,lyrics,genre,topic
12548,close nights morose beguine begin future look ...,blues,1
8688,struttin stuff struttin stuff struttin stuff s...,country,4
17255,remember remember remember remember moon look ...,jazz,5
3737,russia wiser goodbye travel world learn return...,pop,2
19343,good good good good good good good good good g...,jazz,2


## Genre Mapping --- manual mapping based on the number of overlapping top-50 words

In [17]:
# Decode the 50 highest-weight word ids of every topic into one string per
# topic, most heavily weighted word first.
top_k_per_topic = np.argsort(lda.components_, axis=1)[:, -50:]
mapping_list = [encoder.decode(word_ids[::-1]) for word_ids in top_k_per_topic]

In [18]:
# Show the decoded top-word string of every topic.
mapping_list

['like fuck bitch shit money know yeah real cause smoke need wanna time tell world high look come think want girl feel right people damn life talk woman gotta fuckin roll stay work sell check hard drink city bout go watch face pussy fake catch game hoe live ride straight',
 'heart baby hold night know long tonight right leave kiss like believe feel want come sweet time stay need eye go dream arm wait tell girl hand cause love true touch break close darling tear yeah promise little woman start apart fall wanna look dear light tight lips think till',
 'away go good break know walk lonely home gonna feel leave come heart miss fool tell tear yeah stand wish little goodbye cry baby morning rain look cause think say like want whoa fade stay night take sleep hurt girl place time dream days hide belong memory yesterday wonder pain',
 'fall feel come inside eye like head know dead cold blood black stand burn soul face leave hand fear hear lose fight pain lie death wall turn hell hide live save 

In [20]:
# Split the top-word string of topics 0-6 into one word list per topic.
genre0, genre1, genre2, genre3, genre4, genre5, genre6 = (
    mapping_list[topic_idx].split() for topic_idx in range(7)
)


In [21]:
# One sub-frame of the training data per genre label.
genre_column = music_train['genre']
pop = music_train.loc[genre_column == 'pop']
country = music_train.loc[genre_column == 'country']
blues = music_train.loc[genre_column == 'blues']
jazz = music_train.loc[genre_column == 'jazz']
reggae = music_train.loc[genre_column == 'reggae']
rock = music_train.loc[genre_column == 'rock']
hiphop = music_train.loc[genre_column == 'hip hop']

In [22]:
def top_words(df, n=None):
    """Return the distinct words of ``df['lyrics']`` ordered by frequency.

    Every lyric is split on whitespace and the occurrences are counted over
    all rows; the most frequent word comes first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``'lyrics'``.
    n : int, optional
        If given, only the ``n`` most frequent words are returned. The default
        (``None``) returns every word, which is the original behaviour.

    Returns
    -------
    list
        Words sorted by descending count.
    """
    # explode() yields one row per token without first materialising the
    # (n_rows x longest_lyric) table that split(expand=True) would build.
    count = df['lyrics'].str.split().explode().value_counts()
    words = list(count.index)
    return words if n is None else words[:n]

# Keep the 50 most frequent words of every genre subset.
(
    pop_words,
    country_words,
    blues_words,
    jazz_words,
    reggae_words,
    rock_words,
    hiphop_words,
) = (
    top_words(genre_frame)[:50]
    for genre_frame in (pop, country, blues, jazz, reggae, rock, hiphop)
)

In [23]:
def compare_words(genre, topic):
    """Count the distinct items that occur in both ``genre`` and ``topic``."""
    shared = set(genre) & set(topic)
    return len(shared)

In [24]:
# One printed row per topic: how many of the topic's top words also appear in
# each genre's top-50 list, in the order pop, country, blues, jazz, reggae,
# rock, hiphop.
genre_word_lists = [pop_words, country_words, blues_words, jazz_words, reggae_words, rock_words, hiphop_words]
for i in range(7):
    topic_words = mapping_list[i].split()
    number = [compare_words(j, topic_words) for j in genre_word_lists]
    print(number)

[21, 18, 21, 21, 23, 19, 29]
[31, 30, 30, 32, 26, 27, 21]
[27, 30, 28, 25, 23, 26, 19]
[18, 19, 19, 17, 19, 22, 16]
[27, 24, 27, 26, 27, 23, 31]
[16, 18, 19, 17, 21, 15, 14]
[29, 27, 27, 28, 25, 29, 23]


> 0 ---> hiphop

> 1 ---> jazz

> 2 ---> country

> 3 ---> rock

> 4 ---> hiphop has the highest overlap, but topic 0 is the better hiphop match ---> next best are reggae/blues/pop (tied) ---> blues

> 5 ---> reggae

> 6 ---> pop/rock (tied) ---> pop

In [26]:
# Encode each song's genre as the topic index that was manually matched to it,
# and store it in a new 'genre_manual' column.
genre_to_topic = {'hip hop':0, 'jazz':1, 'country':2, 'rock':3, 'blues':4, 'reggae':5, 'pop':6}
music_train['genre_manual'] = music_train['genre'].map(genre_to_topic)
music_train.head(5)

Unnamed: 0,lyrics,genre,topic,genre_manual
12548,close nights morose beguine begin future look ...,blues,1,4
8688,struttin stuff struttin stuff struttin stuff s...,country,4,2
17255,remember remember remember remember moon look ...,jazz,5,1
3737,russia wiser goodbye travel world learn return...,pop,2,6
19343,good good good good good good good good good g...,jazz,2,1


In [27]:
def percentage(df):
    """Return the fraction of rows whose third and fourth columns are equal.

    The columns are addressed by position (2 and 3), not by name, so the
    caller is responsible for the column order of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least four columns.

    Returns
    -------
    float
        Share of matching rows in ``[0, 1]``; ``nan`` for an empty frame
        (previously this raised ZeroDivisionError).
    """
    if len(df) == 0:
        return float("nan")
    # Vectorised comparison replaces the per-row iloc scalar lookups.
    matches = df.iloc[:, 2].eq(df.iloc[:, 3])
    return int(matches.sum()) / len(df)

# Re-slice the training frame by genre so every subset carries the columns at
# positions 2 and 3 ('topic' and 'genre_manual') that percentage() compares.
genre_column = music_train['genre']
pop = music_train.loc[genre_column == 'pop']
country = music_train.loc[genre_column == 'country']
blues = music_train.loc[genre_column == 'blues']
jazz = music_train.loc[genre_column == 'jazz']
reggae = music_train.loc[genre_column == 'reggae']
rock = music_train.loc[genre_column == 'rock']
hiphop = music_train.loc[genre_column == 'hip hop']

# Share of songs per genre whose assigned topic equals the manual mapping.
for caption, genre_frame in (
    ("pop correct: ", pop),
    ("country correct: ", country),
    ("blues correct: ", blues),
    ("jazz correct: ", jazz),
    ("reggae correct: ", reggae),
    ("rock correct: ", rock),
    ("hiphop correct: ", hiphop),
):
    print(caption, percentage(genre_frame))

pop correct:  0.2083628632175762
country correct:  0.16496992133271635
blues correct:  0.1370309951060359
jazz correct:  0.16923570969813745
reggae correct:  0.08875441250630359
rock correct:  0.35015384615384615
hiphop correct:  0.4688385269121813


## Generate Data

In [42]:
# sample from topic
def sample_from_topic(topic_idx: int, n_samples: int, rng=None):
    """Draw ``n_samples`` words from one LDA topic and decode them.

    The topic's row of ``lda.components_`` is normalised to sum to one and
    used as sampling probabilities over the ids ``0 .. encoder.vocab_size - 1``.

    Parameters
    ----------
    topic_idx : int
        Row of ``lda.components_`` to sample from.
    n_samples : int
        Number of word ids to draw.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global ``np.random`` state
        is used, which is the original behaviour; pass a seeded generator to
        make the draw reproducible.

    Returns
    -------
    Whatever ``encoder.decode`` returns for the sampled ids (a space-separated
    string in the outputs shown in this notebook).
    """
    comp = lda.components_[topic_idx, :]
    comp = comp / comp.sum()

    sampler = np.random if rng is None else rng
    token_ids = sampler.choice(np.arange(encoder.vocab_size), p=comp, size=n_samples)
    return encoder.decode(token_ids)


# Seven independent 30-word samples from topic 0, shown as a quick sanity check.
[sample_from_topic(topic_idx=0, n_samples=30) for _ in range(7)]

['know tell year high sense heel hoe get need money turn feel smash japan cheese master taste mama lock credit attend people life wheel urgency activist rain wife bitch bust',
 'crook yeah like live step whoa care commercial maintain tell fridge shit proud fuck hold feature pose flow throw shit seat seat handle teacher school bitch nasty help choose solo',
 'hear real hate smoke shorter money fuck brooklyn woman master constantly order hello catch grace smoke dash pull think come rubber life gold quickly order gate focus like reality wrong',
 'drink television cook know world fake handle nicer hunger plastic pop chase film like slide bigger wanna cross foreign yeah thang brown caution shit honest career break shit cause vet',
 'scenes dumb advice think real nerve fuck coat bold wanna chew hill highway fuck rise acid ball poker cause practice purpose like plug promote money give girl cake girl secure',
 'fine little hear pop ride tell want shit fuck feel funny fee drill bout real strip 

In [43]:
# Manual genre -> topic-index assignment, inverted so a topic index can be
# looked up to get its genre name.
genre_to_index = {'hip hop':0, 'jazz':1, 'country':2, 'rock':3, 'blues':4, 'reggae':5, 'pop':6}
genre_mapper = {index: genre for genre, index in genre_to_index.items()}
genre_mapper

{0: 'hip hop',
 1: 'jazz',
 2: 'country',
 3: 'rock',
 4: 'blues',
 5: 'reggae',
 6: 'pop'}

In [44]:
# Number of whitespace-separated words in the first training lyric
# (row 0, column 0 of music_train).
first_lyric = music_train.iloc[0, 0]
len(first_lyric.split())

35

In [45]:
# Word count of every training lyric, as a NumPy array.
def _word_count(text):
    return len(text.split())

lyric_length_distribution = music_train['lyrics'].map(_word_count).to_numpy()
lyric_length_distribution

array([ 35,  66,  74, ...,  51,  92, 127])

In [46]:
# generate synthetic data based on topic distribution
def generate_data(n_samples_per_genre, seed=None):
    """Generate synthetic lyrics for every topic index in ``genre_mapper``.

    For each topic, ``n_samples_per_genre`` texts are produced. The length of
    each text is drawn at random from the word counts of the training lyrics,
    and its words are sampled with ``sample_from_topic``.

    Parameters
    ----------
    n_samples_per_genre : int
        Number of synthetic texts to create per topic.
    seed : int, optional
        If given, NumPy's global random state is seeded first so the whole
        dataset can be regenerated exactly. The default (``None``) leaves the
        random state untouched, which is the original behaviour.

    Returns
    -------
    list of tuple
        ``(text, genre_name, topic_idx)`` triples, grouped by topic index in
        ascending order.
    """
    if seed is not None:
        # Both the length draw below and sample_from_topic (as called here)
        # use NumPy's global random state.
        np.random.seed(seed)
    lyric_length_distribution = music_train['lyrics'].apply(lambda r: len(r.split())).values
    synthetic_data = []
    # Iterate over the mapped topic indices instead of a hard-coded range(7),
    # so every sampled topic is guaranteed to have a genre name.
    for idx in sorted(genre_mapper):
        for _ in range(n_samples_per_genre):
            n_words = np.random.choice(lyric_length_distribution)
            synthetic_data.append((sample_from_topic(idx, n_words), genre_mapper[idx], idx))
    return synthetic_data
        

> for each topic, generate 4000 synthetic samples

In [74]:
# 4000 synthetic texts per topic. The column name 'lable' is kept as spelled
# because the saved CSV read later in this notebook uses the same header.
synthetic_rows = generate_data(4000)
synthetic_df = pd.DataFrame(data=synthetic_rows, columns=['text', 'genre', 'lable'])

In [75]:
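# One-off export of the generated data; presumably left commented out so that
# re-running the notebook does not overwrite the saved file that is loaded below.
#synthetic_df.to_csv('synthetic_data_final.csv', index=False)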

In [76]:
#synthetic_df.head()

Unnamed: 0,text,genre,lable
0,bucket vibe hood coast like funny dont wanna s...,hip hop,0
1,movies machine phone undergo fuck girl wild ca...,hip hop,0
2,people check fool feel burn jewelry right slat...,hip hop,0
3,watch come feel empire government compilation ...,hip hop,0
4,chain like real world touch close bring today ...,hip hop,0


In [77]:
# Load the saved synthetic dataset and look at ten randomly chosen rows.
synthetic_csv_path = 'synthetic_data_final.csv'
syn = pd.read_csv(synthetic_csv_path)
syn.sample(n=10)

Unnamed: 0,text,genre,lable
2427,live fuck white thousand bigger crush plot mon...,hip hop,0
25074,thy better roads learn happiness shoot relatio...,pop,6
16134,head pocket shoot girls hard turn plus beat so...,blues,4
6174,arm need come hold want heart know make cause ...,jazz,1
18195,teeth want friends ship drink bang dime sound ...,blues,4
18521,face brain need strap selfies close booze chic...,blues,4
20199,know garden hear redeem people eye culture pla...,reggae,5
15087,want banner touch mountains hate gold right de...,rock,3
14155,burn soul face taste eye speak body reach fall...,rock,3
8808,feel send away place trouble real yeah wanna m...,country,2


In [78]:
# 80/20 split of the synthetic rows with a fixed seed.
syn_splits = train_test_split(syn, random_state=42, test_size=0.2)
syn_train, syn_test = syn_splits

In [79]:
# Display the synthetic training split.
syn_train

Unnamed: 0,text,genre,lable
3953,cause pull elevate project bitch response fire...,hip hop,0
7640,lose heart right true prayer heart star cause ...,jazz,1
2085,clear tall cruise bring hungry bitch throw tal...,hip hop,0
14263,nerve feel hand fault cold surely enter danger...,rock,3
6134,convergence begin leave come tonight close lea...,jazz,1
...,...,...,...
21575,song london remember book home roll coal borde...,reggae,5
5390,tire baby dark forever dream world crocodile l...,jazz,1
860,school tooth like life different cause heir lo...,hip hop,0
15795,hill head race bare useless course reckless ow...,rock,3


In [80]:
# `cv` was created with a fixed vocabulary, so only `transform` is needed here:
# the synthetic text is mapped onto the same columns the LDA model was fitted
# on, and the vectorizer is not re-fitted on evaluation data.
syn_train1 = cv.transform(syn_train['text'].tolist())



In [81]:
# get topic distribution for each document
# find the topic with the highest probability
# and assign it to the document
topic_dist = lda.transform(syn_train1)
topic_labels = topic_dist.argmax(axis=1)
# Attach the labels with assign(): syn_train is a slice produced by
# train_test_split, and item assignment on such a slice can raise pandas'
# SettingWithCopyWarning. assign() returns a new frame with 'topic' added
# (or replaced in place when the cell is re-run).
syn_train = syn_train.assign(topic=topic_labels)
syn_train.head()  # preview the synthetic rows with their assigned topic


Unnamed: 0,text,genre,lable,topic
3953,cause pull elevate project bitch response fire...,hip hop,0,0
7640,lose heart right true prayer heart star cause ...,jazz,1,1
2085,clear tall cruise bring hungry bitch throw tal...,hip hop,0,0
14263,nerve feel hand fault cold surely enter danger...,rock,3,3
6134,convergence begin leave come tonight close lea...,jazz,1,1


In [82]:
def percentage(df):
    """Return the fraction of rows whose third and fourth columns are equal.

    The columns are addressed by position (2 and 3), not by name, so the
    caller is responsible for the column order of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least four columns.

    Returns
    -------
    float
        Share of matching rows in ``[0, 1]``; ``nan`` for an empty frame
        (previously this raised ZeroDivisionError).
    """
    if len(df) == 0:
        return float("nan")
    # Vectorised comparison replaces the per-row iloc scalar lookups.
    matches = df.iloc[:, 2].eq(df.iloc[:, 3])
    return int(matches.sum()) / len(df)

# Slice the synthetic training rows by genre; percentage() then compares the
# columns at positions 2 and 3 ('lable' and 'topic') within each subset.
syn_genre_column = syn_train['genre']
pop = syn_train.loc[syn_genre_column == 'pop']
country = syn_train.loc[syn_genre_column == 'country']
blues = syn_train.loc[syn_genre_column == 'blues']
jazz = syn_train.loc[syn_genre_column == 'jazz']
reggae = syn_train.loc[syn_genre_column == 'reggae']
rock = syn_train.loc[syn_genre_column == 'rock']
hiphop = syn_train.loc[syn_genre_column == 'hip hop']

# Share of synthetic texts per genre that the LDA maps back to their source topic.
for caption, genre_frame in (
    ("pop correct: ", pop),
    ("country correct: ", country),
    ("blues correct: ", blues),
    ("jazz correct: ", jazz),
    ("reggae correct: ", reggae),
    ("rock correct: ", rock),
    ("hiphop correct: ", hiphop),
):
    print(caption, percentage(genre_frame))

pop correct:  0.9993714644877436
country correct:  0.9993753903810119
blues correct:  0.9981544140264534
jazz correct:  0.9993744135126681
reggae correct:  0.9990536277602524
rock correct:  0.9993728441517717
hiphop correct:  0.9993767528825179
