In [1]:
import pandas as pd
import joblib
import numpy as np
import os
import re
import gensim
import random
from gensim.utils import simple_preprocess
import nltk
import gensim.corpora as corpora
from pprint import pprint
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from torchnlp.encoders.text import WhitespaceEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split

In [2]:
# for visualization
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis

  from imp import reload


In [3]:
music = pd.read_csv('/Users/liuxiaoquan/Documents/703/Final_Project/tcc_ceds_music.csv')

In [4]:
music.sample(3)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
8499,24333,george jones,radio lover,1972,country,curly putman hellard jones speak kiss goodbye ...,125,0.000478,0.000478,0.032713,...,0.075688,0.022332,0.663165,0.507884,0.547188,2.4e-05,0.288953,0.294272,music,0.685714
15111,48277,sonic youth,self-obsessed and sexxee,1994,blues,remember arrive magic marker belly button alri...,73,0.050807,0.055624,0.101347,...,0.001032,0.001032,0.46388,0.624516,0.021987,2.4e-05,0.174567,0.658648,obscene,0.371429
6730,19300,rihanna,sex with me,2016,pop,amaze hard work vacation stay instagram pure t...,99,0.040663,0.00112,0.00112,...,0.00112,0.00112,0.603596,0.768377,0.327309,0.0,0.377576,0.539525,obscene,0.057143


In [5]:
music.shape

(28372, 31)

In [6]:
music1 = music.loc[:,['genre','lyrics']]

In [7]:
music1.sample(3)

Unnamed: 0,genre,lyrics
3506,pop,good thing life go away know go away know foll...
2143,pop,alright come pray hurray hurray wear smoke cig...
7625,country,free burden power blood power blood evil victo...


> 7 genre in totall

In [8]:
music1['genre'].unique()

array(['pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hip hop'],
      dtype=object)

In [9]:
# remove puntuation
music1['final_lyrics'] = music1['lyrics'].map(lambda x : re.sub(r"""[!"#\$%&'\(\)\*\+,-\./:;\<=\>?\[\]\^_`\{\|\}~“”’]""", '', x))
#lower case
music1['final_lyrics'] = music1['final_lyrics'].map(lambda x: x.lower())
#remove double space
music1['final_lyrics'] = music1["final_lyrics"].str.replace(r"\s+", " ", regex=True).str.strip()

In [10]:
lemmatizer = WordNetLemmatizer()
music1['final_lyrics'] = music1['final_lyrics'].map(lambda x: ' '.join([lemmatizer.lemmatize(word, pos='v') for word in x.split()]))

In [11]:
nltk.download('stopwords')
stop_words = stopwords.words('english')
#ADD 'im', 'dont','yeah' to stop words if make sense
#stop_words.extend(['oh','ohh','im', 'dont','yeah'])
rm_words = set(stop_words)
music1["rm_ly"] = music1["final_lyrics"].str.split().apply(lambda x: " ".join(word for word in x if word not in rm_words))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liuxiaoquan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
music_clean = music1.loc[:,['genre','rm_ly']]

In [13]:
music_clean.sample(3)

Unnamed: 0,genre,rm_ly
892,pop,heart days brighter heart feel lighter wrong r...
20693,jazz,quick think good true worst best things foreve...
16564,blues,mouth push hear heart pound hero voice scream ...


In [14]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

# Split train and test

In [15]:
xtrain, xtest, ytrain, ytest = train_test_split(
    music_clean["rm_ly"].to_frame(), music_clean["genre"], test_size=0.2, random_state=42)

In [16]:
xtrain.head(5)

Unnamed: 0,rm_ly
12548,close nights morose beguine begin future look ...
8688,struttin stuff struttin stuff struttin stuff s...
17255,remember remember remember remember moon look ...
3737,russia wiser goodbye travel world learn return...
19343,good good good good good good good good good g...


In [17]:
ytrain.head(5)

12548      blues
8688     country
17255       jazz
3737         pop
19343       jazz
Name: genre, dtype: object

In [18]:
data_words = list(sent_to_words(xtrain['rm_ly']))
print(data_words[:1])

[['close', 'nights', 'morose', 'beguine', 'begin', 'future', 'look', 'allibied', 'moment', 'dear', 'dear', 'moment', 'blue', 'songs', 'whoop', 'songs', 'moment', 'need', 'skin', 'touch', 'arm', 'hold', 'tight', 'sweet', 'lips', 'kiss', 'goodnight', 'moment', 'babe', 'ridin', 'high', 'babe', 'care', 'go', 'moment']]


> For each document we create a dictionary reporting how many words and how many times those words appear. Save this to ‘id2word’

In [19]:
#create dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 5), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1)]]


In [20]:
#number of topics
num_topics = 7
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=id2word,num_topics=num_topics,random_state=42)
# Print the Keyword in the 7 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.029*"like" + 0.015*"bitch" + 0.013*"fuck" + 0.012*"shit" + 0.011*"nigga" '
  '+ 0.009*"know" + 0.008*"niggas" + 0.008*"come" + 0.007*"time" + '
  '0.006*"yeah"'),
 (1,
  '0.053*"life" + 0.025*"dream" + 0.024*"come" + 0.024*"live" + 0.021*"want" + '
  '0.019*"know" + 0.017*"mind" + 0.017*"time" + 0.014*"sweet" + 0.012*"love"'),
 (2,
  '0.035*"time" + 0.034*"heart" + 0.030*"know" + 0.025*"go" + 0.025*"feel" + '
  '0.022*"away" + 0.021*"long" + 0.021*"break" + 0.018*"baby" + 0.017*"leave"'),
 (3,
  '0.040*"sing" + 0.034*"song" + 0.029*"play" + 0.022*"hear" + 0.016*"ready" + '
  '0.016*"woman" + 0.015*"music" + 0.011*"fool" + 0.010*"summer" + '
  '0.009*"come"'),
 (4,
  '0.029*"yeah" + 0.026*"good" + 0.025*"right" + 0.020*"like" + 0.019*"know" + '
  '0.018*"better" + 0.017*"money" + 0.016*"cause" + 0.015*"baby" + '
  '0.014*"time"'),
 (5,
  '0.019*"like" + 0.016*"world" + 0.016*"away" + 0.012*"believe" + '
  '0.012*"know" + 0.011*"head" + 0.010*"come" + 0.009*"black" + 0.008*"yea

In [21]:
# Visualize the topics ----注意！这里的topic数根上面对应的不一样 ！！！ 以上面的为准
pyLDAvis.enable_notebook()
vis_filepath = os.path.join('/Users/liuxiaoquan/Documents/703/Final_Project/LDA'+str(num_topics))

if not os.path.isfile(vis_filepath):
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(vis_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(vis_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, '/Users/liuxiaoquan/Documents/703/Final_Project/LDA'+str(num_topics)+'.html')

LDAvis_prepared


# Genre Mapping --- manually mapping based on percentage of overlap words

In [22]:
# top 30 words in each topic
top_words = 30
topic_summaries = []
topic_word = lda_model.show_topics(formatted=False, num_words=top_words)
for topic, words in topic_word:
    topic_summaries.append(' '.join([i[0] for i in words]))

topic_summaries


['like bitch fuck shit nigga know niggas come time yeah dead cause beat kill smoke want pull real blow game walk kick wanna white drop turn high bang play brain',
 'life dream come live want know mind time sweet love change world think true things wanna like need wait somebody alive yeah baby girl feel hold lover tell cause wish',
 'time heart know go feel away long break baby leave like need night stay cause hurt lonely think tell walk want start come take right kiss look yeah lose miss',
 'sing song play hear ready woman music fool summer come shoot know wish like call sound listen learn songs ring word blue dance go write guitar radio game bring say',
 'yeah good right like know better money cause baby time wanna gonna gotta girl tell want bout think come fuck look little talk need lookin nothin feel damn stop shit',
 'like world away believe know head come black yeah change people live cause go want tell look stand party think days give need face feel time life say hand place',
 'c

In [23]:
genre0 = topic_summaries[0].split()
genre1 = topic_summaries[1].split()
genre2 = topic_summaries[2].split()
genre3 = topic_summaries[3].split()
genre4 = topic_summaries[4].split()
genre5 = topic_summaries[5].split()
genre6 = topic_summaries[6].split()


In [24]:
# pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hip hop'
pop = music_clean[music_clean['genre'] == 'pop']
country = music_clean[music_clean['genre'] == 'country']
blues = music_clean[music_clean['genre'] == 'blues']
jazz = music_clean[music_clean['genre'] == 'jazz']
reggae = music_clean[music_clean['genre'] == 'reggae']
rock = music_clean[music_clean['genre'] == 'rock']
hiphop = music_clean[music_clean['genre'] == 'hip hop']

In [25]:
# put top words in each genre into a list
def top_words(df):
    count = df['rm_ly'].str.split(expand=True).stack().value_counts()
    return list(count.index)

pop_words = top_words(pop)[:30]
country_words = top_words(country)[:30]
blues_words = top_words(blues)[:30]
jazz_words = top_words(jazz)[:30]
reggae_words = top_words(reggae)[:30]
rock_words = top_words(rock)[:30]
hiphop_words = top_words(hiphop)[:30]


In [26]:
# compare top words in genre and top words in topic
def compare_words(genre, topic):
    compare = set(genre).intersection(set(topic))
    return len(compare)

In [27]:
compare_words(pop_words, genre0)
compare_words(pop_words, genre1)

18

In [28]:
for i in range(7):
    number = []
    for j in [pop_words, country_words, blues_words, jazz_words, reggae_words, rock_words, hiphop_words]:
        number.append(compare_words(j, topic_summaries[i].split()))
    print(number)

[8, 7, 7, 7, 8, 7, 15]
[18, 16, 16, 18, 17, 17, 15]
[21, 20, 21, 20, 20, 21, 17]
[4, 5, 6, 5, 5, 5, 5]
[18, 15, 16, 15, 17, 15, 18]
[17, 16, 17, 16, 19, 17, 15]
[14, 15, 14, 13, 13, 15, 8]


## Mapping --- 0 & 1 & 5 overlap
> 0 ---> hiphop

> 1 ---> pop/jazz ---> jazz

> 2 ---> pop/blues/rock ---> rock

> 3 ---> blues

> 4 ---> pop/hiphop --->pop

> 5 ---> reggae

> 6 ---> country/rock ---> country

In [29]:
# generate topic distribution for each document
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_words):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'] + ['Text']
    return(sent_topics_df)

In [30]:
dominate = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_words)
dominate.head(10)

  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
0,2,0.5253,"time, heart, know, go, feel, away, long, break...","[close, nights, morose, beguine, begin, future..."
1,4,0.9081,"yeah, good, right, like, know, better, money, ...","[struttin, stuff, struttin, stuff, struttin, s..."
2,6,0.733,"come, fall, hold, night, know, home, hand, ton...","[remember, remember, remember, remember, moon,..."
3,1,0.651,"life, dream, come, live, want, know, mind, tim...","[russia, wiser, goodbye, travel, world, learn,..."
4,4,0.9286,"yeah, good, right, like, know, better, money, ...","[good, good, good, good, good, good, good, goo..."
5,2,0.5304,"time, heart, know, go, feel, away, long, break...","[sorry, presume, respect, heart, mean, underst..."
6,6,0.6268,"come, fall, hold, night, know, home, hand, ton...","[speak, caress, mind, speak, reach, time]"
7,5,0.5517,"like, world, away, believe, know, head, come, ...","[skull, ohehoh, like, hungry, tick, clock, foo..."
8,0,0.895,"like, bitch, fuck, shit, nigga, know, niggas, ...","[pray, waaaaay, yeah, yeah, remember, syrup, s..."
9,6,0.6445,"come, fall, hold, night, know, home, hand, ton...","[dimenticar, mean, forget, darling, forget, me..."


In [31]:
# concate xtrain and ytrain
music_train = pd.concat([xtrain, ytrain], axis=1)
music_train.head(5)

Unnamed: 0,rm_ly,genre
12548,close nights morose beguine begin future look ...,blues
8688,struttin stuff struttin stuff struttin stuff s...,country
17255,remember remember remember remember moon look ...,jazz
3737,russia wiser goodbye travel world learn return...,pop
19343,good good good good good good good good good g...,jazz


In [32]:
music_train['lable'] = music_train['genre'].map({'pop':4, 'country':6, 'blues':3, 'jazz':1, 'reggae':5, 'rock':2, 'hip hop':0})
music_train.head(5)

Unnamed: 0,rm_ly,genre,lable
12548,close nights morose beguine begin future look ...,blues,3
8688,struttin stuff struttin stuff struttin stuff s...,country,6
17255,remember remember remember remember moon look ...,jazz,1
3737,russia wiser goodbye travel world learn return...,pop,4
19343,good good good good good good good good good g...,jazz,1


In [33]:
# reset index from 0
music_train = music_train.reset_index(drop=True)

In [34]:
music_train.head(5)

Unnamed: 0,rm_ly,genre,lable
0,close nights morose beguine begin future look ...,blues,3
1,struttin stuff struttin stuff struttin stuff s...,country,6
2,remember remember remember remember moon look ...,jazz,1
3,russia wiser goodbye travel world learn return...,pop,4
4,good good good good good good good good good g...,jazz,1


In [35]:
#join music_train and dominate based on index
music_join = music_train.join(dominate)
music_join.sample(10)

Unnamed: 0,rm_ly,genre,lable,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
18382,humanoid escapee android seek freedom beneath ...,rock,2,5,0.412,"like, world, away, believe, know, head, come, ...","[humanoid, escapee, android, seek, freedom, be..."
14038,goddamn roll kinda stuff save souls horny thro...,blues,3,4,0.6208,"yeah, good, right, like, know, better, money, ...","[goddamn, roll, kinda, stuff, save, souls, hor..."
4924,streets go fuck walk spitter walk say walk say...,hip hop,0,0,0.3462,"like, bitch, fuck, shit, nigga, know, niggas, ...","[streets, go, fuck, walk, spitter, walk, say, ..."
7261,know crazy cross eye dumb dirty smelly lovely ...,blues,3,6,0.3561,"come, fall, hold, night, know, home, hand, ton...","[know, crazy, cross, eye, dumb, dirty, smelly,..."
8101,shame waste youth spade spade things things ye...,jazz,1,1,0.2554,"life, dream, come, live, want, know, mind, tim...","[shame, waste, youth, spade, spade, things, th..."
16206,trouble disturb care worry trouble days feel s...,pop,4,6,0.693,"come, fall, hold, night, know, home, hand, ton...","[trouble, disturb, care, worry, trouble, days,..."
9023,captive slave boat row swamp throw lake alliga...,blues,3,6,0.9699,"come, fall, hold, night, know, home, hand, ton...","[captive, slave, boat, row, swamp, throw, lake..."
1826,sadder lonelier lonely misery like time equal ...,blues,3,6,0.3785,"come, fall, hold, night, know, home, hand, ton...","[sadder, lonelier, lonely, misery, like, time,..."
15419,wanna lose want walk away good tell forever ca...,pop,4,2,0.887,"time, heart, know, go, feel, away, long, break...","[wanna, lose, want, walk, away, good, tell, fo..."
13132,hear news baby disease yeah want baby somethin...,rock,2,4,0.8742,"yeah, good, right, like, know, better, money, ...","[hear, news, baby, disease, yeah, want, baby, ..."


In [36]:
#subset music_join based on genre
music_join_pop = music_join[music_join['genre'] == 'pop'].loc[:,['genre','lable', 'Dominant_Topic']]
music_join_country = music_join[music_join['genre'] == 'country'].loc[:,['genre','lable', 'Dominant_Topic']]
music_join_blues = music_join[music_join['genre'] == 'blues'].loc[:,['genre','lable', 'Dominant_Topic']]
music_join_jazz = music_join[music_join['genre'] == 'jazz'].loc[:,['genre','lable', 'Dominant_Topic']]
music_join_reggae = music_join[music_join['genre'] == 'reggae'].loc[:,['genre','lable', 'Dominant_Topic']]
music_join_rock = music_join[music_join['genre'] == 'rock'].loc[:,['genre','lable', 'Dominant_Topic']]

In [37]:
# calculate the percentage when dominant topic is the same as label
def percentage(df):
    count = 0
    for i in range(len(df)):
        if df.iloc[i,1] == df.iloc[i,2]:
            count += 1
    return count/len(df)

print("pop correct: " , percentage(music_join_pop))
print("country correct: " ,percentage(music_join_country))
print("blues correct: " ,percentage(music_join_blues))
print("jazz correct: " ,percentage(music_join_jazz))
print("reggae correct: " ,percentage(music_join_reggae))
print("rock correct: " ,percentage(music_join_rock))


pop correct:  0.11091424521615875
country correct:  0.25404905136510875
blues correct:  0.07041870581837956
jazz correct:  0.14418754014129737
reggae correct:  0.2344931921331316
rock correct:  0.23846153846153847


## Find top words based on train data

> Almost the same, no obvious difference

In [38]:
popnew = music_train[music_train['genre'] == 'pop']
countrynew = music_train[music_train['genre'] == 'country']
bluesnew = music_train[music_train['genre'] == 'blues']
jazznew = music_train[music_train['genre'] == 'jazz']
reggaenew = music_train[music_train['genre'] == 'reggae']
rocknew = music_train[music_train['genre'] == 'rock']
hiphopnew = music_train[music_train['genre'] == 'hip hop']

In [39]:
pop_words_new = top_words(popnew)[:30]
country_words_new = top_words(countrynew)[:30]
blues_words_new = top_words(bluesnew)[:30]
jazz_words_new = top_words(jazznew)[:30]
reggae_words_new = top_words(reggaenew)[:30]
rock_words_new = top_words(rocknew)[:30]
hiphop_words_new = top_words(hiphopnew)[:30]


In [40]:
for i in range(7):
    number = []
    for j in [pop_words_new, country_words_new, blues_words_new, jazz_words_new, reggae_words_new, rock_words_new, hiphop_words_new]:
        number.append(compare_words(j, topic_summaries[i].split()))
    print(number)

[8, 7, 7, 7, 8, 7, 14]
[18, 17, 15, 18, 17, 16, 15]
[21, 20, 20, 20, 20, 20, 17]
[4, 5, 6, 5, 4, 5, 5]
[18, 15, 15, 15, 17, 14, 18]
[17, 16, 17, 16, 19, 18, 15]
[14, 15, 15, 13, 14, 16, 8]


## Synthetic data

### try to use toppic keyword instead

In [67]:
pprint(lda_model.print_topics(num_words=100))

[(0,
  '0.029*"like" + 0.015*"bitch" + 0.013*"fuck" + 0.012*"shit" + 0.011*"nigga" '
  '+ 0.009*"know" + 0.008*"niggas" + 0.008*"come" + 0.007*"time" + '
  '0.006*"yeah" + 0.005*"dead" + 0.005*"cause" + 0.005*"beat" + 0.004*"kill" + '
  '0.004*"smoke" + 0.004*"want" + 0.004*"pull" + 0.004*"real" + 0.004*"blow" + '
  '0.003*"game" + 0.003*"walk" + 0.003*"kick" + 0.003*"wanna" + 0.003*"white" '
  '+ 0.003*"drop" + 0.003*"turn" + 0.003*"high" + 0.003*"bang" + 0.003*"play" '
  '+ 0.003*"brain" + 0.003*"light" + 0.003*"think" + 0.003*"head" + '
  '0.003*"pussy" + 0.003*"need" + 0.003*"drink" + 0.003*"holy" + 0.003*"block" '
  '+ 0.002*"get" + 0.002*"look" + 0.002*"street" + 0.002*"feel" + '
  '0.002*"start" + 0.002*"right" + 0.002*"open" + 0.002*"dirty" + 0.002*"tell" '
  '+ 0.002*"bust" + 0.002*"tryna" + 0.002*"break" + 0.002*"suck" + '
  '0.002*"lyric" + 0.002*"catch" + 0.002*"step" + 0.002*"leave" + 0.002*"flow" '
  '+ 0.002*"house" + 0.002*"throw" + 0.002*"black" + 0.002*"hard" + '
  '0

In [68]:
#get the top 100 words in each topic and their frequency
top_words = []
for i in range(7):
    top_words.append(lda_model.show_topic(i, topn=100))
    top_words[i] = pd.DataFrame(top_words[i])
    top_words[i].columns = ['word', 'freq']

In [69]:
top_words[0]

Unnamed: 0,word,freq
0,like,0.029290
1,bitch,0.014592
2,fuck,0.012766
3,shit,0.011615
4,nigga,0.011494
...,...,...
95,take,0.001515
96,gotta,0.001514
97,little,0.001510
98,outside,0.001504


> Sample from each topic based on the frequency of each word

In [70]:
def sample_from_topic_new(topic, n):
    df = top_words[topic]
    #choose n words from column word based on the frequency in column freq
    # put the result into a list
    sentence = random.choices(df['word'], weights=df['freq'], k=n)
    # join the list into a string
    sentence = ' '.join(sentence)
    return sentence


In [71]:
sample_from_topic_new(0,15)
sample_from_topic_new(1,15)
sample_from_topic_new(2,15)
sample_from_topic_new(3,15)
sample_from_topic_new(4,15)
sample_from_topic_new(5,15)
sample_from_topic_new(6,15)


'inside inside think good head rain hold home deep light head tell smile face gonna'

In [72]:
genre = ['pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hiphop']
genre_mapper = {0:'hiphop',1:'jazz',2:'rock',3:'blues',4:'pop',5:'reggae',6:'country'}

In [73]:
lyric_length_distribution = music_join['Text'].apply(len)
lyric_length_distribution

0         35
1         66
2         74
3         25
4         11
        ... 
22692     74
22693     82
22694     51
22695     92
22696    127
Name: Text, Length: 22697, dtype: int64

In [74]:
# generate synthetic data based on topic distribution
def generate_data(n_samples_per_genre):
    lyric_length_distribution = music_join['Text'].apply(len)  
    synthetic_data = []
    for idx in range(7):
        for _ in range(n_samples_per_genre):
            synthetic_data.append((sample_from_topic_new(idx, np.random.choice(lyric_length_distribution)), genre_mapper[idx], idx))
    return synthetic_data
        

> For each topic, generate 2000 data

In [75]:
#synthetic_df = pd.DataFrame(generate_data(2000), columns=['Text', 'genre', 'lable'])

In [76]:
#synthetic_df.shape

(14000, 3)

In [77]:
#synthetic_df.to_csv('synthetic_data_new.csv', index=False)

In [78]:
#synthetic_df.head()

Unnamed: 0,Text,genre,lable
0,white cause kick high little like nigga like b...,hiphop,0
1,head roll tell feel bring dirty kill stop high...,hiphop,0
2,catch hard know walk black bitch brain right b...,hiphop,0
3,drink green best cause want pull dead fuck ban...,hiphop,0
4,little game nigga know white start turn step s...,hiphop,0


In [79]:
syn = pd.read_csv('synthetic_data_new.csv')
syn.head()

Unnamed: 0,Text,genre,lable
0,white cause kick high little like nigga like b...,hiphop,0
1,head roll tell feel bring dirty kill stop high...,hiphop,0
2,catch hard know walk black bitch brain right b...,hiphop,0
3,drink green best cause want pull dead fuck ban...,hiphop,0
4,little game nigga know white start turn step s...,hiphop,0


In [80]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [81]:
# split train and test data
syn_train, syn_test = train_test_split(syn, test_size=0.2, random_state=42)

In [83]:
syn_train.head()

Unnamed: 0,Text,genre,lable
11756,believe come lose paper stay feel soon look bl...,reggae,5
3710,girl come know share near live lady come alive...,jazz,1
3571,work want yeah smile mind life rest want maybe...,jazz,1
6926,say sing ready say swing play think dear littl...,blues,3
6224,song road time song listen play people play ra...,blues,3


In [84]:
data_words_syn = list(sent_to_words(syn_train['Text']))
print(data_words_syn[:1])

[['believe', 'come', 'lose', 'paper', 'stay', 'feel', 'soon', 'look', 'black', 'change', 'black', 'turn', 'like', 'go', 'look', 'look', 'gonna', 'come', 'yeah', 'truth', 'go', 'know', 'wanna', 'right', 'shut', 'place', 'head', 'life', 'follow', 'believe', 'days', 'away', 'believe', 'people', 'turn', 'life', 'like', 'work', 'give', 'party', 'coast', 'believe', 'come', 'kill', 'black', 'turn', 'go', 'go', 'comin', 'hear', 'give', 'like', 'know', 'tell', 'stay', 'life', 'soon', 'world', 'believe', 'round', 'hard', 'like', 'away', 'time', 'turn', 'round', 'right']]


In [85]:
#create dictionary
id2word = corpora.Dictionary(data_words_syn)
# Create Corpus
texts = data_words_syn
# Term Document Frequency
corpus_syn = [id2word.doc2bow(text) for text in texts]
# View
print(corpus_syn[:1])

[[(0, 2), (1, 5), (2, 3), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 4), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 3), (19, 4), (20, 3), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 2), (28, 1), (29, 2), (30, 2), (31, 1), (32, 1), (33, 1), (34, 4), (35, 1), (36, 1), (37, 1), (38, 1)]]


In [86]:
p = corpus_syn[:1][0]
print(lda_model.get_document_topics(p))

[(0, 0.12343105), (2, 0.42276666), (3, 0.18913013), (6, 0.25728333)]


In [87]:
dominate1 = format_topics_sentences(ldamodel=lda_model, corpus=corpus_syn, texts=data_words_syn)
dominate1.head(10)

  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
0,2,0.4228,"time, heart, know, go, feel, away, long, break...","[believe, come, lose, paper, stay, feel, soon,..."
1,4,0.6005,"yeah, good, right, like, know, better, money, ...","[girl, come, know, share, near, live, lady, co..."
2,4,0.4924,"yeah, good, right, like, know, better, money, ...","[work, want, yeah, smile, mind, life, rest, wa..."
3,6,0.5705,"come, fall, hold, night, know, home, hand, ton...","[say, sing, ready, say, swing, play, think, de..."
4,2,0.3442,"time, heart, know, go, feel, away, long, break...","[song, road, time, song, listen, play, people,..."
5,2,0.4834,"time, heart, know, go, feel, away, long, break...","[give, give, give, world, believe, like, like,..."
6,2,0.3663,"time, heart, know, go, feel, away, long, break...","[away, turn, yeah, know, world, coast, feel, h..."
7,4,0.5106,"yeah, good, right, like, know, better, money, ...","[come, know, go, girl, time, yeah, gonna, love..."
8,5,0.4213,"like, world, away, believe, know, head, come, ...","[fall, leave, world, today, rain, inside, head..."
9,2,0.2986,"time, heart, know, go, feel, away, long, break...","[give, change, stick, want, high, stay, run, k..."


In [88]:
syn_train = syn_train.reset_index(drop=True)
#rename the column Text to Text_new
syn_train = syn_train.rename(columns={'Text':'Text_new'})
syn_train.head()

Unnamed: 0,Text_new,genre,lable
0,believe come lose paper stay feel soon look bl...,reggae,5
1,girl come know share near live lady come alive...,jazz,1
2,work want yeah smile mind life rest want maybe...,jazz,1
3,say sing ready say swing play think dear littl...,blues,3
4,song road time song listen play people play ra...,blues,3


In [89]:
syn_join = syn_train.join(dominate1)
syn_join.head(10)

Unnamed: 0,Text_new,genre,lable,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
0,believe come lose paper stay feel soon look bl...,reggae,5,2,0.4228,"time, heart, know, go, feel, away, long, break...","[believe, come, lose, paper, stay, feel, soon,..."
1,girl come know share near live lady come alive...,jazz,1,4,0.6005,"yeah, good, right, like, know, better, money, ...","[girl, come, know, share, near, live, lady, co..."
2,work want yeah smile mind life rest want maybe...,jazz,1,4,0.4924,"yeah, good, right, like, know, better, money, ...","[work, want, yeah, smile, mind, life, rest, wa..."
3,say sing ready say swing play think dear littl...,blues,3,6,0.5705,"come, fall, hold, night, know, home, hand, ton...","[say, sing, ready, say, swing, play, think, de..."
4,song road time song listen play people play ra...,blues,3,2,0.3442,"time, heart, know, go, feel, away, long, break...","[song, road, time, song, listen, play, people,..."
5,give give give world believe like like need tr...,reggae,5,2,0.4834,"time, heart, know, go, feel, away, long, break...","[give, give, give, world, believe, like, like,..."
6,away turn yeah know world coast feel head want...,reggae,5,2,0.3663,"time, heart, know, go, feel, away, long, break...","[away, turn, yeah, know, world, coast, feel, h..."
7,come know go girl time yeah gonna love look ha...,jazz,1,4,0.5106,"yeah, good, right, like, know, better, money, ...","[come, know, go, girl, time, yeah, gonna, love..."
8,fall leave world today rain inside head sleep ...,country,6,5,0.4213,"like, world, away, believe, know, head, come, ...","[fall, leave, world, today, rain, inside, head..."
9,give change stick want high stay run kill touc...,reggae,5,2,0.2986,"time, heart, know, go, feel, away, long, break...","[give, change, stick, want, high, stay, run, k..."


In [90]:
syn_join_pop = syn_join[syn_join['genre'] == 'pop'].loc[:,['genre','lable', 'Dominant_Topic']]
syn_join_country = syn_join[syn_join['genre'] == 'country'].loc[:,['genre','lable', 'Dominant_Topic']]
syn_join_blues = syn_join[syn_join['genre'] == 'blues'].loc[:,['genre','lable', 'Dominant_Topic']]
syn_join_jazz = syn_join[syn_join['genre'] == 'jazz'].loc[:,['genre','lable', 'Dominant_Topic']]
syn_join_reggae = syn_join[syn_join['genre'] == 'reggae'].loc[:,['genre','lable', 'Dominant_Topic']]
syn_join_rock = syn_join[syn_join['genre'] == 'rock'].loc[:,['genre','lable', 'Dominant_Topic']]
syn_join_hiphop = syn_join[syn_join['genre'] == 'hiphop'].loc[:,['genre','lable', 'Dominant_Topic']]

In [91]:
print("pop correct: " , percentage(syn_join_pop))
print("country correct: " , percentage(syn_join_country))
print("blues correct: " , percentage(syn_join_blues))
print("jazz correct: " , percentage(syn_join_jazz))
print("reggae correct: " , percentage(syn_join_reggae))
print("rock correct: " , percentage(syn_join_rock))
print("hiphop correct: " , percentage(syn_join_hiphop))


pop correct:  0.0018963337547408343
country correct:  0.2526446795270691
blues correct:  0.01371571072319202
jazz correct:  0.0749063670411985
reggae correct:  0.02540272614622057
rock correct:  0.10387984981226533
hiphop correct:  0.896421845574388
