# GenreMatch
Data cleaning and model generation  
*Jeremy Freedman, Reza Madhavan, Kunal Sheth*

In [41]:
import pandas as pd
import numpy as np
import utils.songlyrics as sl
from collections import defaultdict
from nltk.corpus import stopwords as sw
from nltk import download as nltk_download

In [None]:
# download the NLTK stopwords list, if necessary
nltk_download('stopwords')

In [46]:
# we need to clean and update the NLTK stopwords for our data
# we're stripping out punctuation entirely, which the stopwords are not equipped to handle
# could optionally add music-centric stopwords ('oh', 'yeah', 'like', etc)

temp_words = sw.words('english')
stopwords = []
additions = ['im', 'ill', 'id', 'oh']
for w in temp_words:
    stopwords.append(sl._clean(w))
stopwords += additions


In [47]:
# I'm opening the csv of lyrics from songlyrics.com, which is structured differently than the genius one.
# might need to restructure this code (or the genius csv) if we want to use that too
# the only difference is all the lyrics by each artist are combined into a single cell (as opposed to split by song)
df = pd.read_csv('data/all_sl.csv')
print(f'Imported {len(df)} lines')
genres = set(df['Genre'])
print(f'Identified {len(genres)} genres: {genres}')
lyrics = defaultdict(str)
for genre in genres:
    for artist in list(df[df['Genre'] == genre]['Lyrics']): # grab each row (artist) and select the lyrics cell
        lyrics[genre] += artist # combine each batch of lyrics into the respective genre in the dictionary
print(f'Extracted {len(lyrics)} genres')

Imported 166 lines
Identified 7 genres: {'alternative', 'rock', 'soul_rb', 'rap_hiphop', 'country', 'metal', 'pop'}
Extracted 7 genres


In [48]:
frequency_tables = {}
words_uniq = {}
for (genre,lyric) in lyrics.items():
    frequency_tables[genre] = sl.words_freq(lyric, stopwords)
    words_uniq[genre] = sl.words(lyric, stopwords)
    print(f'[{genre}] Identified {len(words_uniq[genre])} unique words')

[alternative] Identified 5752 unique words
[rock] Identified 6418 unique words
[soul_rb] Identified 4628 unique words
[rap_hiphop] Identified 15106 unique words
[country] Identified 6535 unique words
[metal] Identified 8965 unique words
[pop] Identified 7271 unique words


In [49]:
# as an example, print the top 25 most common terms from each genre!
for genre in frequency_tables:
    print(f'### {genre} ###\n{sorted(frequency_tables[genre].items(), key=lambda x: x[1], reverse=True)[:25]}')

### alternative ###
[('know', 838), ('love', 678), ('time', 652), ('like', 599), ('go', 596), ('got', 543), ('one', 525), ('cant', 522), ('never', 506), ('take', 457), ('back', 447), ('away', 445), ('say', 440), ('come', 432), ('get', 418), ('want', 417), ('right', 405), ('see', 401), ('let', 376), ('yeah', 369), ('way', 369), ('cause', 368), ('na', 351), ('feel', 342), ('ive', 331)]
### rock ###
[('love', 1159), ('yeah', 869), ('know', 793), ('like', 679), ('got', 659), ('one', 628), ('away', 580), ('go', 551), ('get', 525), ('never', 525), ('time', 524), ('want', 520), ('cant', 460), ('come', 440), ('way', 432), ('well', 430), ('say', 423), ('baby', 422), ('ive', 407), ('take', 394), ('gonna', 347), ('see', 346), ('let', 333), ('give', 323), ('back', 309)]
### soul_rb ###
[('love', 1492), ('baby', 1220), ('know', 971), ('yeah', 762), ('like', 627), ('got', 547), ('want', 477), ('come', 457), ('time', 439), ('one', 435), ('wanna', 435), ('go', 424), ('let', 395), ('cant', 382), ('make