# GenreMatch
Data cleaning and model generation  
*Jeremy Freedman, Reza Madhavan, Kunal Sheth*

In [2]:
!pip3 install utils

Collecting utils
  Downloading utils-1.0.1-py2.py3-none-any.whl (21 kB)
Installing collected packages: utils
Successfully installed utils-1.0.1


In [4]:
!unzip Archive.zip

Archive:  Archive.zip
   creating: utils/
  inflating: utils/genius.py         
   creating: utils/__pycache__/
  inflating: utils/grab_lyrics.py    
  inflating: utils/songlyrics.py     
  inflating: utils/grab_genius.ipynb  
  inflating: utils/azlyrics.py       
  inflating: utils/__pycache__/songlyrics.cpython-37.pyc  
   creating: data/
  inflating: data/song_lyrics.csv    
  inflating: data/rock_sl.csv        
  inflating: data/lyrics.txt         
  inflating: data/all_sl.csv         
  inflating: data/country_sl.csv     
  inflating: data/pop_sl.csv         
  inflating: data/rap_sl.csv         
  inflating: data/soul_sl.csv        


In [12]:
!ls utils

azlyrics.py  genius.py	grab_genius.ipynb  grab_lyrics.py  __pycache__


In [179]:
import pandas as pd
import numpy as np
import songlyrics as sl
from collections import defaultdict
from nltk.corpus import stopwords as sw
from nltk import download as nltk_download
import sklearn as sk
import collections

In [180]:
a=pd.read_csv('data/song_lyrics.csv').dropna()
new_df = a.copy()
head = new_df.head(5)

In [181]:
# download the NLTK stopwords list, if necessary
nltk_download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [182]:
# we need to clean and update the NLTK stopwords for our data
# we're stripping out punctuation entirely, which the stopwords are not equipped to handle
# could optionally add music-centric stopwords ('oh', 'yeah', 'like', etc)

temp_words = sw.words('english')
stopwords = []
additions = ['im', 'ill', 'id', 'oh', 'cant', 'ive']
for w in temp_words:
    stopwords.append(sl._clean(w))
stopwords += additions
print(stopwords)


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', '

In [None]:
for index,row in new_df.iterrows():
  if index % 100 == 0:
    print(index)

  song = row['Lyrics']
  x=song.find('\n')
  song2 = sl._clean(song[x+1:])
  new_song = ''
  for i in range(len(song2)):
    char = song2[i]

    if char == '\n':
      new_song += ' '
    elif char == '(' or char == ')':
      new_song += ''
    else:
      new_song += char


    new_song = new_song.encode('ascii','ignore').decode()

  song2 = new_song.split(' ')
  song2 = list(filter(lambda x : x!='' and x not in stopwords, song2))
  song2 = song2[:len(song2)-1]

  new_df.at[index, 'Lyrics'] = song2


In [184]:
wordfreq = []
for index,row in new_df.iterrows():
  lyrics = row['Lyrics']

  x = dict(collections.Counter(lyrics))
  # mapped = []
  # for k in x.keys():
  #   mapped.append((k,x[k]))
  
  wordfreq.append(x)

new_df['WordFreq'] = wordfreq

In [189]:
new_df['UniqueWords']= new_df.apply(lambda row : len(row['WordFreq']), axis = 1)
new_df.groupby(by = ['Genre']).mean()

Unnamed: 0_level_0,Unnamed: 0,UniqueWords
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
alternative,2588.0,61.306087
country,1538.0,70.5392
metal,3766.0,73.4928
pop,300.0,79.154742
rap_hiphop,913.0,182.3648
rock,3164.369151,65.663778
soul_rb,2075.5,63.842222


In [16]:
# I'm opening the csv of lyrics from songlyrics.com, which is structured differently than the genius one.
# might need to restructure this code (or the genius csv) if we want to use that too
# the only difference is all the lyrics by each artist are combined into a single cell (as opposed to split by song)
df = pd.read_csv('data/all_sl.csv')
print(f'Imported {len(df)} lines')
genres = set(df['Genre'])
print(f'Identified {len(genres)} genres: {genres}')
lyrics = defaultdict(str)
for genre in genres:
    for artist in list(df[df['Genre'] == genre]['Lyrics']): # grab each row (artist) and select the lyrics cell
        lyrics[genre] += artist # combine each batch of lyrics into the respective genre in the dictionary
print(f'Extracted {len(lyrics)} genres')

Imported 166 lines
Identified 7 genres: {'pop', 'rap_hiphop', 'rock', 'metal', 'alternative', 'country', 'soul_rb'}
Extracted 7 genres


In [17]:
frequency_tables = {}
words_uniq = {}
lines_uniq = {}
for (genre,lyric) in lyrics.items():
    frequency_tables[genre] = sl.words_freq(lyric, stopwords)
    words_uniq[genre] = sl.words(lyric, stopwords)
    print(f'[{genre}] Identified {len(words_uniq[genre])} unique words')

[pop] Identified 7269 unique words
[rap_hiphop] Identified 15104 unique words
[rock] Identified 6416 unique words
[metal] Identified 8963 unique words
[alternative] Identified 5750 unique words
[country] Identified 6533 unique words
[soul_rb] Identified 4626 unique words


In [30]:
len(lyrics['alternative'])

703734

In [18]:
# as an example, print the top 25 most common terms from each genre!
for genre in frequency_tables:
    print(f'### {genre} ###\n{sorted(frequency_tables[genre].items(), key=lambda x: x[1], reverse=True)[:25]}')

### pop ###
[('love', 2008), ('know', 1645), ('like', 1463), ('baby', 1121), ('got', 1087), ('yeah', 1053), ('go', 1007), ('get', 967), ('one', 868), ('cause', 867), ('let', 806), ('girl', 796), ('say', 727), ('make', 723), ('la', 709), ('want', 688), ('wanna', 667), ('time', 656), ('never', 643), ('need', 611), ('take', 606), ('right', 605), ('way', 580), ('heart', 573), ('see', 566)]
### rap_hiphop ###
[('like', 2874), ('got', 2079), ('get', 1991), ('nigga', 1790), ('know', 1750), ('aint', 1316), ('yeah', 1286), ('niggas', 1236), ('bitch', 1206), ('shit', 1184), ('fuck', 1143), ('love', 943), ('back', 891), ('see', 860), ('money', 799), ('go', 784), ('cause', 770), ('baby', 708), ('thats', 705), ('want', 702), ('man', 674), ('make', 669), ('em', 668), ('say', 660), ('right', 591)]
### rock ###
[('love', 1159), ('yeah', 869), ('know', 793), ('like', 679), ('got', 659), ('one', 628), ('away', 580), ('go', 551), ('get', 525), ('never', 525), ('time', 524), ('want', 520), ('come', 440), 

In [19]:
# make a 80/20 train/test split of lyric words tagged by origin genre
X = []
Y = []
for (genre,words) in words_uniq.items():
    X += words
    Y += [genre] * len(words)
X_train, X_test, Y_train, Y_test = sk.model_selection.train_test_split(X, Y, test_size=0.2)
print(f'Split size: {len(X_test) / (len(X_test) + len(X_train))}\nTest set: {len(X_test)}\nTrain set: {len(X_train)}')


Split size: 0.20001463566345293
Test set: 10933
Train set: 43728


In [25]:
Y_test

['country',
 'alternative',
 'rap_hiphop',
 'rap_hiphop',
 'rock',
 'alternative',
 'metal',
 'rock',
 'metal',
 'metal',
 'rock',
 'rap_hiphop',
 'rap_hiphop',
 'metal',
 'soul_rb',
 'pop',
 'alternative',
 'rap_hiphop',
 'rap_hiphop',
 'rap_hiphop',
 'pop',
 'pop',
 'rap_hiphop',
 'rock',
 'rap_hiphop',
 'alternative',
 'alternative',
 'rap_hiphop',
 'pop',
 'pop',
 'pop',
 'metal',
 'pop',
 'rap_hiphop',
 'pop',
 'rap_hiphop',
 'rap_hiphop',
 'rap_hiphop',
 'rap_hiphop',
 'country',
 'country',
 'soul_rb',
 'alternative',
 'pop',
 'pop',
 'country',
 'rock',
 'soul_rb',
 'soul_rb',
 'rock',
 'pop',
 'rap_hiphop',
 'pop',
 'soul_rb',
 'pop',
 'rap_hiphop',
 'rap_hiphop',
 'rap_hiphop',
 'soul_rb',
 'rock',
 'pop',
 'country',
 'soul_rb',
 'soul_rb',
 'rock',
 'alternative',
 'rap_hiphop',
 'rap_hiphop',
 'alternative',
 'country',
 'rock',
 'metal',
 'country',
 'rap_hiphop',
 'alternative',
 'rap_hiphop',
 'rock',
 'rap_hiphop',
 'soul_rb',
 'pop',
 'country',
 'alternative',
 'soul