# GenreMatch
Data cleaning and model generation  
*Jeremy Freedman, Reza Madhavan, Kunal Sheth*

In [None]:
!unzip Archive.zip

Archive:  Archive.zip
   creating: utils/
  inflating: utils/genius.py         
   creating: utils/__pycache__/
  inflating: utils/grab_lyrics.py    
  inflating: utils/songlyrics.py     
  inflating: utils/grab_genius.ipynb  
  inflating: utils/azlyrics.py       
  inflating: utils/__pycache__/songlyrics.cpython-37.pyc  
   creating: data/
  inflating: data/song_lyrics.csv    
  inflating: data/rock_sl.csv        
  inflating: data/lyrics.txt         
  inflating: data/all_sl.csv         
  inflating: data/country_sl.csv     
  inflating: data/pop_sl.csv         
  inflating: data/rap_sl.csv         
  inflating: data/soul_sl.csv        


In [None]:
!ls utils

azlyrics.py  grab_genius.ipynb	__pycache__
genius.py    grab_lyrics.py	songlyrics.py


In [162]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import songlyrics as sl
from collections import defaultdict
import math
from nltk.corpus import stopwords as sw
from nltk import download as nltk_download
from nltk.sentiment import SentimentIntensityAnalyzer
import sklearn as sk
from sklearn import tree
import collections
import gc

In [None]:
a=pd.read_csv('data/song_lyrics.csv').dropna()
new_df = a.copy()
head = new_df.head(5)

In [None]:
# download the NLTK stopwords list, if necessary
nltk_download(['stopwords','vader_lexicon'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# we need to clean and update the NLTK stopwords for our data
# we're stripping out punctuation entirely, which the stopwords are not equipped to handle
# could optionally add music-centric stopwords ('oh', 'yeah', 'like', etc)

temp_words = sw.words('english')
stopwords = []
additions = ['im', 'ill', 'id', 'oh', 'cant', 'ive']
for w in temp_words:
    stopwords.append(sl._clean(w))
stopwords += additions
print(stopwords)


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', '

In [None]:


for index,row in new_df.iterrows():
  if index % 100 == 0:
    print(index)

  song = row['Lyrics']
  x=song.find('\n')
  song2 = sl._clean(song[x+1:])
  new_song = ''
  for i in range(len(song2)):
    char = song2[i]

    if char == '\n':
      new_song += ' '
    elif char == '(' or char == ')':
      new_song += ''
    else:
      new_song += char


  new_song = new_song.encode('ascii','ignore').decode()
  

  song2 = new_song.split(' ')
  song2 = list(filter(lambda x : x!='' and x not in stopwords, song2))
  song2 = song2[:len(song2)-1]

  new_df.at[index, 'Lyrics'] = song2


In [None]:
wordfreq = []
word_appearance_cnt = defaultdict(int) # how many songs contain word W? used for IDF
for index,row in new_df.iterrows():
  lyrics = row['Lyrics']
  x = dict(collections.Counter(lyrics))
  for w in x:
    word_appearance_cnt[w] += 1
  # mapped = []
  # for k in x.keys():
  #   mapped.append((k,x[k]))
  wordfreq.append(x)
# specify how many words (most popular first) to turn into features:
top_words = {k : v for k, v in sorted(word_appearance_cnt.items(), key=lambda x: x[1], reverse=True)[:1200]}
new_df['WordFreq'] = wordfreq

In [None]:
new_df['UniqueWords']= new_df.apply(lambda row : len(row['WordFreq']), axis = 1)
new_df

In [None]:
# uniq_words = set()
# for x in new_df['WordFreq']:
#   uniq_words = uniq_words.union(x.keys())
uniq_wordict = {k : 0 for k in top_words.keys()}

In [None]:
word_df = new_df[['Genre', 'WordFreq', 'Artist', 'UniqueWords']]

for index,row in word_df.iterrows():
  if index % 100 == 0:
    print(f'\r{100 * index // len(new_df)}%', end='')
  for k in row['WordFreq']:
    if k in top_words.keys():
      # uniq_wordict[k] = row['WordFreq'][k] # original (raw count)
      uniq_wordict[k] = row['WordFreq'][k] * math.log(len(new_df) / word_appearance_cnt[k]) # tf-idf

  word_df.at[index, 'WordFreq'] = uniq_wordict
  word_df.at[index, 'Artist'] = list(uniq_wordict.values()) + [row['UniqueWords']]
  uniq_wordict = {k : 0 for k in top_words.keys()}
print(f'\rDone', end='')
word_df = word_df.replace({'Artist':'WordFreqList'})


In [133]:
d = {}
for index,genre in enumerate(list(set(new_df['Genre']))):
  d[genre] = index

new_df['GenreInt'] = new_df.apply(lambda x : d[x['Genre']], axis = 1)

d = {}
for index,genre in enumerate(top1000keys):
  d[genre] = index + 1

new_df['LyricsInt'] = new_df.apply(lambda x : [d[word] for word in x['Lyrics']], axis = 1)


In [151]:
# make a 80/20 train/test split of lyric words tagged by origin genre
X_train, X_test, Y_train, Y_test = sk.model_selection.train_test_split(list(new_df['FeatureVector']), list(new_df['GenreInt']), test_size=0.25)

print(f'Split size: {len(X_test) / (len(X_test) + len(X_train))}\nTest set: {len(X_test)}\nTrain set: {len(X_train)}')

Split size: 0.2501226091221187
Test set: 1020
Train set: 3058


In [152]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train,Y_train)

DecisionTreeClassifier()

In [153]:
preds = clf.predict(X_test)
np.sum(preds == Y_test)/len(preds)

0.33725490196078434

In [154]:
knn = sk.neighbors.KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train, Y_train)

preds = knn.predict(X_test)

print(x,np.sum(preds == Y_test)/len(preds))



14 0.30196078431372547


In [155]:
from sklearn.naive_bayes import GaussianNB

In [156]:
nb = GaussianNB()
nb.fit(X_train, Y_train)

GaussianNB()

In [157]:
preds = nb.predict(X_test)
print(np.sum(preds == Y_test)/len(preds))

0.3872549019607843





---
# STOP

In [None]:
# I'm opening the csv of lyrics from songlyrics.com, which is structured differently than the genius one.
# might need to restructure this code (or the genius csv) if we want to use that too
# the only difference is all the lyrics by each artist are combined into a single cell (as opposed to split by song)
df = pd.read_csv('data/all_sl.csv')
print(f'Imported {len(df)} lines')
genres = set(df['Genre'])
print(f'Identified {len(genres)} genres: {genres}')
lyrics = defaultdict(str)
for genre in genres:
    for artist in list(df[df['Genre'] == genre]['Lyrics']): # grab each row (artist) and select the lyrics cell
        lyrics[genre] += artist # combine each batch of lyrics into the respective genre in the dictionary
print(f'Extracted {len(lyrics)} genres')

Imported 166 lines
Identified 7 genres: {'soul_rb', 'alternative', 'rock', 'metal', 'rap_hiphop', 'pop', 'country'}
Extracted 7 genres


In [None]:
frequency_tables = {}
words_uniq = {}
lines_uniq = {}
for (genre,lyric) in lyrics.items():
    frequency_tables[genre] = sl.words_freq(lyric, stopwords)
    words_uniq[genre] = sl.words(lyric, stopwords)
    print(f'[{genre}] Identified {len(words_uniq[genre])} unique words')

[pop] Identified 7269 unique words
[rap_hiphop] Identified 15104 unique words
[rock] Identified 6416 unique words
[metal] Identified 8963 unique words
[alternative] Identified 5750 unique words
[country] Identified 6533 unique words
[soul_rb] Identified 4626 unique words


In [None]:
len(lyrics['alternative'])

703734

In [None]:
# as an example, print the top 25 most common terms from each genre!
for genre in frequency_tables:
    print(f'### {genre} ###\n{sorted(frequency_tables[genre].items(), key=lambda x: x[1], reverse=True)[:25]}')

### pop ###
[('love', 2008), ('know', 1645), ('like', 1463), ('baby', 1121), ('got', 1087), ('yeah', 1053), ('go', 1007), ('get', 967), ('one', 868), ('cause', 867), ('let', 806), ('girl', 796), ('say', 727), ('make', 723), ('la', 709), ('want', 688), ('wanna', 667), ('time', 656), ('never', 643), ('need', 611), ('take', 606), ('right', 605), ('way', 580), ('heart', 573), ('see', 566)]
### rap_hiphop ###
[('like', 2874), ('got', 2079), ('get', 1991), ('nigga', 1790), ('know', 1750), ('aint', 1316), ('yeah', 1286), ('niggas', 1236), ('bitch', 1206), ('shit', 1184), ('fuck', 1143), ('love', 943), ('back', 891), ('see', 860), ('money', 799), ('go', 784), ('cause', 770), ('baby', 708), ('thats', 705), ('want', 702), ('man', 674), ('make', 669), ('em', 668), ('say', 660), ('right', 591)]
### rock ###
[('love', 1159), ('yeah', 869), ('know', 793), ('like', 679), ('got', 659), ('one', 628), ('away', 580), ('go', 551), ('get', 525), ('never', 525), ('time', 524), ('want', 520), ('come', 440), 

In [None]:
# make a 80/20 train/test split of lyric words tagged by origin genre
X = []
Y = []
for (genre,words) in words_uniq.items():
    X += words
    Y += [genre] * len(words)
X_train, X_test, Y_train, Y_test = sk.model_selection.train_test_split(X, Y, test_size=0.2)
print(f'Split size: {len(X_test) / (len(X_test) + len(X_train))}\nTest set: {len(X_test)}\nTrain set: {len(X_train)}')


NameError: ignored