# GenreMatch
Data cleaning and model generation  
*Jeremy Freedman, Reza Madhavan, Kunal Sheth*

In [None]:
# Shell step: extract Archive.zip (expected next to this notebook) into the working directory.
!unzip Archive.zip

In [None]:
# Shell step: list the utils directory (the imports below load utils.songlyrics from it).
!ls utils

In [None]:
import collections
import gc
import json
import math
import pickle
from collections import defaultdict

import matplotlib.pyplot as pyplot
import numpy as np
import pandas as pd
import sklearn as sk
from nltk import download as nltk_download
from nltk.corpus import stopwords as sw
from sklearn import tree
from sklearn.model_selection import train_test_split

import utils.songlyrics as sl

In [None]:
# Load the raw lyrics table and drop every row that has a missing value.
# dropna() leaves gaps in the index labels, so the index is renumbered 0..n-1:
# later cells use the labels for progress output and look up label 0 directly.
a = pd.read_csv('data/song_lyrics.csv').dropna().reset_index(drop=True)
# Work on a copy so the loaded frame `a` is not modified by the cleaning cells.
new_df = a.copy()
head = new_df.head(5)

In [None]:
# Download the NLTK 'stopwords' corpus, if necessary; it is read in the next
# cell through sw.words('english').
nltk_download('stopwords')

In [None]:
# The NLTK stopwords have to be cleaned the same way as our data: punctuation
# is stripped out of the lyrics entirely, which the raw stopwords are not
# equipped to handle, so each one is passed through sl._clean first.
# Music-centric stopwords ('oh', 'yeah', 'like', etc) could optionally be added.

additions = ['im', 'ill', 'id', 'oh', 'cant', 'ive']
temp_words = sw.words('english')
stopwords = [sl._clean(word) for word in temp_words]
stopwords.extend(additions)
print(stopwords)


In [None]:
# Turn each song's raw lyrics string into a list of cleaned tokens and store it
# back in the 'Lyrics' column.
#
# Per song:
#   1. drop everything up to and including the first newline (the first line),
#   2. run the remainder through sl._clean,
#   3. turn newlines into spaces, remove parentheses and non-ASCII characters,
#   4. split on single spaces, discarding empty tokens and stopwords,
#   5. drop the final token.
#
# The ASCII filtering is done once per song rather than once per character
# (re-encoding the whole accumulated string for every character is quadratic
# in the song length), and stopwords are looked up in a set instead of being
# scanned as a list for every token. The resulting tokens are unchanged.

stopword_set = set(stopwords)
# '\n' -> ' ', '(' and ')' -> removed
char_table = str.maketrans({'\n': ' ', '(': None, ')': None})

for index,row in new_df.iterrows():
  if index % 100 == 0:
    print(index)

  song = row['Lyrics']
  if not isinstance(song, str):
    # already tokenised by an earlier run of this cell; leave it as it is
    continue

  # find() returns -1 when there is no newline, so the slice keeps the whole string
  first_newline = song.find('\n')
  text = sl._clean(song[first_newline + 1:]).translate(char_table)
  text = text.encode('ascii', 'ignore').decode()

  tokens = [tok for tok in text.split(' ') if tok and tok not in stopword_set]
  # NOTE(review): the last token is always discarded, exactly as before --
  # presumably a trailing artefact of the data source; confirm.
  new_df.at[index, 'Lyrics'] = tokens[:-1]


In [None]:
# Per-song word counts plus, for every word, the number of songs it occurs in.
wordfreq = []
word_appearance_cnt = defaultdict(int) # how many songs contain word W? used for IDF
for lyrics in new_df['Lyrics']:
  counts = dict(collections.Counter(lyrics))
  # each distinct word of this song contributes exactly one "appearance"
  for word in counts:
    word_appearance_cnt[word] += 1
  wordfreq.append(counts)

# specify how many words (most popular first) to turn into features:
ranked_words = sorted(word_appearance_cnt.items(), key=lambda item: item[1], reverse=True)
top_words = dict(ranked_words[:1200])
new_df['WordFreq'] = wordfreq

In [None]:
# Distinct words per song = number of keys in that song's word-count dict.
new_df['UniqueWords'] = new_df['WordFreq'].map(len)
new_df

In [None]:
# All-zero template with one entry per selected top word, in top_words order.
uniq_wordict = dict.fromkeys(top_words, 0)

In [None]:
# Build the model input table: for every song, a tf-idf value for each of the
# selected top words, followed by the song's unique-word count.
#
# .copy() makes word_df an independent frame. Writing with .at into a plain
# column selection of new_df triggers pandas' SettingWithCopyWarning and is not
# guaranteed to behave consistently.
word_df = new_df[['Genre', 'WordFreq', 'Artist', 'UniqueWords']].copy()

song_cnt = len(new_df)  # total number of songs, the numerator of the IDF term

for position, (index, row) in enumerate(word_df.iterrows()):
  if position % 100 == 0:
    # row position (not the index label) keeps the percentage within 0-100
    print(f'\r{100 * position // song_cnt}%', end='')

  # every song starts from an all-zero vector in top_words order
  uniq_wordict = {k : 0 for k in top_words.keys()}
  for k, count in row['WordFreq'].items():
    if k in top_words:
      # tf-idf: count in this song * log(total songs / songs containing the word)
      # (use `count` alone for the raw-count variant)
      uniq_wordict[k] = count * math.log(song_cnt / word_appearance_cnt[k])

  word_df.at[index, 'WordFreq'] = uniq_wordict
  # The 'Artist' column is reused to hold the feature list; a later cell
  # renames it to 'WordFreqList'.
  word_df.at[index, 'Artist'] = list(uniq_wordict.values()) + [row['UniqueWords']]

# leave the template zeroed, as it was after the loop before
uniq_wordict = {k : 0 for k in top_words.keys()}
print(f'\rDone', end='')
# The former `word_df.replace({'Artist':'WordFreqList'})` has been removed:
# DataFrame.replace substitutes cell *values*, it does not rename a column.


In [None]:
# free up memory: explicitly run Python's garbage collector before the
# modelling cells
gc.collect()

In [None]:
# The feature vectors were stored in the 'Artist' column; give it its real name.
word_df = word_df.rename({'Artist': 'WordFreqList'}, axis='columns')
print(word_df)

In [None]:
# write wordlist to disk for frontend use
# The file holds the str() of a dict: every top word mapped to the number of
# songs containing it, plus a final "SONG_CNT" entry with the total song count.
print(f'total of {len(top_words)} uniq words')

# Add SONG_CNT to a copy: top_words is the feature vocabulary, and inserting the
# extra key into it would turn "SONG_CNT" into a feature whenever the earlier
# cells are run again.
words_payload = dict(top_words)
words_payload["SONG_CNT"] = len(new_df)
with open('data/words.txt', 'w') as f:
    f.write(str(words_payload))


In [None]:
# Sanity check: length of the first song's feature vector.
# .iloc[0] selects by position; `[0]` on a Series is a label lookup and raises
# KeyError when no row carries the label 0 (e.g. when it was removed by dropna()).
print(len(word_df['WordFreqList'].iloc[0]))

In [None]:
# trying various splits
# Fit an unconstrained decision tree at test fractions 10%, 20%, ..., 90% and
# record the test-set accuracy of each.
SEED = 42  # fixed seed: a fresh Restart & Run All reproduces the same numbers

# the feature/label lists do not change between iterations, so build them once
features = list(word_df['WordFreqList'])
labels = list(word_df['Genre'])

x_split_plt = []
y_split_plt = []
for i in range(10, 91, 10):
    # i / 100 instead of 0.01 * i: the product is not always exact
    # (0.01 * 70 == 0.7000000000000001), which can push the rounded-up
    # test-set size one sample too high.
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, test_size=i / 100, random_state=SEED)
    print(f'Split ratio: {len(X_test) / (len(X_test) + len(X_train))}')
    x_split_plt.append(i)
    clf = tree.DecisionTreeClassifier(random_state=SEED)
    clf.fit(X_train,Y_train)
    preds = clf.predict(X_test)
    # accuracy = share of test songs whose predicted genre matches the label
    y_split_plt.append(np.sum(preds == Y_test)/len(preds))


In [None]:
# plotting split
pyplot.plot(x_split_plt, y_split_plt)
pyplot.xlabel('% sample used for testing')
pyplot.ylabel('prediction accuracy')
pyplot.title('split ratio vs prediction accuracy')
pyplot.show()

# take the test percentage with the highest accuracy (the first one on ties)
# and turn it into a fraction
top_split_accuracy = max(y_split_plt)
top_split_position = y_split_plt.index(top_split_accuracy)
best_split = x_split_plt[top_split_position] * 0.01
print(f'Best split: {best_split}')

In [None]:
# trying various max depths
# One train/test split at the chosen ratio, then a tree per max_depth in
# 1, 6, 11, ..., 101 with its test-set accuracy recorded.
SEED = 42  # fixed seed: the split and the trees are reproducible across runs

x_depth_plt = []
y_depth_plt = []
X_train, X_test, Y_train, Y_test = train_test_split(
    list(word_df['WordFreqList']), list(word_df['Genre']),
    test_size=best_split, random_state=SEED)
for i in range(1,102, 5):
    print(f'max depth {i}')
    x_depth_plt.append(i)
    clf = tree.DecisionTreeClassifier(max_depth=i, random_state=SEED)
    clf.fit(X_train,Y_train)
    preds = clf.predict(X_test)
    y_depth_plt.append(np.sum(preds == Y_test)/len(preds))
    # the fitted tree can be shallower than the allowed maximum
    print(f'actual depth: {clf.get_depth()}')


In [None]:
# plotting depth
pyplot.plot(x_depth_plt, y_depth_plt)
pyplot.xlabel('max tree depth')
pyplot.ylabel('prediction accuracy')
pyplot.title('max depth vs prediction accuracy')
pyplot.show()

# max depth with the highest accuracy (the first one on ties)
top_depth_accuracy = max(y_depth_plt)
top_depth_position = y_depth_plt.index(top_depth_accuracy)
best_depth = x_depth_plt[top_depth_position]
print(f'Best max depth: {best_depth}')

In [None]:
# trying various min leaf sample sizes
# One train/test split at the chosen ratio, then a tree per min_samples_leaf in
# 1, 11, 21, ..., 91 (max depth fixed at best_depth) with its test-set accuracy
# recorded. X_train / X_test / Y_train / Y_test from this cell are the ones the
# final model cells use.
SEED = 42  # fixed seed: the split and the trees are reproducible across runs

x_sample_plt = []
y_sample_plt = []
X_train, X_test, Y_train, Y_test = train_test_split(
    list(word_df['WordFreqList']), list(word_df['Genre']),
    test_size=best_split, random_state=SEED)
for i in range(1, 101, 10):
    print(f'min leaf sample {i}')
    x_sample_plt.append(i)
    clf = tree.DecisionTreeClassifier(min_samples_leaf=i, max_depth=best_depth, random_state=SEED)
    clf.fit(X_train,Y_train)
    preds = clf.predict(X_test)
    y_sample_plt.append(np.sum(preds == Y_test)/len(preds))

In [None]:
# plotting min leaf sample
pyplot.plot(x_sample_plt, y_sample_plt)
pyplot.xlabel('minimum leaf sample')
pyplot.ylabel('prediction accuracy')
pyplot.title('minimum leaf sample vs prediction accuracy')
pyplot.show()

# minimum leaf size with the highest accuracy (the first one on ties)
top_leaf_accuracy = max(y_sample_plt)
top_leaf_position = y_sample_plt.index(top_leaf_accuracy)
best_leaf_sample = x_sample_plt[top_leaf_position]
print(f'Best min leaf sample: {best_leaf_sample}')

In [None]:
# commit model to disk
# Fit the final tree with the selected hyper-parameters on the current training
# split and pickle it to model_dtc.pkl.
# random_state is fixed so the saved model is the same on every run; an
# unseeded tree may break ties between equally good splits differently each time.
clf_final = tree.DecisionTreeClassifier(
    min_samples_leaf=best_leaf_sample, max_depth=best_depth, random_state=42)
clf_final.fit(X_train,Y_train)
with open('model_dtc.pkl', 'wb') as f:
    pickle.dump(clf_final, f)
# tree.plot_tree(clf_final)

In [None]:
# assess final accuracy
# share of the held-out songs whose predicted genre equals the true genre
preds = clf_final.predict(X_test)
n_correct = np.sum(preds == Y_test)
print(n_correct/len(preds))




---
# STOP