In [None]:
import pandas as pd
from time import time
from gensim.models.doc2vec import Doc2Vec,TaggedDocument
from scipy.spatial.distance import cosine 
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.manifold import TSNE
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import svm
import seaborn as sns

# Shared preprocessing objects created once for the whole notebook.
# `le` is used further down to encode the `language` column.
le=LabelEncoder()
# NOTE(review): `ss` is not referenced in the cells visible here — possibly unused.
ss=StandardScaler()
from gensim import matutils
from collections import Counter
import numpy as np
from ast import literal_eval
from sklearn.manifold import TSNE
# t-SNE instance configured with the cosine metric.
# NOTE(review): `tsne` is not referenced in the cells visible here — possibly unused.
tsne=TSNE(metric='cosine')

import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Initialise plotly's offline mode for use inside the notebook.
init_notebook_mode(connected=True)

import matplotlib.pyplot as plt
# Show matplotlib figures inline in the notebook output.
%matplotlib inline


# Combining Both Models

Since the word2vec and feature models represent different aspects of the data, we hope to get a better prediction by combining the two. This new model will include the features used previously, as well as the similarity score to each genre from the word2vec model.

In [None]:
# Load the saved Doc2Vec model and the song table.

model = Doc2Vec.load("../Data/doc2vec100.model")

# Only the columns used by the rest of the notebook are read from the CSV.
song_df = pd.read_csv(
    "../Data/songdata_v2.csv",
    sep=',',
    encoding='utf-8',
    usecols=['lyrics', 'genre', 'song_length', 'complexity', 'language'],
)
print(len(song_df))  # number of songs loaded
song_df.head()

## Prepare Features

Word2vec Genre Similarities

In [None]:
# Build one reference vector per genre from the topic-word file, then add a
# "<Genre>_sim" column to song_df holding each song's cosine similarity to it.
# Each non-empty line of the file is read as "<genre>\t<words>".
with open("../Data/topics-anthology.txt", 'r') as f:
    # Skip blank lines (e.g. a trailing newline) so no empty genre row is created.
    topic_rows = [line.split('\t') for line in f.read().split('\n') if line.strip()]

topics = pd.DataFrame(topic_rows, columns=['Genre', 'words'])
topics['Genre'] = topics['Genre'].str.title()
topics = topics.set_index('Genre')

# infer_vector expects a list of tokens; handing it the raw string would make
# it treat every single character as a "word".
# NOTE(review): assumes the topic words are separated by whitespace and/or
# commas — confirm against the topics file.
topics['vector'] = [model.infer_vector(w.replace(',', ' ').split()) for w in topics.words]

# NOTE(review): assumes the model's document tags are the stringified row
# labels of song_df — confirm against how the model was trained.
for i in song_df.index:
    song_vec = model.docvecs[str(i)]
    for g, genre_vec in topics['vector'].items():
        # One column per genre. The original format string had no placeholder,
        # so every genre overwrote the same '_sim' column.
        # scipy's cosine() is a distance, hence 1 - distance = similarity.
        song_df.at[i, '{}_sim'.format(g)] = 1 - cosine(song_vec, genre_vec)

song_df.head()

Profanity

In [None]:
# Profanity rate: occurrences of listed words divided by the song's length.
with open('../Data/profanity.txt', 'r') as f:
    profanity = [x for x in f.read().split('\n') if x]

count_vectorizer = CountVectorizer(vocabulary=profanity)
count = count_vectorizer.fit_transform(song_df.lyrics.tolist())

# Sum the sparse count matrix row-wise instead of building a dense, labelled
# DataFrame: the column names were never used, get_feature_names() no longer
# exists in recent scikit-learn releases, and toarray() allocates a full
# n_songs x n_words matrix for nothing.
profanity_hits = np.asarray(count.sum(axis=1)).ravel()
song_df['profanity'] = pd.Series(profanity_hits, index=song_df.index).div(song_df.song_length)

Reference to first person

In [None]:
# First-person reference rate: occurrences of the words below per song length.
vocab=['i','me','my', 'myself']

# CountVectorizer's default token_pattern only keeps tokens of two or more
# characters, which silently drops "i" and leaves that vocabulary entry at
# zero for every song. Allow single-character tokens so it is counted.
count_vectorizer = CountVectorizer(vocabulary=vocab, token_pattern=r"(?u)\b\w+\b")
count = count_vectorizer.fit_transform(song_df.lyrics.tolist())

# Row-wise sum on the sparse matrix; avoids the dense copy and the
# get_feature_names() call that recent scikit-learn releases no longer provide.
first_person_hits = np.asarray(count.sum(axis=1)).ravel()
song_df['1st_ref'] = pd.Series(first_person_hits, index=song_df.index).div(song_df.song_length)

Reference to second person

In [None]:
# Second-person reference rate: occurrences of the words below per song length.
vocab=['you','your', 'yourself']

count_vectorizer = CountVectorizer(vocabulary=vocab)
count = count_vectorizer.fit_transform(song_df.lyrics.tolist())

# Row-wise sum on the sparse matrix; avoids the dense copy and the
# get_feature_names() call that recent scikit-learn releases no longer provide.
second_person_hits = np.asarray(count.sum(axis=1)).ravel()
song_df['2nd_ref'] = pd.Series(second_person_hits, index=song_df.index).div(song_df.song_length)

Reference to third person (male)

In [None]:
# Male third-person reference rate: occurrences of the words below per song length.
vocab=['he','him','his', 'man', 'boy']

count_vectorizer = CountVectorizer(vocabulary=vocab)
count = count_vectorizer.fit_transform(song_df.lyrics.tolist())

# Row-wise sum on the sparse matrix; avoids the dense copy and the
# get_feature_names() call that recent scikit-learn releases no longer provide.
male_hits = np.asarray(count.sum(axis=1)).ravel()
song_df['male_ref'] = pd.Series(male_hits, index=song_df.index).div(song_df.song_length)

Reference to third person (female)

In [None]:
# Female third-person reference rate: occurrences of the words below per song length.
vocab=['she','her', 'girl', 'lady', 'woman']

count_vectorizer = CountVectorizer(vocabulary=vocab)
count = count_vectorizer.fit_transform(song_df.lyrics.tolist())

# Row-wise sum on the sparse matrix; avoids the dense copy and the
# get_feature_names() call that recent scikit-learn releases no longer provide.
female_hits = np.asarray(count.sum(axis=1)).ravel()
song_df['female_ref'] = pd.Series(female_hits, index=song_df.index).div(song_df.song_length)

## Encode Categorical Variables

In [None]:
# The raw text is no longer needed once the lyric-based features exist.
# errors='ignore' keeps this cell re-runnable after the column is gone.
song_df.drop(['lyrics'], axis=1, inplace=True, errors='ignore')

# Use '?' as the missing-value marker for the categorical columns only.
# Filling the whole frame would write the string '?' into numeric feature
# columns and turn them into object columns the classifier cannot use.
for col in ['genre', 'language']:
    song_df[col] = song_df[col].fillna('?')

# Replace language labels with integer codes.
song_df['language']=le.fit_transform(song_df['language'])
song_df.head()

## Train/test Sets

In [None]:
# Build a roughly balanced sample: at most 1000 songs per known genre
# ('?' marks songs whose genre is missing and is excluded).
genre_samples = []
# sorted() fixes the iteration order so the seeded sampling is reproducible.
for genre in sorted(set(song_df.genre) - {'?'}):
    # Boolean mask instead of a formatted query string, which would break on
    # genre names containing a quote character.
    genre_df = song_df[song_df.genre == genre]
    # sample() raises when asked for more rows than exist, so cap the request
    # at the group size rather than catching the error with a bare except.
    genre_samples.append(genre_df.sample(min(len(genre_df), 1000), random_state=0))

# Concatenate once; DataFrame.append was removed in pandas 2.0.
sample_df = pd.concat(genre_samples)

# Seeded like the classifier below so the split is repeatable.
train, test = train_test_split(sample_df, random_state=0)
print('Train',Counter(train.genre))
print('Test',Counter(test.genre))

# Features are every column except the target label.
X_train = train.drop('genre', axis=1)
y_train = train['genre']

X_test = test.drop('genre', axis=1)
y_test = test['genre']

## Decision Tree Classifier

In [None]:
# Fit a seeded decision tree on the training set and predict the test genres.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)
predicted = dtc.predict(X_test)

# Map each feature name to its importance in the fitted tree.
{feature: importance for feature, importance in zip(X_train.columns, dtc.feature_importances_)}

## Results

In [None]:
# Fraction of test songs whose predicted genre matches the true genre.
accuracy_score(y_true=y_test, y_pred=predicted)

In [None]:
# Confusion matrix over the genres present in the test set.
# Labels are computed once and sorted so rows, columns and tick labels share
# one stable order across runs.
labels = sorted(set(y_test))
conf_counts = confusion_matrix(y_test, predicted, labels=labels)
conf_mat = pd.DataFrame(conf_counts, index=labels, columns=labels)

# The figure is titled "Proportional", so convert raw counts into per-row
# proportions: each row (actual genre) sums to 1. Every label comes from
# y_test, so every row has at least one sample and the divisor is non-zero.
conf_mat = conf_mat.div(conf_mat.sum(axis=1), axis=0)

plt.figure(figsize=(7,7))
plt.pcolor(conf_mat, cmap='OrRd')
# Ticks sit at cell centres (0.5, 1.5, ...).
tick_positions = np.arange(0.5, len(labels), 1)
plt.yticks(tick_positions, labels)
plt.xticks(tick_positions, labels, rotation=45)
plt.title('Proportional Confusion Matrix Heatmap')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.colorbar()
plt.savefig('../Figures/conf_heatmap_combined.pdf')
plt.show()

In [None]:
# 3-fold cross validation on the training set. cv is passed explicitly because
# the library default changed from 3 to 5 folds in scikit-learn 0.22.
cross_val_score(dtc, X_train, y_train, cv=3)