In [53]:
# analysis of Taylor Swift lyrics using NMF
import numpy as np
import pandas as pd
import seaborn as sns

import re
import string

from sklearn.feature_extraction import text
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.cluster import KMeans, DBSCAN
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline
np.set_printoptions(suppress=True)

In [54]:
# read data from file
tay = pd.read_csv('TaylorSwiftLyricsFeatureSet.csv')

In [55]:
tay['year_released'] = tay['track_album']
tay['year_released'] = tay['year_released'].replace(['Taylor Swift', 'Fearless', 'Speak Now', 'Red', '1989',
       'reputation', 'Lover'],[2006,2008,2010,2012,2014,2017,2019])
tay['world_sales_USD'] = tay['track_album'].replace(['Taylor Swift', 'Fearless', 'Speak Now', 'Red', '1989',
       'reputation', 'Lover'],[7000000,12000000,5500000,6000000,10500000,5000000,4000000])
song_titles = tay['track_title'].values
album = tay['track_album'].values
tay

Unnamed: 0,track_album,track_artist,track_title,danceability,energy,key,loudness,mode,speechiness,acousticness,...,valence,tempo,duration_ms,time_signature,track_uri,track_id,track_lyric,genres,year_released,world_sales_USD
0,Taylor Swift,Taylor Swift,Tim McGraw,0.580,0.491,0,-6.462,1,0.0251,0.57500,...,0.425,76.009,232107,4,spotify:track:0Om9WAB5RS09L80DyOfTNa,0Om9WAB5RS09L80DyOfTNa,He said the way my blue eyes shined\nPut those...,Country,2006,7000000
1,Taylor Swift,Taylor Swift,Picture To Burn,0.658,0.877,7,-2.098,1,0.0323,0.17300,...,0.821,105.586,173067,4,spotify:track:32mVHdy0bi1XKgr0ajsBlG,32mVHdy0bi1XKgr0ajsBlG,"State the obvious, I didn't get my perfect fan...",Country Rock,2006,7000000
2,Taylor Swift,Taylor Swift,Teardrops On My Guitar,0.621,0.417,10,-6.941,1,0.0231,0.28800,...,0.289,99.953,203040,4,spotify:track:7zMcNqs55Mxer82bvZFkpg,7zMcNqs55Mxer82bvZFkpg,Drew looks at me\nI fake a smile so he won't s...,Country Pop,2006,7000000
3,Taylor Swift,Taylor Swift,A Place In This World,0.576,0.777,9,-2.881,1,0.0324,0.05100,...,0.428,115.028,199200,4,spotify:track:73OX8GdpOeGzKC6OvGSbsv,73OX8GdpOeGzKC6OvGSbsv,"I don't know what I want, so don't ask me\nCau...",Country Pop,2006,7000000
4,Taylor Swift,Taylor Swift,Cold as You,0.418,0.482,5,-5.769,1,0.0266,0.21700,...,0.261,175.558,239013,4,spotify:track:7an1exwMnfYRcdVQm0yDev,7an1exwMnfYRcdVQm0yDev,You have a way of coming easily to me\nAnd whe...,Country,2006,7000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,Lover,Taylor Swift,Miss Americana & the Heartbreak Prince,0.662,0.747,11,-6.926,0,0.0736,0.02800,...,0.487,150.088,234147,4,spotify:track:214nt20w5wOxJnY462klLw,214nt20w5wOxJnY462klLw,"You know I adore you, I'm crazier for you\nTha...",,2019,4000000
107,Lover,Taylor Swift,Paper Rings,0.811,0.719,9,-6.553,1,0.0497,0.01290,...,0.865,103.979,222400,4,spotify:track:4y5bvROuBDPr5fuwXbIBZR,4y5bvROuBDPr5fuwXbIBZR,The moon is high\nLike your friends were the n...,,2019,4000000
108,Lover,Taylor Swift,Soon You'll Get Better (feat. Dixie Chicks),0.433,0.182,0,-12.566,1,0.0641,0.90700,...,0.421,207.476,201587,4,spotify:track:4AYtqFyFbX0Xkc2wtcygTr,4AYtqFyFbX0Xkc2wtcygTr,The buttons of my coat were tangled in my hair...,,2019,4000000
109,Lover,Taylor Swift,The Archer,0.292,0.574,0,-9.375,1,0.0401,0.12000,...,0.166,124.344,211240,4,spotify:track:3pHkh7d0lzM2AldUtz2x37,3pHkh7d0lzM2AldUtz2x37,Combat\nI'm ready for combat\nI say I don't wa...,Dream Pop,2019,4000000


In [56]:
# clean data before applying NMF
# remove \n
n = lambda x: re.sub('\n',' ',x)
alpha = lambda x: re.sub('\w*\d\w*', ' ', x)
# remove punctuation and make all lyrics lowercase
lowercase = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

tay['track_lyric'] = tay.track_lyric.map(n).map(lowercase).map(alpha)

# data after cleaning
tay.track_lyric

0      he said the way my blue eyes shined put those ...
1      state the obvious  i didn t get my perfect fan...
2      drew looks at me i fake a smile so he won t se...
3      i don t know what i want  so don t ask me caus...
4      you have a way of coming easily to me and when...
                             ...                        
106    you know i adore you  i m crazier for you than...
107    the moon is high like your friends were the ni...
108    the buttons of my coat were tangled in my hair...
109    combat i m ready for combat i say i don t want...
110    you are somebody that i don t know but you re ...
Name: track_lyric, Length: 111, dtype: object

In [57]:
# remove words that we don't want to include in analysis 
all_remove = ['just','don','gonna','cause','ll','ve','got','oh','eh','aah','want','way','away','ooh','wanna','ain','hey']
remove = text.ENGLISH_STOP_WORDS.union(all_remove)

In [58]:
tfidf_v = TfidfVectorizer(stop_words=remove,min_df=0.1)
tfidf_freq = tfidf_v.fit_transform(tay.track_lyric)

InvalidParameterError: The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got frozenset({'before', 'of', 'moreover', 'whether', 'get', 'cannot', 'among', 'aah', 'below', 'so', 'although', 'whenever', 'or', 'made', 'hasnt', 'myself', 'former', 'who', 'less', 'his', 'ltd', 'eg', 'done', 'hereupon', 'front', 'thus', 'one', 'whereupon', 'them', 'our', 'none', 'forty', 'full', 'want', 'two', 'few', 'whatever', 'me', 'formerly', 'they', 'should', 'well', 'whole', 'the', 'move', 'sixty', 'most', 'still', 'serious', 'onto', 'itself', 'am', 'how', 'each', 'only', 'via', 'that', 'to', 'some', 'bill', 'enough', 'if', 'least', 'herself', 'anyhow', 'either', 'go', 'cant', 'from', 'since', 'been', 'other', 'might', 'by', 'everything', 'indeed', 'nobody', 'beforehand', 'about', 'these', 'take', 'nor', 'third', 'were', 'ie', 'de', 'once', 'everyone', 'wherein', 'anyone', 'around', 'side', 'us', 'when', 'becoming', 'yours', 'whence', 'someone', 'within', 'hereby', 'whereas', 'empty', 'six', 'ain', 'thereafter', 'bottom', 'therefore', 'more', 'will', 'further', 'him', 'name', 'both', 'down', 'therein', 'fifteen', 'sincere', 'hers', 'latterly', 'etc', 've', 'back', 'eleven', 'ever', 'somewhere', 'else', 'toward', 'may', 'yet', 'mine', 'during', 'inc', 'your', 'many', 'mostly', 'seeming', 'too', 'why', 'can', 'amoungst', 'not', 'co', 'three', 'twelve', 'hey', 'same', 'something', 'she', 'top', 'eight', 'nowhere', 'nine', 'have', 'there', 'besides', 'along', 'own', 'you', 'any', 'five', 'thin', 'its', 'see', 'latter', 'namely', 'sometime', 'anything', 'whom', 'away', 'll', 'others', 'way', 'hundred', 'con', 'hereafter', 'thick', 'being', 'rather', 'part', 'seems', 'across', 'elsewhere', 'per', 'do', 'perhaps', 'out', 'it', 'beyond', 'whither', 'find', 'whereafter', 'fill', 'which', 'through', 'a', 'what', 'due', 'would', 'upon', 'couldnt', 'my', 'every', 'even', 'such', 'almost', 'be', 'because', 'several', 'for', 'has', 'could', 'at', 'in', 're', 'yourselves', 'keep', 'with', 'thereby', 'he', 'then', 'eh', 'seemed', 'after', 'also', 'give', 'as', 'ours', 'an', 'neither', 'first', 'amount', 'on', 'above', 'their', 'next', 'i', 'and', 'here', 'describe', 'than', 'much', 'very', 'where', 'wanna', 'thence', 'must', 'always', 'hence', 'without', 'anywhere', 'sometimes', 'meanwhile', 'last', 'otherwise', 'those', 'alone', 'cry', 'no', 'please', 'seem', 'is', 'system', 'themselves', 'into', 'already', 'interest', 'become', 'mill', 'are', 'ourselves', 'another', 'this', 'however', 'somehow', 'himself', 'never', 'afterwards', 'between', 'whose', 'becomes', 'fifty', 'became', 'found', 'noone', 'four', 'all', 'don', 'herein', 'cause', 'put', 'we', 'thru', 'un', 'call', 'beside', 'under', 'up', 'over', 'while', 'nevertheless', 'her', 'just', 'now', 'show', 'wherever', 'ten', 'was', 'ooh', 'yourself', 'throughout', 'got', 'often', 'against', 'again', 'twenty', 'whereby', 'together', 'detail', 'everywhere', 'until', 'whoever', 'behind', 'amongst', 'fire', 'though', 'nothing', 'towards', 'except', 'off', 'thereupon', 'but', 'gonna', 'had', 'oh', 'anyway'}) instead.

In [None]:
# TF-IDF vectorizer
cv = TfidfVectorizer(stop_words='english',min_df=0.1,max_df=0.7)
x = cv.fit_transform(tay.track_lyric).toarray()
df = pd.DataFrame(x,columns=cv.get_feature_names())

df

In [None]:
# line plot of the eigenvalues of components
lsa = TruncatedSVD(100)
lsa.fit(df)
lsa_features = lsa.transform(df)
plt.figure()
plt.plot(lsa.explained_variance_ratio_)
xvals = np.linspace(0, 99)
yvals = [0.025 for x in xvals]
plt.plot(xvals,yvals)
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance')
plt.title('Scree Plot TF-IDF')

In [None]:
# line plot of the eigenvalues of components
lsa = TruncatedSVD(10)
lsa.fit(df)
lsa_features = lsa.transform(df)
plt.figure()
plt.plot(lsa.explained_variance_ratio_)
plt.xlabel('Number of Components')
plt.ylabel('Explained ')
plt.title('Scree Plot TF-IDF')

In [None]:
def print_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [None]:
# apply NMF
def nmf_function(num_components, matrix, vectorizer):
    nmf = NMF(num_components)
    topic = nmf.fit_transform(matrix)
    
    index = []
    for i in range(num_components):
        index.append(i)
    topic_word = pd.DataFrame(nmf.components_.round(3),
             index = index,
             columns = vectorizer.get_feature_names())
    
    print(print_topics(nmf, vectorizer.get_feature_names(), 15))
    return topic_word

In [None]:
# apply NMF
nmf_function(9,df,cv)

In [None]:
nmf_function(9,tfidf_freq,tfidf_v)

In [None]:
topic_labels = ['Permanency, belonging','New things','Light, love',
                'Trying, moving on','Staying, not moving on','Missing something, sad',
                'Regret, remorse','New relationship','Contemplation, remembering']

In [None]:
# get H matrix
def nmf_HMatrix(num_components, doc_text_matrix, vectorizer):
    nmf = NMF(num_components)
    doc_topic = nmf.fit_transform(doc_text_matrix)
    
    idx = []
    for i in range(num_components):
        idx.append(i) 
    H = pd.DataFrame(doc_topic.round(3),
                    index = song_titles,
                    columns = idx)
    return H

h9 = nmf_HMatrix(3,df,cv)
h9

In [None]:
# cosine similarity 
doc_similarity_matrix9 = pd.DataFrame(cosine_similarity(h9))
doc_similarity_matrix9.head(15)

In [None]:
doc_similarity_matrix9[0].nlargest(5)

In [None]:
tay.track_title[0],tay.track_title[83],tay.track_title[93],tay.track_title[58],tay.track_title[14]

In [None]:
doc_similarity_matrix9[13].nlargest(5)

In [None]:
tay.track_title[44],tay.track_title[55],tay.track_title[73]

In [None]:
# k clustering on NMF (using TF-IDF)
kmeans9 = KMeans(n_clusters=5,random_state=555)
clustering_ori9 = kmeans9.fit_predict(h9)

In [None]:
kmeans9.cluster_centers_

Clusters with Relation to Topics (based on cluster_centers_)
Cluster 0: Mostly about Topic 3
Cluster 1: Mostly about Topic 5
Cluster 2: Mostly about Topics 2 and 4
Cluster 3: Mostly about Topics 6, 8, 7, and 0
Cluster 4: Mostly about Topic 1. Some Topic 5 and 2.

In [None]:
# visulaizing clusters with TSNE
dbscan9 = DBSCAN(eps=0.2,min_samples=3).fit(h9)
Xtsne9 = TSNE(n_components=2,random_state=467).fit_transform(h9)
dftsne9 = pd.DataFrame(Xtsne9)
dftsne9['cluster'] = clustering_ori9
dftsne9.columns = ['x1','x2','cluster']
dftsne2d9 = dftsne9
dftsne2d9['cluster'] = clustering_ori9

tay_df = pd.concat([tay,dftsne2d9],axis=1)
tay_df.to_csv(r'full_df2.csv',index=False, header=True)

In [None]:
plt.figure(figsize=(6,6))
sns.scatterplot(data=dftsne2d9,x='x1',y='x2',hue='cluster',legend="full",alpha=0.7)
plt.title('Visualized on TSNE 9 topics')
