In [30]:
# analysis of Taylor Swift lyrics using SVD
import numpy as np
import pandas as pd

import re
import string

from sklearn.feature_extraction import text
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, DBSCAN
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline
np.set_printoptions(suppress=True)

In [31]:
# read data from file
tay = pd.read_csv('TaylorSwiftLyricsFeatureSet.csv')

In [32]:
tay['year_released'] = tay['track_album']
tay['year_released'] = tay['year_released'].replace(['Taylor Swift', 'Fearless', 'Speak Now', 'Red', '1989',
       'reputation', 'Lover'],[2006,2008,2010,2012,2014,2017,2019])
tay['world_sales_USD'] = tay['track_album'].replace(['Taylor Swift', 'Fearless', 'Speak Now', 'Red', '1989',
       'reputation', 'Lover'],[7000000,12000000,5500000,6000000,10500000,5000000,4000000])
song_titles = tay['track_title'].values

In [33]:
# clean data before applying NMF
# remove \n
n = lambda x: re.sub('\n',' ',x)
alpha = lambda x: re.sub('\w*\d\w*', ' ', x)
# remove punctuation and make all lyrics lowercase
lowercase = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

tay['track_lyric'] = tay.track_lyric.map(n).map(lowercase).map(alpha)

# data after cleaning
tay.track_lyric[0]

'he said the way my blue eyes shined put those georgia stars to shame that night i said   that s a lie  just a boy in a chevy truck that had a tendency of gettin  stuck on backroads at night and i was right there beside him all summer long and then the time we woke up to find that summer gone but when you think tim mcgraw i hope you think my favorite song the one we danced to all night long the moon like a spotlight on the lake when you think happiness i hope you think that little black dress think of my head on your chest and my old faded blue jeans when you think tim mcgraw i hope you think of me september saw a month of tears and thankin  god that you weren t here to see me like that but in a box beneath my bed is a letter that you never read from three summers back it s hard not to find it all a little bittersweet and lookin  back on all of that  it s nice to believe when you think tim mcgraw i hope you think my favorite song the one we danced to all night long the moon like a spot

In [34]:
# remove words that we don't want to include in analysis 
stop_words = ['just','don','gonna','cause','ll','ve','got','oh','eh','aah','want','way','away','ooh','wanna','ain','yeah','hey', 'did']
remove = text.ENGLISH_STOP_WORDS.union(stop_words)


In [35]:
# TF-IDF vectorizer
vectorizer = CountVectorizer(stop_words=remove,min_df=0.1,max_df=0.7)
V = vectorizer.fit_transform(tay.track_lyric)
V = pd.DataFrame(V.toarray(), columns=vectorizer.get_feature_names())
vocab = np.array(vectorizer.get_feature_names())

InvalidParameterError: The 'stop_words' parameter of CountVectorizer must be a str among {'english'}, an instance of 'list' or None. Got frozenset({'bottom', 'many', 'oh', 'their', 'before', 'via', 'nothing', 'eg', 'cause', 'thereupon', 'an', 'describe', 'found', 'way', 'sixty', 'sincere', 'and', 'fifty', 'two', 'fifteen', 'they', 'from', 'at', 'a', 'others', 'beforehand', 'eight', 'per', 'we', 'one', 'become', 'de', 'someone', 'hereafter', 'there', 'twenty', 'only', 'forty', 'below', 'when', 'same', 'throughout', 'both', 'name', 'this', 'thru', 'former', 'through', 'upon', 'call', 'up', 'detail', 'ourselves', 'empty', 'anyway', 'still', 'which', 'more', 'mostly', 'co', 'again', 'beside', 'whence', 'who', 'not', 'them', 'whether', 'ooh', 'because', 'being', 'none', 'these', 'ltd', 'six', 'hence', 'latter', 'although', 're', 'within', 'also', 'either', 'neither', 'but', 'do', 'off', 'see', 'should', 'were', 'my', 'own', 'therein', 'seem', 'whom', 'got', 'much', 'yourself', 'itself', 'serious', 'was', 'everything', 'therefore', 'last', 'him', 'seemed', 'between', 'hereupon', 'often', 'however', 'myself', 'out', 'for', 'latterly', 'hundred', 'except', 'hereby', 'elsewhere', 'go', 'could', 'together', 'twelve', 'front', 'than', 'am', 'next', 'of', 'top', 'want', 'nine', 've', 'mill', 'those', 'otherwise', 'been', 'into', 'moreover', 'to', 'sometime', 'every', 'nobody', 'nevertheless', 'full', 'formerly', 'may', 'thin', 'be', 'show', 'himself', 'fire', 'since', 'back', 'where', 'less', 'whose', 'whenever', 'please', 'five', 'just', 'keep', 'too', 'might', 'while', 'give', 'amoungst', 'interest', 'gonna', 'whereby', 'your', 'further', 'amongst', 'had', 'now', 'along', 'ever', 'became', 'whereupon', 'other', 'down', 'never', 'part', 'perhaps', 'its', 'side', 'onto', 'even', 'noone', 'namely', 'inc', 'whereafter', 'everywhere', 'is', 'indeed', 'anything', 'another', 'each', 'don', 'nowhere', 'anywhere', 'll', 'cannot', 'have', 'seems', 'after', 'under', 'us', 'yours', 'will', 'here', 'take', 'system', 'though', 'thick', 'about', 'so', 'couldnt', 'move', 'already', 'can', 'whole', 'our', 'somehow', 'toward', 'towards', 'beyond', 'behind', 'few', 'else', 'everyone', 'without', 'among', 'bill', 'with', 'herself', 'thereafter', 'over', 'aah', 'across', 'etc', 'herein', 'until', 'ours', 'made', 'thus', 'why', 'on', 'i', 'fill', 'his', 'several', 'alone', 'me', 'almost', 'ie', 'are', 'you', 'three', 'eleven', 'the', 'whatever', 'cry', 'must', 'well', 'whither', 'very', 'nor', 'it', 'such', 'yourselves', 'get', 'themselves', 'any', 'becomes', 'somewhere', 'ain', 'third', 'enough', 'something', 'due', 'he', 'amount', 'or', 'against', 'anyhow', 'what', 'hey', 'hers', 'by', 'anyone', 'in', 'least', 'her', 'how', 'becoming', 'find', 'during', 'as', 'eh', 'done', 'four', 'meanwhile', 'away', 'whereas', 'she', 'once', 'un', 'first', 'would', 'thence', 'wherever', 'all', 'most', 'put', 'above', 'that', 'con', 'besides', 'thereby', 'cant', 'has', 'wherein', 'rather', 'did', 'ten', 'seeming', 'mine', 'wanna', 'whoever', 'hasnt', 'no', 'yet', 'around', 'always', 'sometimes', 'then', 'some', 'if', 'afterwards', 'yeah'}) instead.

In [None]:
%time U, S, Vt = linalg.svd(V, full_matrices=False)
S

In [None]:
# singular values of s
plt.plot(S);
xvals = np.linspace(0, 99)
yvals = [25 for x in xvals]
plt.plot(xvals,yvals)
plt.xlabel('Order')
plt.ylabel('Singular Values')
plt.title('Scree Plot SVD')

In [None]:
plt.plot(S[:16]);
plt.xlabel('Order')
plt.ylabel('Singular Values')
plt.title('Scree Plot SVD')

In [None]:
# topics found using SVD
num_top_words=15;

def print_topics(v):
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
    topic_words = ([top_words(t) for t in v])
    return [' '.join(t) for t in topic_words]

print_topics(Vt[:5])

In [None]:
Resolving problems
Connection
Short love


Passionate, ruthless love
Remorse, regret
Night out
Nostalgia

In [None]:
topics = 
['Fights, hard feelings',
 'Growing Up, Home',
 'Dreams',
 'Remembering',
 'New good love',
 'Twisted love',
 '7',
 '8',
 'Change']