In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import glob
import re
import math

In [3]:
#Import all CSVs with lyrics

allFiles = glob.glob("*.csv")
df = pd.DataFrame()
list_ = []

for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0)
    list_.append(df)
    
df = pd.concat(list_)
df = df.drop('Unnamed: 0', 1)

In [4]:
#Dropping if lyrics not found
df = df.replace('Lyrics not found.', float('NaN')).replace('’', '\'').replace('\n', ' ')
df = df.dropna()
df = df.reset_index()
df = df.drop('index', 1)

In [5]:
df['Lyrics'] = df['Lyrics'].str.replace('\n', ' ').str.replace('\r', ' ').str.replace('\t', ' ').str.replace('Produced By: ', '')

In [6]:
lyric_dict = df.to_dict('records')

In [7]:
for entry in lyric_dict:
    entry['Lyrics'] = re.sub("[\(\[].*?[\)\]]", "", entry['Lyrics'])
    entry['Lyrics'] = entry['Lyrics'].strip()
    entry['Lyrics'] = " ".join(entry['Lyrics'].split())

In [8]:
for entry in lyric_dict[-5:-1]:
    print(entry['Lyrics'])
    print()

My face above the water My feet can't touch the ground Touch the ground, and it feels like I can see the sands on the horizon Everytime you are not around I'm slowly drifting away Wave after wave, wave after wave I'm slowly drifting And it feels like I'm drowning Pulling against the stream Pulling against the stream I wish I could make it easy Easy to love me, love me But still I reach, to find a way I'm stuck here in between I'm looking for the right words to say I'm slowly drifting, drifting away Wave after wave, wave after wave I'm slowly drifting And it feels like I'm drowning Pulling against the stream Pulling against the stream

You know, from the moment she turn around She know, how to back it up and drop it down She know, she what all the fellas looking at Cause they know, soon as her song come on it's a wrap And she loves the attention That she get when she moves, yeah Showin' out with her home girls Hypnotizing all the dudes, oh And she knows She knows She knows I know, I kno

In [9]:
#Put lyrics into a list
text = []
for entry in lyric_dict:
    text.append(entry['Lyrics'])
    
#Put years into decades and make a list
years = []
for entry in lyric_dict:
    entry['Decade'] = math.floor(entry['Year']/10)*10
    years.append(entry['Decade'])

## Initial Modeling

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import Normalizer
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

With Stop Words

In [12]:
vectorizer = CountVectorizer()
stops = vectorizer.fit_transform(text).toarray()

# pd.DataFrame(no_stops, columns=vectorizer.get_feature_names())

lsa = TruncatedSVD(100, algorithm = 'randomized') #Truncate dimensions
stop_lsa = lsa.fit_transform(stops)
stop_lsa = Normalizer(copy=False).fit_transform(stop_lsa)

X_train, X_test, y_train, y_test = train_test_split(stop_lsa,years,test_size=.3,random_state=42)
gbm = xgb.XGBClassifier()
gbm.fit(X_train, y_train)
gbm.score(X_test, y_test)

  preds = preds.reshape(nrow, preds.size / nrow)


0.28164196123147095

No Stop Words

In [14]:
#Stop Words
vectorizer = CountVectorizer(stop_words='english',min_df=1)
no_stops = vectorizer.fit_transform(text).toarray()

# pd.DataFrame(no_stops, columns=vectorizer.get_feature_names())

lsa = TruncatedSVD(100, algorithm = 'randomized') #Truncate dimensions
no_stop_lsa = lsa.fit_transform(no_stops)
no_stop_lsa = Normalizer(copy=False).fit_transform(no_stop_lsa)

In [22]:
no_stops.shape

(5845, 23206)

In [16]:
#Random Forest
X_train, X_test, y_train, y_test = train_test_split(no_stop_lsa,years,test_size=.3,random_state=42)
model = RandomForestClassifier()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.2491448118586089

In [27]:
#XGBoost
gbm = xgb.XGBClassifier()
gbm.fit(X_train, y_train)
gbm.score(X_test, y_test)

  preds = preds.reshape(nrow, preds.size / nrow)


0.2839224629418472

Bigrams

In [17]:
#Bigrams
bigram_vectorizer = CountVectorizer(ngram_range=(1,2),min_df=1)
bigrams = bigram_vectorizer.fit_transform(text).toarray()

# pd.DataFrame(bigrams, columns=bigram_vectorizer.get_feature_names())

lsa = TruncatedSVD(100, algorithm = 'randomized') #Truncate dimensions
bigram_lsa = lsa.fit_transform(bigrams)
bigram_lsa = Normalizer(copy=False).fit_transform(bigram_lsa)

In [23]:
bigrams.shape

(5845, 312401)

In [18]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(bigram_lsa,years,test_size=.3,random_state=42)
model = RandomForestClassifier()
model.fit(X_train2,y_train2)
model.score(X_test2,y_test2)

0.23546180159635119

In [28]:
#XGBoost
gbm = xgb.XGBClassifier()
gbm.fit(X_train2, y_train2)
gbm.score(X_test2, y_test2)

  preds = preds.reshape(nrow, preds.size / nrow)


0.2839224629418472

In [19]:
#TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(text).toarray()

lsa = TruncatedSVD(100, algorithm = 'randomized')
tfidf_lsa = lsa.fit_transform(tfidf)
tfidf_lsa = Normalizer(copy=False).fit_transform(tfidf_lsa)

In [24]:
tfidf.shape

(5845, 23496)

In [20]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(tfidf_lsa,years,test_size=.3,random_state=42)
model = RandomForestClassifier()
model.fit(X_train3,y_train3)
model.score(X_test3,y_test3)

0.26396807297605474

In [29]:
gbm = xgb.XGBClassifier()
gbm.fit(X_train3, y_train3)
gbm.score(X_test3, y_test3)

  preds = preds.reshape(nrow, preds.size / nrow)


0.33751425313568983

Seems like TFIDF gave best score here

## Feature Union

In [11]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [12]:
df = pd.DataFrame(lyric_dict)

decade = []
for year in df['Year']:
    decade.append(math.floor(year/10)*10)

df['Decade'] = decade

In [13]:
X = df.Lyrics
y = df.Decade
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3,random_state=42)

In [14]:
class Length_Count(TransformerMixin):
    def transform(self, X, **transform_params):
        
        array = []
        for text in X:
            array.append([len(text)])
        
        return array

    def fit(self, X, y=None, **fit_params):
        return self

In [15]:
class NumCount(TransformerMixin):
    def transform(self, X, **transform_params): 
        return pd.DataFrame(X.apply(lambda x: len(re.findall(r'\d', x))))
    def fit(self, X, y=None, **fit_params):
        return self

TFIDF w/ Length

In [72]:
pipeline = Pipeline([
    ('feats', FeatureUnion([
        ("TFIDF", TfidfVectorizer()), # can pass in either a pipeline
        ('Length_Count', Length_Count())# or a transformer
    ])),
    ("xgboost", xgb.XGBClassifier())  # classifier
])

pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

  preds = preds.reshape(nrow, preds.size / nrow)


0.37913340935005702

TFIDF w/ length and numbers

In [88]:
xgboost = Pipeline([
    ('feats', FeatureUnion([
        ("TFIDF", TfidfVectorizer()), # can pass in either a pipeline
        ('Length_Count', Length_Count()),
        ('NumCount', NumCount())# or a transformer
    ])),
    ("xgboost", xgb.XGBClassifier())  # classifier
])

xgboost.fit(X_train, y_train)
xgboost.score(X_test, y_test)

  preds = preds.reshape(nrow, preds.size / nrow)


0.38369441277080957

# Multinomial Logistic Regression
### Coding:
0: 1950's    1: 1960's    2:1970's    3:1980's    4:1990's    5:2000's    6:2010's

In [16]:
target = []
for entry in df['Decade']:
    if entry == 1950:
        target.append(0)
    elif entry == 1960:
        target.append(1)
    elif entry == 1970:
        target.append(2)
    elif entry == 1980:
        target.append(3)
    elif entry == 1990:
        target.append(4)
    elif entry == 2000:
        target.append(5)
    elif entry == 2010:
        target.append(6)

df['Target'] = target

df['50s'] = df['Decade'] == 1950
df['60s'] = df['Decade'] == 1960
df['70s'] = df['Decade'] == 1970
df['80s'] = df['Decade'] == 1980
df['90s'] = df['Decade'] == 1990
df['00s'] = df['Decade'] == 2000
df['10s'] = df['Decade'] == 2010

In [17]:
df.head()

Unnamed: 0,Artist,Decade,Lyrics,Title,Year,Target,50s,60s,70s,80s,90s,00s,10s
0,Gordon Jenkins,1950,"Gordon Jenkins Miscellaneous Goodnight, Irene ...",Goodnight Irene,1950,0,True,False,False,False,False,False,False
1,Nat King Cole,1950,"Mona Lisa, Mona Lisa, men have named you You'r...",Mona Lisa,1950,0,True,False,False,False,False,False,False
2,Teresa Brewer,1950,"Music, music, music So put another nickel in I...","Music, Music, Music",1950,0,True,False,False,False,False,False,False
3,Guy Lombardo,1950,When a zither starts to play You'll remember y...,Third Man Theme,1950,0,True,False,False,False,False,False,False
4,Red Foley,1950,Red Foley Miscellaneous Chattanoogie Shoe-shin...,Chattanoogie Shoe Shine Boy,1950,0,True,False,False,False,False,False,False


## CSV The DataFrame

In [321]:
df.to_csv('LyricsData.csv')

In [324]:
df2 = patients = pd.read_csv("LyricsData.csv", index_col=0)
df2

Unnamed: 0,Artist,Decade,Lyrics,Title,Year,Target,50s,60s,70s,80s,90s,00s,10s
0,Gordon Jenkins,1950,"Gordon Jenkins Miscellaneous Goodnight, Irene ...",Goodnight Irene,1950,0,True,False,False,False,False,False,False
1,Nat King Cole,1950,"Mona Lisa, Mona Lisa, men have named you You'r...",Mona Lisa,1950,0,True,False,False,False,False,False,False
2,Teresa Brewer,1950,"Music, music, music So put another nickel in I...","Music, Music, Music",1950,0,True,False,False,False,False,False,False
3,Guy Lombardo,1950,When a zither starts to play You'll remember y...,Third Man Theme,1950,0,True,False,False,False,False,False,False
4,Red Foley,1950,Red Foley Miscellaneous Chattanoogie Shoe-shin...,Chattanoogie Shoe Shine Boy,1950,0,True,False,False,False,False,False,False
5,Sammy Kaye,1950,I saw the harbor lights They only told me we w...,Harbor Lights,1950,0,True,False,False,False,False,False,False
6,Kay Starr,1950,Met the man I love In a town way down in Dixie...,Bonaparte’s Retreat,1950,0,True,False,False,False,False,False,False
7,Tony Martin,1950,There's no tomorrow when love is new Now is fo...,There’s No Tomorrow,1950,0,True,False,False,False,False,False,False
8,Phil Harris,1950,While I was walking down the beach one bright ...,The Thing,1950,0,True,False,False,False,False,False,False
9,Andrews Sisters,1950,I wanna be loved with inspiration I wanna be l...,I Wanna Be Loved,1950,0,True,False,False,False,False,False,False


In [253]:
pd.DataFrame(X.apply(lambda x: len(re.findall(r'love', x))))


1    Mona Lisa, Mona Lisa, men have named you You'r...
Name: Lyrics, dtype: object

In [213]:
from gensim.models import Word2Vec

In [255]:
from sklearn.base import TransformerMixin

class LengthTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        lengths = pd.DataFrame(X.apply(lambda x: len(x.split())))
        return lengths
    def fit(self, X, y=None, **fit_params):
        return self
    
class CapTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        cap_lengths = pd.DataFrame(X.apply(lambda x: len([i for i in x.split() if i[0].isupper()])))
        return cap_lengths
    def fit(self, X, y=None, **fit_params):
        return self
    
class NumCount(TransformerMixin):
    def transform(self, X, **transform_params): 
        return pd.DataFrame(X.apply(lambda x: len(re.findall(r'\d', x))))
    def fit(self, X, y=None, **fit_params):
        return self

class ApostropheCount(TransformerMixin):
    def transform(self, X, **transform_params):
        return pd.DataFrame(X.apply(lambda x: len(re.findall(r'\'', x))))
    def fit(self, X, y=None, **fit_params):
        return self
    
class LoveCount(TransformerMixin):
    def transform(self, X, **transform_params):
        return pd.DataFrame(X.apply(lambda x: len(re.findall(r'love', x))))
    def fit(self, X, y=None, **fit_params):
        return self
    
class WordVec(TransformerMixin):
    def transform(self, X, **transform_params):
        # Train word2vec
        texts = [[word for word in document.lower().split()] for document in X]
        w2v = Word2Vec(texts, size=100, window=5, min_count=1, workers=4, sg=0)
        # Make features
        def word2vec(document):
            vectors = pd.Series([w2v[word] for word in document.lower().split()]).mean()
            return pd.Series(vectors)
        df = pd.concat([word2vec(X.iloc[idx]) for idx in range(len(X))], axis=1).T
        return df
    def fit(self, X, y=None, **fit_params):
        return self


In [329]:
XTarg = df.Lyrics
yTarg = df.Target
X_trainTarg, X_testTarg, y_trainTarg, y_testTarg = train_test_split(XTarg, yTarg, test_size=.3,random_state=42)

xgboostTarg = Pipeline([
    ('feats', FeatureUnion([
        ("TFIDF", TfidfVectorizer()),
        ("CountVector", CountVectorizer()),
        ('Length', LengthTransformer()),
        ('LoveCount', LoveCount()),
        ('Caps', CapTransformer()),
        ('Apostrophes', ApostropheCount()),
        ('NumCount', NumCount())
    ])),
    ("xgboost", xgb.XGBClassifier(max_depth=5, n_estimators=1000, learning_rate=0.05))  # classifier
])

xgboostTarg.fit(X_trainTarg, y_trainTarg)
xgboostTarg.score(X_testTarg, y_testTarg)

0.42018244013683009

Best score so far: 0.4207525655644242

In [308]:
from sklearn.pipeline import make_pipeline

In [309]:
XTarg = df.Lyrics
yTarg = df.Target
X_trainTarg, X_testTarg, y_trainTarg, y_testTarg = train_test_split(XTarg, yTarg, test_size=.3,random_state=42)

tfidf_vect = TfidfVectorizer()
count_vect = CountVectorizer()
length_vect = LengthTransformer()
love_count_vect = LoveCount()
caps_vect = CapTransformer()
apos_vect = ApostropheCount()
num_vect = NumCount()

feats = FeatureUnion([
        ("TFIDF", tfidf_vect),
        ("CountVector", count_vect),
        ('Length', length_vect),
        ('LoveCount', love_count_vect),
        ('Caps', caps_vect),
        ('Apostrophes', apos_vect),
        ('NumCount', num_vect)
    ])
clf = LogisticRegression()


logregTarg = Pipeline([
    ('feats', feats),
    ("logreg", clf)  # classifier
])

logregTarg.fit(X_trainTarg, y_trainTarg)
logregTarg.score(X_testTarg, y_testTarg)

0.38996579247434437

In [310]:
joblib.dump(logregTarg, "LogReg.pkl")

['LogReg.pkl',
 'LogReg.pkl_01.npy',
 'LogReg.pkl_02.npy',
 'LogReg.pkl_03.npy',
 'LogReg.pkl_04.npy',
 'LogReg.pkl_05.npy',
 'LogReg.pkl_06.npy']

In [304]:
#Pickle Model
from sklearn.externals import joblib

In [305]:
joblib.dump(xgboostTarg, "LyricPredictor.pkl")

['LyricPredictor.pkl',
 'LyricPredictor.pkl_01.npy',
 'LyricPredictor.pkl_02.npy',
 'LyricPredictor.pkl_03.npy']

## Components per decade

In [131]:
#1950s
df_50s = df.loc[df['50s'] == True]

vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(df_50s.Lyrics).toarray()

lsa = TruncatedSVD(5, algorithm = 'randomized')
tfidf_lsa = lsa.fit_transform(tfidf)
tfidf_lsa = Normalizer(copy=False).fit_transform(tfidf_lsa)

lsa.explained_variance_ratio_

array([ 0.00932984,  0.01045777,  0.00984829,  0.00859546,  0.00816747])

In [132]:
components = pd.DataFrame(lsa.components_.round(5),index = ["component_1","component_2","component_3","component_4","component_5"],columns = vectorizer.get_feature_names())
components[0:1]

Unnamed: 0,000,01,05,05471992,10,100,1004,105,10th,11,...,zone,zu,zula,zurück,zzzzzzoooooooooooommmmmmmmmmmm,½now,½son,½whoa,ça,être
component_1,2e-05,0.00126,0.00066,0.00108,0.00132,0.00092,0.00031,0.00474,0.00033,0.00141,...,0.00063,0.0,0.00127,0.0,0.00032,0.00049,0.00072,0.00072,3e-05,3e-05


In [137]:
comp = []
for i in range(len(components.columns)):
    comp.append((components[4:5].values[0][i], components.columns[i]))
    
sorted(comp, key=lambda x: x[0], reverse=True)

[(0.33115, 'oh'),
 (0.30854999999999999, 'll'),
 (0.22917999999999999, 'tell'),
 (0.16208, 'know'),
 (0.13084999999999999, 'day'),
 (0.1203, 'ooh'),
 (0.097879999999999995, 'wah'),
 (0.088919999999999999, 'walk'),
 (0.085139999999999993, 'darling'),
 (0.080860000000000001, 'wait'),
 (0.074429999999999996, 'lonely'),
 (0.06948, 'fall'),
 (0.06762, 'lost'),
 (0.061510000000000002, 'dark'),
 (0.060490000000000002, 've'),
 (0.0579, 'remember'),
 (0.051589999999999997, 'send'),
 (0.049399999999999999, 'gone'),
 (0.047980000000000002, 'darlin'),
 (0.045969999999999997, 'glow'),
 (0.04546, 'fool'),
 (0.044670000000000001, 'home'),
 (0.042419999999999999, 'time'),
 (0.040620000000000003, 'moon'),
 (0.04045, 'does'),
 (0.03977, 'away'),
 (0.039730000000000001, 'prayer'),
 (0.038199999999999998, 'sincerely'),
 (0.0361, 'fools'),
 (0.035540000000000002, 'uh'),
 (0.035450000000000002, 'lonesome'),
 (0.035380000000000002, 'way'),
 (0.035360000000000003, 'close'),
 (0.0349, 'die'),
 (0.0348800000000

1: you, the, me, and, my     2: you, me, ll, love, please, true, be    3: instrumental, melody, your, yours, mine, of    4: why, oh, baby, tell, oooh, wah   5: why, your love, heart, of

No stop words

1: love, 'll, know, oh, heart, don'    2: instrumental, love, melody, true, heart, butterfly   3: baby, oh, yeah, ooh, don'    4: oh, love, ooh, wah, fall, fool   5: oh, 'll, tell, know, day, ooh, wah

In [138]:
#1960s
df_60s = df.loc[df['60s'] == True]

vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(df_60s.Lyrics).toarray()

lsa = TruncatedSVD(5, algorithm = 'randomized')
tfidf_lsa = lsa.fit_transform(tfidf)
tfidf_lsa = Normalizer(copy=False).fit_transform(tfidf_lsa)

lsa.explained_variance_ratio_

array([ 0.01105556,  0.02049932,  0.01221918,  0.00931959,  0.0090708 ])

In [139]:
components = pd.DataFrame(lsa.components_.round(5),index = ["component_1","component_2","component_3", "component_4", "component_5"],columns = vectorizer.get_feature_names())

In [145]:
comp = []
for i in range(len(components.columns)):
    comp.append((components[4:5].values[0][i], components.columns[i]))
    
sorted(comp, key=lambda x: x[0], reverse=True)

[(0.82606000000000002, 'la'),
 (0.13775000000000001, 'oh'),
 (0.11312999999999999, 'come'),
 (0.075569999999999998, 'sha'),
 (0.070319999999999994, 'girl'),
 (0.066739999999999994, 'lyrics'),
 (0.065320000000000003, 'little'),
 (0.065110000000000001, 'da'),
 (0.055149999999999998, 'yeah'),
 (0.054390000000000001, 'man'),
 (0.054179999999999999, 'woman'),
 (0.053900000000000003, 'uh'),
 (0.043720000000000002, 'got'),
 (0.043279999999999999, 'jean'),
 (0.041459999999999997, 'shangri'),
 (0.041439999999999998, 'days'),
 (0.038780000000000002, 'dance'),
 (0.037379999999999997, 'today'),
 (0.037310000000000003, 'right'),
 (0.03721, 'friend'),
 (0.036170000000000001, 'home'),
 (0.035450000000000002, 'whoa'),
 (0.03397, 'day'),
 (0.033779999999999998, 'cried'),
 (0.033410000000000002, 'huh'),
 (0.032969999999999999, 'time'),
 (0.032870000000000003, 'place'),
 (0.032689999999999997, 'young'),
 (0.03218, 'br'),
 (0.03193, 'away'),
 (0.031789999999999999, 'ya'),
 (0.0315, 'sound'),
 (0.03109, 't

1: you, the, me, and, my, to, it     2: instrumental, scratch, baby, br, good, little    3: you, love, me, baby, don, need    4: lyrics, yet, love, we, not, for    5: baby, lyrics, yet, do, not, have

No Stop Words

1: love, baby, don', oh, ll, know    2: instrumental, scratch, br, good, troubles, kicks    3: baby, yeah, come, got, twist    4: baby, la, love, need, angel    5: la, oh, come, sha, girl

In [146]:
#1970s
df_70s = df.loc[df['70s'] == True]

vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(df_70s.Lyrics).toarray()

lsa = TruncatedSVD(5, algorithm = 'randomized')
tfidf_lsa = lsa.fit_transform(tfidf)
tfidf_lsa = Normalizer(copy=False).fit_transform(tfidf_lsa)

lsa.explained_variance_ratio_

array([ 0.00955456,  0.01513386,  0.01103329,  0.0094408 ,  0.00900587])

In [147]:
components = pd.DataFrame(lsa.components_.round(5),index = ["component_1","component_2","component_3","component_4","component_5"],columns = vectorizer.get_feature_names())
components[0:1]

Unnamed: 0,05,10,11,13,14,15,17,185,19,1951,...,½oh,½please,½yeah,½you,áhr,ãi,éhr,êhr,ôhr,ôi
component_1,0.00045,0.00116,0.00075,0.00012,0.00075,0.00019,0.00137,0.00072,4e-05,0.00048,...,0.00208,0.00051,0.00065,0.00073,0.0,0.0,0.0,0.0,0.0,0.0


In [152]:
comp = []
for i in range(len(components.columns)):
    comp.append((components[4:5].values[0][i], components.columns[i]))
    
sorted(comp, key=lambda x: x[0], reverse=True)

[(0.78002000000000005, 'baby'),
 (0.19822999999999999, 'dance'),
 (0.094189999999999996, 'boogie'),
 (0.086269999999999999, 'come'),
 (0.083089999999999997, 'oh'),
 (0.068390000000000006, 'lovin'),
 (0.067000000000000004, 'doin'),
 (0.066189999999999999, 'girl'),
 (0.061789999999999998, 'ooh'),
 (0.061269999999999998, 'yeah'),
 (0.048619999999999997, 'wanna'),
 (0.046600000000000003, 'want'),
 (0.042389999999999997, 'shake'),
 (0.040910000000000002, 'arms'),
 (0.04002, 'rock'),
 (0.03984, 'dancing'),
 (0.035159999999999997, 'worry'),
 (0.030349999999999999, 'fine'),
 (0.0298, 'beach'),
 (0.029350000000000001, 'future'),
 (0.029059999999999999, 'fallin'),
 (0.028219999999999999, 'wonderful'),
 (0.028129999999999999, 'let'),
 (0.02691, 'hot'),
 (0.026849999999999999, 'tonight'),
 (0.026710000000000001, 'hey'),
 (0.02623, 'sweet'),
 (0.024240000000000001, 'hooked'),
 (0.022020000000000001, 'hoo'),
 (0.02053, 'satisfy'),
 (0.019480000000000001, 'ya'),
 (0.019369999999999998, 'winner'),
 (0

1: you, the, to, me, and, love     2: instrumental, wild, you, thing, groovy  3: you, love, baby, me, do, want    4: la, baby, na, sing, ooh, da   5: she, her, baby, me, my, love, woman

no stop words

1: love, baby, don', oh, know    2: instrumental, wild, thing, groovy, make    3: la, baby, sing, na, hot, da sha    4: love, la, fall, fallin'    5: baby, dance, boogie, come

In [153]:
#1980s
df_80s = df.loc[df['80s'] == True]

vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(df_80s.Lyrics).toarray()

lsa = TruncatedSVD(5, algorithm = 'randomized')
tfidf_lsa = lsa.fit_transform(tfidf)
tfidf_lsa = Normalizer(copy=False).fit_transform(tfidf_lsa)

lsa.explained_variance_ratio_

array([ 0.00995029,  0.01016244,  0.00816092,  0.00788759,  0.00704507])

In [154]:
components = pd.DataFrame(lsa.components_.round(5),index = ["component_1","component_2","component_3","component_4","component_5"],columns = vectorizer.get_feature_names())
components[0:1]

Unnamed: 0,05,10,12,15,16th,1780,18,1963,1983,1987,...,zone,zoom,zoomin,zu,zum,zwei,½can,½i,½oh,½that
component_1,0.00029,0.00077,0.00071,0.00015,0.00043,1e-05,0.00195,1e-05,0.00069,0.00027,...,0.00163,0.00017,0.00232,1e-05,1e-05,0.0002,0.00104,0.00012,0.00072,0.00104


In [159]:
comp = []
for i in range(len(components.columns)):
    comp.append((components[4:5].values[0][i], components.columns[i]))
    
sorted(comp, key=lambda x: x[0], reverse=True)

[(0.46731, 'gonna'),
 (0.38407000000000002, 'wanna'),
 (0.28344999999999998, 'baby'),
 (0.23208999999999999, 'tonight'),
 (0.14147000000000001, 'let'),
 (0.13625999999999999, 'got'),
 (0.13095999999999999, 'yeah'),
 (0.12368, 'somebody'),
 (0.11992, 'love'),
 (0.090160000000000004, 'dance'),
 (0.078710000000000002, 'ooh'),
 (0.064689999999999998, 'make'),
 (0.06037, 'rock'),
 (0.057660000000000003, 'everybody'),
 (0.049770000000000002, 'gotta'),
 (0.046449999999999998, 'really'),
 (0.046039999999999998, 'use'),
 (0.0458, 'ain'),
 (0.043650000000000001, 'na'),
 (0.041680000000000002, 'lose'),
 (0.041090000000000002, 'right'),
 (0.036409999999999998, 'feelin'),
 (0.035060000000000001, 'body'),
 (0.034750000000000003, 'set'),
 (0.032939999999999997, 'drive'),
 (0.031469999999999998, 'celebrate'),
 (0.0309, 'cause'),
 (0.030519999999999999, 'honey'),
 (0.02972, 'slow'),
 (0.029180000000000001, 'heartache'),
 (0.028910000000000002, 'cool'),
 (0.028070000000000001, 'man'),
 (0.02760999999999

1: you, the, to, me, it, love    2: the, she, we, her of, in    3: she, her, you, girl, want, do, me    4: love, we, she, this, my, ve    5: oh, it, baby, love, don', wanna

no stop words

1: love, don', know, baby, oh    2: love, ve, ooh, true, bring   3: oh, yeah, little, gonna, baby, girl, tonight   4: don', oh, want, baby, wanna    5: gonna, wanna, baby, tonight, let

In [122]:
#1990s
df_90s = df.loc[df['90s'] == True]

vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(df_90s.Lyrics).toarray()

lsa = TruncatedSVD(5, algorithm = 'randomized')
tfidf_lsa = lsa.fit_transform(tfidf)
tfidf_lsa = Normalizer(copy=False).fit_transform(tfidf_lsa)

lsa.explained_variance_ratio_

array([ 0.01183453,  0.01172155,  0.00911093,  0.0080742 ,  0.00783685])

In [123]:
components = pd.DataFrame(lsa.components_.round(5),index = ["component_1","component_2","component_3","component_4","component_5"],columns = vectorizer.get_feature_names())
components[0:1]

Unnamed: 0,000,03,10,100,1000,105,11,110,112,12,...,½baby,½fineâ,½i,½opposites,½that,½this,½what,½where,½who,ôem
component_1,0.00038,6e-05,0.00114,0.00013,4e-05,8e-05,0.00128,8e-05,0.00198,5e-05,...,0.00057,0.00099,0.00099,0.0001,0.0009,0.00071,0.00094,1e-05,1e-05,0.00011


In [129]:
comp = []
for i in range(len(components.columns)):
    comp.append((components[4:5].values[0][i], components.columns[i]))
    
sorted(comp, key=lambda x: x[0], reverse=True)

[(0.42420999999999998, 'll'),
 (0.40600000000000003, 'want'),
 (0.25448999999999999, 'baby'),
 (0.17493, 'need'),
 (0.13618, 'don'),
 (0.079060000000000005, 'hold'),
 (0.070709999999999995, 'girl'),
 (0.065049999999999997, 'forever'),
 (0.052150000000000002, 'stay'),
 (0.040719999999999999, 'swear'),
 (0.038600000000000002, 'tight'),
 (0.037199999999999997, 'tonight'),
 (0.036339999999999997, 'promise'),
 (0.0361, 'tell'),
 (0.035650000000000001, 'till'),
 (0.030890000000000001, 'know'),
 (0.030710000000000001, 'make'),
 (0.029669999999999998, 'wish'),
 (0.02928, 'yea'),
 (0.02681, 'feelings'),
 (0.026759999999999999, 'hurt'),
 (0.02664, 'treat'),
 (0.026460000000000001, 'boy'),
 (0.025239999999999999, 'stand'),
 (0.023560000000000001, 'holding'),
 (0.02257, 'taste'),
 (0.021850000000000001, 'soon'),
 (0.02145, 'away'),
 (0.020209999999999999, 'live'),
 (0.020080000000000001, 'turn'),
 (0.01976, 'minute'),
 (0.019120000000000002, 'loved'),
 (0.019050000000000001, 'strength'),
 (0.01904

1: you, the, to, me, and, it    2: the, and, she, we, of, it, in    3: it, do, get, me, baby, wanna, yeah, up    4: she, her, you, wants, woman, my, was, girl, gone    5: do, the, anything, remember

No stop words

1: love, baby, don', 'll, know, want    2: love, 'll, heart, forever, believe, remember, gave    3: baby, yeah, love, oh, body, girl    4: wanna, love, don', baby, really, miss    5: 'll want, baby, need, don, hold, girl, forever

In [167]:
#2000s
df_00s = df.loc[df['00s'] == True]

vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(df_00s.Lyrics).toarray()

lsa = TruncatedSVD(5, algorithm = 'randomized')
tfidf_lsa = lsa.fit_transform(tfidf)
tfidf_lsa = Normalizer(copy=False).fit_transform(tfidf_lsa)

lsa.explained_variance_ratio_

array([ 0.0070239 ,  0.01245605,  0.00999609,  0.00955855,  0.00779036])

In [168]:
components = pd.DataFrame(lsa.components_.round(5),index = ["component_1","component_2","component_3","component_4","component_5"],columns = vectorizer.get_feature_names())
components[0:1]

Unnamed: 0,000,02,03,06,07,0z,10,100,105,106,...,½nothing,½only,½ooh,½stop,½that,½they,½what,½why,½yee,½you
component_1,0.00099,0.00061,0.00086,0.00043,0.00011,0.0001,0.00428,0.00174,0.00047,0.0007,...,0.00038,0.00042,0.0002,0.00024,0.00126,0.00045,0.00064,0.0007,0.00082,0.00078


In [174]:
comp = []
for i in range(len(components.columns)):
    comp.append((components[2:3].values[0][i], components.columns[i]))
    
sorted(comp, key=lambda x: x[0], reverse=True)

[(0.78754999999999997, 'na'),
 (0.21446999999999999, 'da'),
 (0.10367, 'like'),
 (0.10353, 'ya'),
 (0.092969999999999997, 'got'),
 (0.088480000000000003, 'shit'),
 (0.074859999999999996, 'shake'),
 (0.073760000000000006, 'uh'),
 (0.073639999999999997, 'hey'),
 (0.072440000000000004, 'rock'),
 (0.067809999999999995, 'wit'),
 (0.059769999999999997, 'nigga'),
 (0.054269999999999999, 'hot'),
 (0.049340000000000002, 'fuck'),
 (0.048160000000000001, 'ass'),
 (0.047870000000000003, 'yo'),
 (0.047169999999999997, 'club'),
 (0.046969999999999998, 'ain'),
 (0.045789999999999997, 'em'),
 (0.044749999999999998, 'gon'),
 (0.044409999999999998, 'money'),
 (0.041610000000000001, 'girl'),
 (0.041050000000000003, 'bitch'),
 (0.04095, 'pop'),
 (0.03984, 'boy'),
 (0.03977, 'work'),
 (0.038399999999999997, 'shawty'),
 (0.037879999999999997, 'thong'),
 (0.033939999999999998, 'drop'),
 (0.032309999999999998, 'big'),
 (0.032059999999999998, 'ma'),
 (0.031789999999999999, 'lil'),
 (0.031579999999999997, 'hit'

1: you, the, it, and, me    2: you, 're, baby, love, me, can    3: br, lt, gt, you, baby, low,    4: na, da, it, baby, rock, hey, ya, like   5: na, the, and, my, of life

no stop words

1: love, don', know, baby    2: br, lt, gt, low, ya, hoo, flo, girl, lifetime    3: na, da, like, ya, got, shit, nigga, fuck, ass, club    4: na, love, baby, heart, away, way, life goodbye    5: love, baby, girl, ooh, way, shawty

In [187]:
#2010s
df_10s = df.loc[df['10s'] == True]

vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(df_10s.Lyrics).toarray()

lsa = TruncatedSVD(5, algorithm = 'randomized')
tfidf_lsa = lsa.fit_transform(tfidf)
tfidf_lsa = Normalizer(copy=False).fit_transform(tfidf_lsa)

lsa.explained_variance_ratio_

array([ 0.00744164,  0.01404402,  0.01198148,  0.01084061,  0.01021922])

In [188]:
components = pd.DataFrame(lsa.components_.round(5),index = ["component_1","component_2","component_3","component_4","component_5"],columns = vectorizer.get_feature_names())
components[0:1]

Unnamed: 0,000,10,100,11,11th,12,13,13th,14,15,...,zipper,zippers,zod,zombie,zone,zoned,zoo,zoowap,zuchinis,zula
component_1,0.00449,0.00319,0.00424,0.00088,0.00077,0.00126,0.00076,6e-05,0.00136,0.00175,...,0.00039,0.00084,0.0004,0.0004,0.00811,0.00081,0.00103,0.00226,0.00025,0.00078


In [194]:
comp = []
for i in range(len(components.columns)):
    comp.append((components[4:5].values[0][i], components.columns[i]))
    
sorted(comp, key=lambda x: x[0], reverse=True)

[(0.57001000000000002, 'love'),
 (0.17150000000000001, 'baby'),
 (0.14044999999999999, 'heart'),
 (0.13605, 'need'),
 (0.12609000000000001, 'll'),
 (0.10055, 've'),
 (0.077030000000000001, 'deep'),
 (0.071290000000000006, 'hold'),
 (0.066830000000000001, 'way'),
 (0.061550000000000001, 'na'),
 (0.056649999999999999, 'hopeless'),
 (0.052810000000000003, 'gone'),
 (0.050520000000000002, 'ooh'),
 (0.048059999999999999, 'stay'),
 (0.046859999999999999, 'let'),
 (0.044769999999999997, 'inside'),
 (0.044630000000000003, 'won'),
 (0.043959999999999999, 'drug'),
 (0.043830000000000001, 'feel'),
 (0.043299999999999998, 'break'),
 (0.043249999999999997, 'broken'),
 (0.041689999999999998, 'dream'),
 (0.038510000000000003, 'darling'),
 (0.037569999999999999, 'away'),
 (0.036330000000000001, 'think'),
 (0.035860000000000003, 'die'),
 (0.035779999999999999, 'care'),
 (0.035610000000000003, 'learn'),
 (0.035110000000000002, 'scars'),
 (0.034250000000000003, 'loved'),
 (0.033270000000000001, 'place'),

1: you, the, it, and, me, to    2: you, love, baby, re, me, without, our, need ,know   3: oh, we, whoa, tonight, gonna, waiting  4: la, na, only, oh, ah, she her, tonight, knows    5: na, we, re, ready, your, our replay

no stop words

1: oh, like, love, don', know, baby    2: oh, whoa, tonight, waiting, gonna, woah, good, roar, feeling   3: la, na, world, ah, girl, knows, want   4: na, ready, replay, ceiling, ipod   5: love, baby, heart, need, 'll, ve, deep

## Top Phrases per Decade

In [284]:
from nltk.util import ngrams
from textblob import TextBlob

from collections import defaultdict
from operator import itemgetter

from nltk.corpus import stopwords
stop = stopwords.words('english')

counter = defaultdict(int)

n = 3
for doc in df_50s.Lyrics:
    words = TextBlob(doc).words
    words = [w for w in words if w not in stop]
    bigrams = ngrams(words, n)
    for gram in bigrams:
        counter[gram] += 1
            
for gram, count in sorted(counter.items(), key = itemgetter(1), reverse=True)[:30]:
    phrase = " ".join(gram)
    print('{}, {}'.format(phrase, count))

ah ah ah, 101
na na na, 82
'm gon na, 67
I 'm gon, 67
I ca n't, 49
I wan na, 49
dab um dab, 43
I know I, 41
la la la, 38
daba daba daba, 36
wan na dance, 28
um dab um, 28
Davy Davy Crockett, 27
Ah ah ah, 25
That 's I, 24
ca n't come, 24
Do n't know, 24
gon na get, 23
say I 'm, 23
're gon na, 23
I 've got, 22
I wo n't, 22
baby bye bye, 22
Come along party, 21
I need love, 21
along party doll, 21
And I 'll, 21
oh oh oh, 21
yip yip yip, 20
I 'd like, 20


In [285]:
counter = defaultdict(int)

n = 3
for doc in df_60s.Lyrics:
    words = TextBlob(doc).words
    words = [w for w in words if w not in stop]
    bigrams = ngrams(words, n)
    for gram in bigrams:
        counter[gram] += 1
            
for gram, count in sorted(counter.items(), key = itemgetter(1), reverse=True)[:30]:
    phrase = " ".join(gram)
    print('{}, {}'.format(phrase, count))

I ca n't, 255
la la la, 182
'm gon na, 175
I 'm gon, 174
I 've got, 124
yeah yeah yeah, 114
na na na, 112
I love I, 109
I wan na, 107
're gon na, 91
love I love, 78
I know I, 77
's gon na, 63
Do n't know, 58
You know I, 57
And I 'm, 56
I 'll never, 55
I n't know, 52
I ai n't, 50
And I 'll, 49
da da da, 47
oh oh oh, 45
I 'm love, 45
gon na make, 44
ca n't get, 44
gong gong gong, 44
I wo n't, 42
um um um, 42
n't know I, 38
I n't care, 37


In [289]:
counter = defaultdict(int)

n = 3
for doc in df_70s.Lyrics:
    words = TextBlob(doc).words
    words = [w for w in words if w not in stop]
    bigrams = ngrams(words, n)
    for gram in bigrams:
        counter[gram] += 1
            
for gram, count in sorted(counter.items(), key = itemgetter(1), reverse=True)[:30]:
    phrase = " ".join(gram)
    print('{}, {}'.format(phrase, count))

I ca n't, 338
la la la, 249
I wan na, 203
I 've got, 143
I 'm gon, 140
'm gon na, 139
're gon na, 106
I know I, 97
's gon na, 95
n't wan na, 72
Do n't know, 67
I n't know, 65
oh oh oh, 62
I wo n't, 62
na na na, 61
know I 'm, 60
I 'll never, 58
n't know I, 57
I ai n't, 52
I n't want, 51
La la la, 51
And I 'm, 51
I love I, 49
yeah yeah yeah, 49
I believe I, 48
I n't wan, 47
dit doo dit, 45
love I 'm, 45
You know I, 45
dance dance dance, 44


In [290]:
counter = defaultdict(int)

n = 3
for doc in df_80s.Lyrics:
    words = TextBlob(doc).words
    words = [w for w in words if w not in stop]
    bigrams = ngrams(words, n)
    for gram in bigrams:
        counter[gram] += 1
            
for gram, count in sorted(counter.items(), key = itemgetter(1), reverse=True)[:30]:
    phrase = " ".join(gram)
    print('{}, {}'.format(phrase, count))

I ca n't, 453
I wan na, 311
na na na, 242
oh oh oh, 226
'm gon na, 213
I 'm gon, 211
I 've got, 181
's gon na, 158
I know I, 148
're gon na, 141
n't wan na, 133
I n't know, 110
I n't want, 110
I ai n't, 106
I n't wan, 95
You know I, 95
I wo n't, 91
Do n't know, 90
I know 's, 88
ca n't go, 79
know I 'm, 69
Oh oh oh, 67
yeah yeah yeah, 65
I love I, 60
And I 'm, 60
ca n't get, 59
n't know I, 59
I think I, 59
gon na make, 57
And I n't, 56


In [291]:
counter = defaultdict(int)

n = 3
for doc in df_90s.Lyrics:
    words = TextBlob(doc).words
    words = [w for w in words if w not in stop]
    bigrams = ngrams(words, n)
    for gram in bigrams:
        counter[gram] += 1
            
for gram, count in sorted(counter.items(), key = itemgetter(1), reverse=True)[:30]:
    phrase = " ".join(gram)
    print('{}, {}'.format(phrase, count))

I wan na, 645
I ca n't, 528
I 'm gon, 237
n't wan na, 233
I n't know, 210
I know I, 204
'm gon na, 195
's gon na, 178
I wo n't, 164
I ai n't, 160
I n't wan, 152
na na na, 137
And I 'm, 133
know I 'm, 130
da da da, 128
n't know I, 118
I 've got, 117
I n't want, 117
You 've got, 111
're gon na, 110
gon na make, 103
la la la, 103
You know I, 102
gon na get, 99
yeah yeah yeah, 99
I got ta, 98
n't let go, 91
I think I, 83
I 'll give, 82
I wish I, 81


In [292]:
counter = defaultdict(int)

n = 3
for doc in df_00s.Lyrics:
    words = TextBlob(doc).words
    words = [w for w in words if w not in stop]
    bigrams = ngrams(words, n)
    for gram in bigrams:
        counter[gram] += 1
            
for gram, count in sorted(counter.items(), key = itemgetter(1), reverse=True)[:30]:
    phrase = " ".join(gram)
    print('{}, {}'.format(phrase, count))

lt br gt, 1222
I ca n't, 715
I wan na, 618
I ai n't, 356
n't wan na, 298
gt lt br, 296
br gt lt, 296
I 'm gon, 289
I n't know, 254
na na na, 252
'm gon na, 220
And I 'm, 211
I got ta, 209
la la la, 206
I I I, 196
I n't wan, 183
I know I, 160
oh oh oh, 153
da da da, 148
I wo n't, 144
yeah yeah yeah, 139
I n't want, 129
know I 'm, 125
And I n't, 121
n't know I, 121
I think I, 114
Flip Flip Flip, 114
You know I, 111
ai n't got, 111
's gon na, 108


In [293]:
counter = defaultdict(int)

n = 3
for doc in df_10s.Lyrics:
    words = TextBlob(doc).words
    words = [w for w in words if w not in stop]
    bigrams = ngrams(words, n)
    for gram in bigrams:
        counter[gram] += 1
            
for gram, count in sorted(counter.items(), key = itemgetter(1), reverse=True)[:30]:
    phrase = " ".join(gram)
    print('{}, {}'.format(phrase, count))

I wan na, 385
I ca n't, 270
lt br gt, 200
I 'm gon, 194
'm gon na, 182
la la la, 181
oh oh oh, 171
na na na, 168
I ai n't, 158
And I 'm, 117
n't wan na, 116
wan na see, 110
I know I, 106
I n't know, 98
like I 'm, 92
I wo n't, 92
I 'm one, 87
I got ta, 86
know I 'm, 83
I 'm I, 76
I 've got, 75
Oh oh oh, 70
're gon na, 70
I n't wan, 67
'm I 'm, 62
You know I, 58
love love love, 56
And I know, 55
br gt lt, 52
gt lt br, 52


## The Prediction

In [359]:
pred = xgboostTarg.predict(pd.Series("""When I find myself in times of trouble
Mother Mary comes to me
Speaking words of wisdom, let it be
And in my hour of darkness
She is standing right in front of me
Speaking words of wisdom, let it be

Let it be, let it be
Let it be, let it be
Whisper words of wisdom, let it be

And when the broken-hearted people
Living in the world agree
There will be an answer, let it be
For though they may be parted
There is still a chance that they will see
There will be an answer, let it be

Let it be, let it be
Let it be, let it be
Yeah, there will be an answer, let it be
Let it be, let it be
Let it be, let it be
Whisper words of wisdom, let it be

Let it be, let it be
Ah, let it be, yeah, let it be
Whisper words of wisdom, let it be

And when the night is cloudy
There is still a light that shines on me
Shine on until tomorrow, let it be
I wake up to the sound of music,
Mother Mary comes to me
Speaking words of wisdom, let it be

Let it be, let it be
Let it be, yeah, let it be
Oh, there will be an answer, let it be
Let it be, let it be
Let it be, yeah, let it be
Whisper words of wisdom, let it be

"""))


if pred[0] == 0:
    print('1950s')
elif pred[0] == 1:
    print('1960s')
elif pred[0] == 2:
    print('1970s')
elif pred[0] == 3:
    print('1980s')
elif pred[0] == 4:
    print('1990s')
elif pred[0] == 5:
    print('2000s')
elif pred[0] == 6:
    print('2010s')

1970s
