### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

### Reading Data

Source: https://www.kaggle.com/rakannimer/billboard-lyrics

In [2]:
lyrics = pd.read_csv('Data/billboard-1964-2015-songs-lyrics/billboard_lyrics_1964-2015_utf-8.csv')

In [3]:
lyrics.columns = ['rank','song','artist','year','lyrics','source']

### A Quick Look At Data

In [4]:
lyrics.head()

Unnamed: 0,rank,song,artist,year,lyrics,source
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0
2,3,i cant get no satisfaction,the rolling stones,1965,,1.0
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0


In [5]:
lyrics.tail()

Unnamed: 0,rank,song,artist,year,lyrics,source
5095,96,el perdon,nicky jam and enrique iglesias,2015,enrique iglesias dime si es verdad me dijeron ...,3.0
5096,97,she knows,neyo featuring juicy j,2015,,
5097,98,night changes,one direction,2015,going out tonight changes into something red ...,1.0
5098,99,back to back,drake,2015,oh man oh man oh man not againyeah i learned ...,1.0
5099,100,how deep is your love,calvin harris and disciples,2015,i want you to breathe me in let me be your ai...,1.0


### A Sample Lyric

In [6]:
lyrics['lyrics'][0]

'sam the sham miscellaneous wooly bully wooly bully sam the sham  the pharaohs  domingo samudio uno dos one two tres quatro matty told hatty about a thing she saw had two big horns and a wooly jaw wooly bully wooly bully wooly bully wooly bully wooly bully hatty told matty lets dont take no chance lets not belseven come and learn to dance wooly bully wooly bully wooly bully wooly bully wooly bully matty told hatty thats the thing to do get you someone really to pull the wool with you wooly bully wooly bully wooly bully wooly bully wooly bully lseven  the letter l and the number 7 when typed they form a rough square l7 so the lyrics mean lets not be square'

### A Look At the "year" Column

In [7]:
year_dist = pd.DataFrame(lyrics.year.value_counts())
year_dist.columns = ['Count']
year_dist['Percentage'] = lyrics.year.value_counts(normalize=True)
year_dist

Unnamed: 0,Count,Percentage
2015,100,0.019608
1965,100,0.019608
2005,100,0.019608
2001,100,0.019608
1997,100,0.019608
1993,100,0.019608
1989,100,0.019608
1985,100,0.019608
1981,100,0.019608
1977,100,0.019608


<div class="alert alert-success">
Potential Problem with this Dataset: Good Distribution of Year
</div>

### Remove Invalid "year"

### Look At the "genre" Column

In [8]:
genre_dist = pd.DataFrame(lyrics.genre.value_counts())
genre_dist.columns = ['Count']
genre_dist['Percentage'] = lyrics.genre.value_counts(normalize=True)
genre_dist

AttributeError: 'DataFrame' object has no attribute 'genre'

<div class="alert alert-warning">
Potentail Problem with this Dataset: Need to Collect the Genres of these songs
</div>

### Number of Unique Artist

In [9]:
artist_dist = pd.DataFrame(lyrics.artist.value_counts())
artist_dist.columns = ['Count']
artist_dist['Percentage'] = lyrics.artist.value_counts(normalize=True)
artist_dist

Unnamed: 0,Count,Percentage
madonna,35,0.006863
elton john,26,0.005098
mariah carey,25,0.004902
michael jackson,22,0.004314
janet jackson,22,0.004314
stevie wonder,22,0.004314
whitney houston,19,0.003725
taylor swift,19,0.003725
rihanna,19,0.003725
the beatles,17,0.003333


In [10]:
len(lyrics.artist.value_counts())

2473

In [12]:
artist_dist.index

Index(['madonna', 'elton john', 'mariah carey', 'michael jackson',
       'janet jackson', 'stevie wonder', 'whitney houston', 'taylor swift',
       'rihanna', 'the beatles',
       ...
       'fat joe featuring nelly', 'house of pain', 'zedd featuring foxes',
       'ace frehley', '2 chainz', 'icona pop featuring charli xcx',
       'samantha mumba', 'phil collins and marilyn martin',
       'the supremes  the temptations', 'snoop dogg featuring r kelly'],
      dtype='object', length=2473)

### Checking Number of Null Values

In [13]:
lyrics.isnull().sum()

rank        0
song        0
artist      0
year        0
lyrics    187
source    187
dtype: int64

### Remove Rows With Missing Values

In [14]:
lyrics.dropna(inplace=True)

In [15]:
lyrics.shape

(4913, 6)

### Plot Distribution of Year

### Look for Specific Artist

In [16]:
lyrics.loc[lyrics.artist=='taylor swift',:]

Unnamed: 0,rank,song,artist,year,lyrics,source
4288,89,teardrops on my guitar,taylor swift,2007,drew looks at me i fake a smile so he wont se...,1.0
4340,41,our song,taylor swift,2008,i was riding shotgun with my hair undone in t...,1.0
4347,48,teardrops on my guitar,taylor swift,2008,drew looks at me i fake a smile so he wont se...,1.0
4380,81,love story,taylor swift,2008,we were both young when i first saw you i clo...,1.0
4404,5,love story,taylor swift,2009,we were both young when i first saw you i clo...,1.0
4410,11,you belong with me,taylor swift,2009,youre on the phone with your girlfriend shes ...,1.0
4475,76,white horse,taylor swift,2009,say youre sorry that face of an angel comes o...,1.0
4545,46,mine,taylor swift,2010,oh oh oh oh oh ohyou were in college working ...,1.0
4556,57,you belong with me,taylor swift,2010,youre on the phone with your girlfriend shes ...,1.0
4583,84,today was a fairytale,taylor swift,2010,today was a fairytale you were the prince i u...,1.0


### Vectorizing the Lyrics

In [17]:
len(list(lyrics['lyrics']))

4913

In [81]:
lyrics_data = list(lyrics['lyrics'])
count_vect = CountVectorizer(token_pattern=r'\b\w+\b')
lyrics_mat = count_vect.fit_transform(lyrics_data)
lyrics_mat.toarray().shape

(4913, 42160)

In [82]:
count_vect.vocabulary_

{'sam': 31008,
 'the': 36071,
 'sham': 32028,
 'miscellaneous': 23535,
 'wooly': 40718,
 'bully': 5540,
 'pharaohs': 27610,
 'domingo': 10379,
 'samudio': 31066,
 'uno': 38506,
 'dos': 10557,
 'one': 26106,
 'two': 38099,
 'tres': 37645,
 'quatro': 28953,
 'matty': 22474,
 'told': 37062,
 'hatty': 16339,
 'about': 437,
 'a': 365,
 'thing': 36247,
 'she': 32127,
 'saw': 31183,
 'had': 15968,
 'big': 4199,
 'horns': 17561,
 'and': 1597,
 'jaw': 19125,
 'lets': 20627,
 'dont': 10440,
 'take': 35550,
 'no': 25093,
 'chance': 6503,
 'not': 25273,
 'belseven': 3953,
 'come': 7520,
 'learn': 20484,
 'to': 36912,
 'dance': 8845,
 'thats': 36045,
 'do': 10203,
 'get': 14582,
 'you': 41617,
 'someone': 33460,
 'really': 29367,
 'pull': 28803,
 'wool': 40716,
 'with': 40532,
 'lseven': 21831,
 'letter': 20628,
 'l': 20099,
 'number': 25516,
 '7': 295,
 'when': 40004,
 'typed': 38145,
 'they': 36222,
 'form': 13773,
 'rough': 30613,
 'square': 34042,
 'l7': 20100,
 'so': 33311,
 'lyrics': 21945,
 

In [83]:
count_vect.inverse_transform(lyrics_mat[0])

[array(['be', 'mean', 'lyrics', 'so', 'l7', 'square', 'rough', 'form',
        'they', 'typed', 'when', '7', 'number', 'l', 'letter', 'lseven',
        'with', 'wool', 'pull', 'really', 'someone', 'you', 'get', 'do',
        'thats', 'dance', 'to', 'learn', 'come', 'belseven', 'not',
        'chance', 'no', 'take', 'dont', 'lets', 'jaw', 'and', 'horns',
        'big', 'had', 'saw', 'she', 'thing', 'a', 'about', 'hatty', 'told',
        'matty', 'quatro', 'tres', 'two', 'one', 'dos', 'uno', 'samudio',
        'domingo', 'pharaohs', 'bully', 'wooly', 'miscellaneous', 'sham',
        'the', 'sam'], dtype='<U46')]

#### find top k words

below cell go over the entire matrix (~5000*40000 ~200 millions operations), takes ~1 minutes

In [140]:
word_count = dict()
for i in range(lyrics_mat.toarray().shape[0]):
    curr = list(lyrics_mat[i].toarray()[0])
    for wi in range(len(curr)):
        if curr[wi] > 0:
            if wi not in word_count:
                word_count[wi] = curr[wi]
            else:
                word_count[wi] += curr[wi]

In [144]:
k = 10

keys_list = list(word_count.keys())
counts_list = list(word_count.values())
argsorted_count = np.argsort(counts_list)

top_k_word = []
top_k_count = []
bot_k_word = []
bot_k_count = []

for i in range(k):
    bot_k_word.append(count_vect.get_feature_names()[keys_list[argsorted_count[i]]])
    bot_k_count.append(counts_list[argsorted_count[i]])
for i in range(1,k+1):
    top_k_word.append(count_vect.get_feature_names()[keys_list[argsorted_count[-i]]])
    top_k_count.append(counts_list[argsorted_count[-i]])

print("Top\tWord\tCount")
for i in range(k):
    print(str(i+1)+"\t"+top_k_word[i]+"\t\t"+str(top_k_count[i]))
    
print("Bottom\tWord\t\tCount")
for i in range(k):
    print(str(i+1)+"\t"+bot_k_word[i]+"\t"+str(bot_k_count[i]))

Top	Word	Count
1	you		64606
2	i		56514
3	the		53451
4	to		35752
5	and		32555
6	me		31170
7	a		29282
8	it		25688
9	my		22821
10	in		18553
Bottom	Word		Count
1	thereacross	1
2	sleepsjust	1
3	sleepsit	1
4	sleepsi	1
5	cleanit	1
6	cleanill	1
7	wellmm	1
8	selfcentered	1
9	seea	1
10	seagram	1


### Apply Tf-Idf

source: https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction


source: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer

The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given document is to scale down the impact of tokens that occur very frequently in a given corpus and that are hence empirically less informative than features that occur in a small fraction of the training corpus.

The formula that is used to compute the tf-idf for a term t of a document d in a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is computed as idf(t) = log [ n / df(t) ] + 1 (if smooth_idf=False), where n is the total number of documents in the document set and df(t) is the document frequency of t; the document frequency is the number of documents in the document set that contain the term t. The effect of adding “1” to the idf in the equation above is that terms with zero idf, i.e., terms that occur in all documents in a training set, will not be entirely ignored. (Note that the idf formula above differs from the standard textbook notation that defines the idf as idf(t) = log [ n / (df(t) + 1) ]).

In [131]:
transformer = TfidfTransformer()
tfidf_mat = transformer.fit_transform(lyrics_mat)

In [132]:
transformer.idf_

array([8.80669637, 8.80669637, 8.80669637, ..., 8.80669637, 6.4088011 ,
       8.11354919])

Note: idf - the higher the more useful the word is (less frequent in other document)

#### find top k words

below cell go over the entire matrix (~5000*40000 ~200 millions operations), takes ~1 minutes

In [145]:
tfidf_count = dict()
for i in range(tfidf_mat.toarray().shape[0]):
    curr = list(tfidf_mat[i].toarray()[0])
    for wi in range(len(curr)):
        if curr[wi] > 0:
            if wi not in tfidf_count:
                tfidf_count[wi] = curr[wi]
            else:
                tfidf_count[wi] += curr[wi]

In [146]:
k = 10

keys_list = list(tfidf_count.keys())
counts_list = list(tfidf_count.values())
argsorted_count = np.argsort(counts_list)

top_k_word = []
top_k_count = []
bot_k_word = []
bot_k_count = []

for i in range(k):
    bot_k_word.append(count_vect.get_feature_names()[keys_list[argsorted_count[i]]])
    bot_k_count.append(counts_list[argsorted_count[i]])
for i in range(1,k+1):
    top_k_word.append(count_vect.get_feature_names()[keys_list[argsorted_count[-i]]])
    top_k_count.append(counts_list[argsorted_count[-i]])

print("Top\tWord\tCount")
for i in range(k):
    print(str(i+1)+"\t"+top_k_word[i]+"\t"+str(top_k_count[i]))
    
print("Bottom\tWord\t\tCount")
for i in range(k):
    print(str(i+1)+"\t"+bot_k_word[i]+"\t\t"+str(bot_k_count[i]))

Top	Word	Count
1	you	544.550865358322
2	i	485.692936922662
3	the	411.68270345703814
4	to	298.62608717206115
5	me	282.03058257911346
6	and	272.4908298336047
7	a	243.1203736606626
8	it	221.47677805701213
9	my	218.8723947203573
10	love	201.53706475639328
Bottom	Word		Count
1	wre		0.006070876886015375
2	thoingwe		0.006070876886015375
3	thoingto		0.006070876886015375
4	thoingnow		0.006070876886015375
5	thoiathoiathoing		0.006070876886015375
6	crys		0.006070876886015375
7	clubnow		0.006070876886015375
8	sum		0.006070876886015375
9	godly		0.008965238853695262
10	hollerinay		0.008965238853695262


## Testing things out, ignore below

Using TfidfVectorizer()

source: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer.get_stop_words

In [28]:
tfidf_vect = TfidfVectorizer()
tfidf_mat = tfidf_vect.fit_transform(lyrics_data)
tfidf_mat.toarray().shape

(4913, 42123)

In [33]:
len(tfidf_vect.idf_)

42123

In [34]:
len(tfidf_vect.vocabulary_)

42123

In [44]:
len(tfidf_vect.get_feature_names())

42123

now we know TfidfVectorizer.idf_ maps to either TfidfVectorizer.vocabulary_ or TfidfVectorizer.get_feature_names(), need to find out which one so we can get the top k / bottom k words with highest/lowest idf

In [98]:
test_vect = CountVectorizer()
corpus = ["the the the", "the an an", "the an important"]
test_vect.fit_transform(corpus)
# test_vect.idf_

<3x3 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [99]:
test_vect.get_feature_names()

['an', 'important', 'the']

In [100]:
test_vect.vocabulary_

{'an': 0, 'important': 1, 'the': 2}

In [116]:
test_vect.transform(["the the the"]).toarray()

array([[0, 0, 3]], dtype=int64)

we now know the TfidfVectorizer.idf_ is mapped to TfidfVectorizer.get_feature_names()

TfidfVectorizer.vocabulary_ is a mapping of terms to feature indices instead of a mapping of terms to term frequencies, I misunderstood

now wanna figure out what does preprocessor, analyzer and tokenizer of CountVectorizer do 

In [77]:
test_vect = CountVectorizer(ngram_range=(1,2),token_pattern=r'\b\w+\b')
corpus = ["the the the", "the an an", "the an important"]
preprocessor = test_vect.build_preprocessor()
tokenizer = test_vect.build_tokenizer()
analyzer = test_vect.build_analyzer()

preprocessor("  the an  important  !@#@! 123 1 ! .? ")

'  the an  important  !@#@! 123 1 ! .? '

In [78]:
tokenizer("  the an a important  !@#@! 123 1 ! .? ")

['the', 'an', 'a', 'important', '123', '1']

In [79]:
analyzer("  the an a important  !@#@! 123 1 ! .? ")

['the',
 'an',
 'a',
 'important',
 '123',
 '1',
 'the an',
 'an a',
 'a important',
 'important 123',
 '123 1']