In [34]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest

In [35]:
import nltk
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available locally before it is read.
nltk.download('stopwords')

# Show the English stop-word list that the vectorizer will filter out.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jesse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# Read the lyrics table; the first CSV column is used as the row index.
lyrics = pd.read_csv(
    'lyrics.csv',
    index_col=0,
)
lyrics.head()

Unnamed: 0_level_0,song,lyrics
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
Zella Day,Sweet Ophelia,"Believe me now, you're too young, girl Cherry ..."
Gerard Way,Hazy Shade of Winter (feat. Ray Toro),"Time, time, time See what's become of me Tim..."
Five Finger Death Punch,Stuck in My Ways,All the shades of gray that loom inside me No ...
Five Finger Death Punch,Cradle to the Grave,The blood that runs within my veins (The blood...
Five Finger Death Punch,Coming Down,It's caving in around me What I thought was so...


In [37]:
# Rows whose lyrics are the literal placeholder "No Lyrics" carry no text:
# flag them as missing ...
is_placeholder = lyrics['lyrics'] == 'No Lyrics'
lyrics.loc[is_placeholder, 'lyrics'] = np.nan

# ... then drop every row that is left without lyrics.
lyrics.dropna(subset=['lyrics'], inplace=True)

In [38]:
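# Disabled step: drop any symbols that are not letters or spaces from the lyrics.
# NOTE: if this is re-enabled on pandas >= 2.0, pass regex=True to str.replace;
# otherwise the pattern is matched as a literal string.
# lyrics['lyrics'] = lyrics['lyrics'].str.replace('[^a-zA-Z ]', '')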

In [41]:
# Word-level tf-idf vectorizer that filters NLTK's English stop words.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    analyzer='word',
)

In [42]:
# Learn the vocabulary and build the document-term matrix in one call.
tfidf_vector = tfidf_vectorizer.fit_transform(lyrics.loc[:, 'lyrics'])

# Number of distinct features (terms) the vectorizer learned.
len(tfidf_vectorizer.get_feature_names_out())

85189

In [43]:
# Peek at the first ten entries of the fitted vocabulary.
tfidf_vectorizer.get_feature_names_out()[0:10]

array(['__', '_____', '_______________________________',
       '_______________________________________', '_a', '_accouchement',
       '_ad', '_alles', '_behold', '_benedicat'], dtype=object)

In [44]:
def top_tfidf_feats(row, features, top_n=25):
    """Return the ``top_n`` highest-scoring features of one score vector.

    Parameters
    ----------
    row : 1-D array-like
        Scores, position-aligned with ``features``.
    features : sequence
        Feature names, indexable by integer position.
    top_n : int, default 25
        Maximum number of rows in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``'feature'`` and ``'tfidf'``, highest score first. The two
        columns are always present, even when nothing is selected
        (``top_n=0`` or an empty ``row``).
    """
    # Ascending argsort, reversed, so the largest scores come first.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor: assigning ``df.columns`` afterwards
    # raises a length-mismatch ValueError when ``top_feats`` is empty.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    """Rank the features of a single document by score.

    Row ``row_id`` of ``Xtr`` is converted to a dense array, squeezed, and
    passed to ``top_tfidf_feats`` together with ``features`` and ``top_n``;
    that function's result is returned unchanged.
    """
    dense_row = Xtr[row_id].toarray()
    doc_scores = np.squeeze(dense_row)
    return top_tfidf_feats(doc_scores, features, top_n)

In [45]:
# Raw lyrics of the first song in the table.
lyrics['lyrics'].iloc[0]

"Believe me now, you're too young, girl Cherry pie with your gold curls Growin' up like a grapevine Wrapped around you in due time   Oh, oh Oh, oh   Sweet Ophelia, when young blood escapes Vows that break go up, up away Sweet Ophelia, when young blood escapes Vows that break go up, up away   Up, up away Up, up away   Singing like it's a full moon (Singing like it's a full moon) Careless now that he has you (Careless now that he has you) Turns you on to the right songs (Turns you on to the right songs) Promises that you're hooked on (Promises that you're hooked on) You might also like Oh, oh Oh, oh   Sweet Ophelia, when young blood escapes Vows that break go up, up away Sweet Ophelia, when young blood escapes Vows that break go up, up away   Up, up away Up, up away   Sweet Ophelia, when young blood escapes Vows that break go up, up away Sweet Ophelia, when young blood escapes Vows that break go up, up away Sweet Ophelia, when young blood escapes Vows that break go up, up away   Up, up a

In [46]:
# Ten highest-scoring terms for the first song.
top_feats_in_doc(
    tfidf_vector,
    tfidf_vectorizer.get_feature_names_out(),
    row_id=0,
    top_n=10,
)

Unnamed: 0,feature,tfidf
0,ophelia,0.495746
1,vows,0.401881
2,escapes,0.392721
3,away,0.327761
4,young,0.255409
5,sweet,0.226273
6,blood,0.222879
7,break,0.203568
8,oh,0.137389
9,go,0.12544


In [47]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    """Rank features by their mean score over a set of rows of ``Xtr``.

    Parameters
    ----------
    Xtr : matrix exposing ``.toarray()``
        Scores with one row per document and one column per feature.
    features : sequence
        Feature names, position-aligned with the columns of ``Xtr``.
    grp_ids : row selector or None, default None
        Applied as ``Xtr[grp_ids]``. ``None`` uses every row. Any other value,
        including a NumPy index array, row id ``0`` or an empty selector, is
        forwarded to the matrix indexer as given.
    min_tfidf : float, default 0.1
        Scores below this value are set to zero before averaging.
    top_n : int, default 25
        Number of features requested from ``top_tfidf_feats``.

    Returns
    -------
    The result of ``top_tfidf_feats`` applied to the per-feature means.
    """
    # Compare against None explicitly: ``if grp_ids:`` raises ValueError for a
    # multi-element NumPy index array and treats falsy selectors as "all rows".
    if grp_ids is None:
        D = Xtr.toarray()
    else:
        D = Xtr[grp_ids].toarray()

    # NOTE: the selected rows are fully densified here, so memory grows with
    # (rows x features).
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [49]:
# Fifty terms with the highest mean tf-idf across the whole corpus.
top_mean_feats(
    tfidf_vector,
    tfidf_vectorizer.get_feature_names_out(),
    top_n=50,
)

Unnamed: 0,feature,tfidf
0,oh,0.03648
1,ooh,0.021461
2,love,0.020695
3,yeah,0.01705
4,wanna,0.014784
5,know,0.014386
6,la,0.013679
7,go,0.011648
8,let,0.011265
9,ah,0.009949
