In [12]:
# Import dependencies
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [13]:
# Create DataFrame from CSV file
music_df = pd.read_csv('nlp_df.csv')
music_df.head()

Unnamed: 0.1,Unnamed: 0,song,artist,category,lyrics,words,filtered,hashedValues,features
0,0,Monster (Shawn Mendes & Justin Bieber),Shawn Mendes,34,You put me on a pedestal and tell me Im the b...,"['', 'you', 'put', 'me', 'on', 'a', 'pedestal'...","['', 'put', 'pedestal', 'tell', 'im', 'best', ...","(262144,[3386,3924,18184,21823,30548,31015,375...","(262144,[3386,3924,18184,21823,30548,31015,375..."
1,1,positions,Ariana Grande,34,Heaven sent you to me Im just hopin I dont re...,"['', 'heaven', 'sent', 'you', 'to', 'me', 'im'...","['', 'heaven', 'sent', 'im', 'hopin', 'dont', ...","(262144,[7231,21823,31015,39504,51471,55334,64...","(262144,[7231,21823,31015,39504,51471,55334,64..."
2,2,Therefore I Am,Billie Eilish,34,Im not your friend Or anything damn You think...,"['', 'im', 'not', 'your', 'friend', 'or', 'any...","['', 'im', 'friend', 'anything', 'damn', 'thin...","(262144,[18176,22346,23071,23087,24175,31015,3...","(262144,[18176,22346,23071,23087,24175,31015,3..."
3,3,Levitating (feat. DaBaby),Dua Lipa,34,Billboard Baby Dua Lipa los hace bailar cuand...,"['', 'billboard', 'baby', 'dua', 'lipa', 'los'...","['', 'billboard', 'baby', 'dua', 'lipa', 'los'...","(262144,[522,764,1212,2298,7188,7913,7975,8884...","(262144,[522,764,1212,2298,7188,7913,7975,8884..."
4,4,Dakiti,Bad Bunny,34,Baby ya yo me enteré se nota cuando me ve Ahí...,"['', 'baby', 'ya', 'yo', 'me', 'enteré', 'se',...","['', 'baby', 'ya', 'yo', 'enteré', 'se', 'nota...","(262144,[764,1781,2409,3165,5555,6523,9192,125...","(262144,[764,1781,2409,3165,5555,6523,9192,125..."


In [34]:
# Try to remove songs with spanish lyrics
for index, row in music_df.iterrows():
    if ('é' in row['lyrics']) or ('í' in row['lyrics']):
        music_df = music_df.drop(index)
music_df.head()

Unnamed: 0.1,Unnamed: 0,song,artist,category,lyrics,words,filtered,hashedValues,features
0,0,Monster (Shawn Mendes & Justin Bieber),Shawn Mendes,34,You put me on a pedestal and tell me Im the b...,"['', 'you', 'put', 'me', 'on', 'a', 'pedestal'...","['', 'put', 'pedestal', 'tell', 'im', 'best', ...","(262144,[3386,3924,18184,21823,30548,31015,375...","(262144,[3386,3924,18184,21823,30548,31015,375..."
1,1,positions,Ariana Grande,34,Heaven sent you to me Im just hopin I dont re...,"['', 'heaven', 'sent', 'you', 'to', 'me', 'im'...","['', 'heaven', 'sent', 'im', 'hopin', 'dont', ...","(262144,[7231,21823,31015,39504,51471,55334,64...","(262144,[7231,21823,31015,39504,51471,55334,64..."
2,2,Therefore I Am,Billie Eilish,34,Im not your friend Or anything damn You think...,"['', 'im', 'not', 'your', 'friend', 'or', 'any...","['', 'im', 'friend', 'anything', 'damn', 'thin...","(262144,[18176,22346,23071,23087,24175,31015,3...","(262144,[18176,22346,23071,23087,24175,31015,3..."
5,5,Errbody,Lil Baby,34,Flyer than everybody Section 8 just straight ...,"['', 'flyer', 'than', 'everybody', 'section', ...","['', 'flyer', 'everybody', 'section', '8', 'st...","(262144,[1303,1398,1745,3280,4372,4978,6715,82...","(262144,[1303,1398,1745,3280,4372,4978,6715,82..."
6,6,Whoopty,CJ,34,Loyalty over royalty yall niggas know the vib...,"['', 'loyalty', 'over', 'royalty', 'yall', 'ni...","['', 'loyalty', 'royalty', 'yall', 'niggas', '...","(262144,[1689,3186,6116,6524,8538,10345,11209,...","(262144,[1689,3186,6116,6524,8538,10345,11209,..."


In [45]:
# Create a list of words 
words_list = []
for index, row in music_df.iterrows():
    filtered_words = music_df['filtered'][index]
    filtered_words = filtered_words.replace(',', '').replace("'", '')
    filtered_words = filtered_words.replace('[', '').replace(']', '')
    unique_words = list(set(filtered_words.strip().split(' ')))
    words_list.extend(unique_words)
word_columns = list(set(words_list))
len(word_columns)

89978

In [47]:
# Create DataFrame for ML model
columns = ['song', 'artist']
columns.extend(word_columns)
columns.append('category')
song_words_df = pd.DataFrame(columns=columns)
for index, row in music_df.iterrows():
    song_words_df.append(row)
    filtered_words = music_df['filtered'][index]

song_words_df.head()

Unnamed: 0,song,artist,Unnamed: 3,bison,incedebant.,ال,emurgency,signification—,lakers,finges,...,spicily,curses,girdi,sweating,barth,sobol,jun...ill,allswill,lusts,category


In [14]:
music_df['filtered'][0]

"['', 'put', 'pedestal', 'tell', 'im', 'best', 'raise', 'sky', 'im', 'short', 'breath', 'yeah', 'fill', 'confidence', 'say', 'whats', 'chest', 'spill', 'words', 'tear', 'theres', 'nothin', 'left', 'rearrange', 'pieces', 'fit', 'rest', 'yeah', 'trip', 'fall', 'monster', 'let', 'know', 'sin', 'break', 'yeah', 'monster', 'yeah', 'let', 'know', 'yeah', 'fifteen', 'world', 'put', 'pedestal', 'big', 'dreams', 'doin', 'shows', 'making', 'memories', 'made', 'bad', 'moves', 'tryin', 'act', 'cool', 'upset', 'jealousy', 'uh', 'liftin', 'liftin', 'liftin', 'yeah', 'tearin', 'tearin', 'yeah', 'ill', 'take', 'responsibility', 'everything', 'ive', 'done', 'yeah', 'holdin', 'like', 'youre', 'holy', 'one', 'yeah', 'chip', 'shoulder', 'let', 'go', 'cause', 'unforgiveness', 'keeps', 'control', 'came', 'good', 'intentions', 'let', 'go', 'really', 'wanna', 'know', 'trip', 'oh', 'fall', 'fall', 'monster', 'monster', 'let', 'know', 'let', 'know', 'sin', 'oh', 'break', 'yeah', 'break', 'monster', 'monster', '

In [10]:
music_df['hashedValues'][0]

'(262144,[3386,3924,18184,21823,30548,31015,37521,48531,55639,56998,60080,62058,65298,65844,68847,71619,72597,73199,77751,77772,81662,82065,85530,87273,87623,90468,97280,113299,113432,123940,124348,126368,140286,140762,140784,140931,145126,145380,148081,148675,150152,151864,152982,155889,157377,159534,160334,165688,166027,166368,167440,168828,170414,171222,172247,172888,173339,175446,176260,176996,184868,188565,191497,197126,200147,203458,208258,208792,209749,218192,223619,229264,231139,233391,233762,234620,235248,237465,241651,248069,248097,249180,258731],[4.0,1.0,4.0,1.0,3.0,2.0,6.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,11.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,18.0,8.0,1.0,7.0,1.0,1.0,1.0,2.0,1.0,1.0,20.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,6.0,1.0,1.0,1.0,1.0,1.0,10.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])'

In [16]:
filtered_words = music_df['filtered'][0]
filtered_words = filtered_words.replace(',', '').replace("'", '')
filtered_words = filtered_words.replace('[', '').replace(']', '')
filtered_words

' put pedestal tell im best raise sky im short breath yeah fill confidence say whats chest spill words tear theres nothin left rearrange pieces fit rest yeah trip fall monster let know sin break yeah monster yeah let know yeah fifteen world put pedestal big dreams doin shows making memories made bad moves tryin act cool upset jealousy uh liftin liftin liftin yeah tearin tearin yeah ill take responsibility everything ive done yeah holdin like youre holy one yeah chip shoulder let go cause unforgiveness keeps control came good intentions let go really wanna know trip oh fall fall monster monster let know let know sin oh break yeah break monster monster let know oh please let know yeah la da da duh duh baby fall la da da duh duh duh na la da da da duh duh la da da duh duh duh na la da da duh duh please dont let fall la da da duh duh duh na la da da da duh duh oh please dont let fall la da da duh duh duh na'

In [28]:
unique_words = list(set(filtered_words.strip().split(' ')))
words_list = list(filtered_words.strip().split(' '))
print(f'There are {len(unique_words)} unique words.')
print(f'There are {len(words_list)} words.')

There are 82 unique words.
There are 180 words.


In [None]:
# Put the second element of the hashedValues column into rows of a new DataFrame

In [None]:
music_df['hashedValues'][0]

In [11]:
music_df.dtypes

Unnamed: 0       int64
song            object
artist          object
category        object
lyrics          object
words           object
filtered        object
hashedValues    object
features        object
dtype: object

In [None]:
# Transform the filtered column into an array and make columns with term fequencies

In [None]:
# Make the song name be the index of the DataFrame