In [185]:
# Import dependencies
import pandas as pd
import numpy as np
import time
import re

In [186]:
# Create DataFrame from CSV file
nlp_df = pd.read_csv('../../Data/nlp_df.csv')
print(len(nlp_df))
nlp_df.head(3)

8350


Unnamed: 0,song,song_id,artist,artist_id,category,category_id,popularity,genres,audio_ft_danceability,audio_ft_energy,...,audio_ft_instrumentalness,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,audio_ft_duration_ms,audio_ft_time_signature,lyrics,non_alpha_words,filtered,language
0,willow,0lx2cLdOt3piJbcaXIV74f,Taylor Swift,06HL4z0CvFAxyc27GXpf02,pop,8,93,"['dance', 'pop']",0.392,0.574,...,0.00179,0.145,0.529,81.112,214707.0,4.0,Im like the water when your ship rolled in th...,1,"['', 'im', 'like', 'water', 'ship', 'rolled', ...","(1, 'en')"
1,Stay Next To Me (with Chelsea Cutler),6SGG5AxHShqSYiV9fCWpZz,Quinn XCII,3ApUX1o6oSz321MMECyIYd,pop,8,78,"['indie', 'pop', 'electropop']",0.581,0.584,...,0.0,0.366,0.756,179.954,206046.0,4.0,Didnt even wanna go out whyd you call me ? Iv...,1,"['', 'didnt', 'even', 'wanna', 'go', 'whyd', '...","(1, 'en')"
2,WITHOUT YOU,27OeeYzk6klgBh83TSvGMA,The Kid LAROI,2tIP7SsRs7vjIcLrU85W8J,pop,8,95,['australian'],0.662,0.413,...,0.0,0.134,0.467,93.005,161385.0,4.0,You cut out a piece of me and now I bleed int...,1,"['', 'cut', 'piece', 'bleed', 'internally', 'l...","(1, 'en')"


In [187]:
# Create a list of all words, unique word counts, and filtered words
words_list = []
unique_word_counts = []
filtered_words_list = []
for index, row in nlp_df.iterrows():
    filtered_words = row['filtered']
    filtered_words = filtered_words.replace(',', '').replace("'", '')
    filtered_words = filtered_words.replace('[', '').replace(']', '')
    filtered_words = filtered_words.replace('#', '').replace('&nbsp', '')
    filtered_words = filtered_words.replace('?', '? ').replace(',', '')
    filtered_words = filtered_words.replace('/', ' ')
    filtered_words = filtered_words.replace('\\u200a', '').replace('\\u200b', '')
    filtered_words = filtered_words.replace('\\u2063', '').replace('\u202f', '')
    filtered_words = filtered_words.replace('\\u2028', ' ').replace('\\u2008', ' ')
    while ('\\u200e' in filtered_words) or ('\\xa0' in filtered_words):
        filtered_words = filtered_words.replace('\\u200e', '')
        filtered_words = filtered_words.replace('\\xa0', '')
    filtered_words_list.append(filtered_words)
    unique_words = list(set(filtered_words.strip().split(' ')))
    unique_word_counts.append(len(unique_words))
    words_list.extend(unique_words)
word_columns = list(set(words_list))
len(word_columns)

36234

In [188]:
# Add a unique_word_count column and replace the filtered column
nlp_df['unique_word_count'] = unique_word_counts
nlp_df['filtered'] = filtered_words_list
nlp_df.head(1)

Unnamed: 0,song,song_id,artist,artist_id,category,category_id,popularity,genres,audio_ft_danceability,audio_ft_energy,...,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,audio_ft_duration_ms,audio_ft_time_signature,lyrics,non_alpha_words,filtered,language,unique_word_count
0,willow,0lx2cLdOt3piJbcaXIV74f,Taylor Swift,06HL4z0CvFAxyc27GXpf02,pop,8,93,"['dance', 'pop']",0.392,0.574,...,0.145,0.529,81.112,214707.0,4.0,Im like the water when your ship rolled in th...,1,im like water ship rolled night rough surface...,"(1, 'en')",87


In [189]:
# Remove songs with less than 25 unique words
count = len(nlp_df[nlp_df['unique_word_count']<25])
nlp_df = nlp_df[nlp_df['unique_word_count']>=25]
print(f'There were {count} songs with less than 25 unique words.')
len(nlp_df)

There were 309 songs with less than 25 unique words.


8041

In [190]:
# Update word list
words_list = []
for index, row in nlp_df.iterrows():
    filtered_words = row['filtered']
    unique_words = list(set(filtered_words.strip().split(' ')))
    words_list.extend(unique_words)
word_columns = list(set(words_list))
len(word_columns)

36113

In [191]:
# Remove non alphabetic words from the word columns
non_alpha_words = []
for word in word_columns:
    if not re.match(r'[a-zA-Z0-9?!/$&+]+$', word):
        non_alpha_words.append(word)
        word_columns.remove(word)
print(len(non_alpha_words))
len(word_columns)

887


35226

# Create new DataFrame for ML model

In [192]:
# Create DataFrame for the ML model
t0 = time.time()
columns = ['song_name', 'artist_name', 'category_name', 'category_id', 'genre_list', 
           'audio_ft_danceability', 'audio_ft_energy', 'audio_ft_key', 
           'audio_ft_loudness', 'audio_ft_mode', 'audio_ft_speechiness', 
           'audio_ft_acousticness', 'audio_ft_instrumentalness', 'audio_ft_liveness',
           'audio_ft_valence', 'audio_ft_tempo', 'audio_ft_duration_ms', 
           'audio_ft_time_signature']
columns.extend(word_columns)
lyric_TF_df = pd.DataFrame(columns=columns)
lyric_TF_df['song_name'] = nlp_df['song']
lyric_TF_df['artist_name'] = nlp_df['artist']
lyric_TF_df['category_name'] = nlp_df['category']
lyric_TF_df['category_id'] = nlp_df['category_id']
lyric_TF_df['genre_list'] = nlp_df['genres']
lyric_TF_df['audio_ft_danceability'] = nlp_df['audio_ft_danceability']
lyric_TF_df['audio_ft_energy'] = nlp_df['audio_ft_energy']
lyric_TF_df['audio_ft_key'] = nlp_df['audio_ft_key']
lyric_TF_df['audio_ft_loudness'] = nlp_df['audio_ft_loudness']
lyric_TF_df['audio_ft_mode'] = nlp_df['audio_ft_mode']
lyric_TF_df['audio_ft_speechiness'] = nlp_df['audio_ft_speechiness']
lyric_TF_df['audio_ft_acousticness'] = nlp_df['audio_ft_acousticness']
lyric_TF_df['audio_ft_instrumentalness'] = nlp_df['audio_ft_instrumentalness']
lyric_TF_df['audio_ft_liveness'] = nlp_df['audio_ft_liveness']
lyric_TF_df['audio_ft_valence'] = nlp_df['audio_ft_valence']
lyric_TF_df['audio_ft_tempo'] = nlp_df['audio_ft_tempo']
lyric_TF_df['audio_ft_duration_ms'] = nlp_df['audio_ft_duration_ms']
lyric_TF_df['audio_ft_time_signature'] = nlp_df['audio_ft_time_signature']
lyric_TF_df = lyric_TF_df.fillna(0)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
lyric_TF_df.head(3)

Run time: 128.54848670959473 seconds


Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_loudness,audio_ft_mode,...,tiananmen,seagrams,ragin,bitter,staff,wellve,gunners,eighth,flushing,formby
0,willow,Taylor Swift,pop,8,"['dance', 'pop']",0.392,0.574,7.0,-9.195,1.0,...,0,0,0,0,0,0,0,0,0,0
1,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8,"['indie', 'pop', 'electropop']",0.581,0.584,2.0,-4.928,1.0,...,0,0,0,0,0,0,0,0,0,0
2,WITHOUT YOU,The Kid LAROI,pop,8,['australian'],0.662,0.413,0.0,-7.357,1.0,...,0,0,0,0,0,0,0,0,0,0


In [193]:
# Add the term frequencies to the DataFrame
t0 = time.time()
t2 = time.time()
exceptions_list = []
count = 0
for index, row in lyric_TF_df.iterrows():
    for word in nlp_df['filtered'][index].strip().split(' '):
        if word not in non_alpha_words:
            try:
                lyric_TF_df.loc[index, word]+=1
            except(Exception) as e:
                if len(word) > 1:
                    exceptions_list.append(word)
    if (index+1) % 500 == 0:
        print(f'Run time for words {count*500}-{(count+1)*500}: {time.time()-t2} seconds')
        count+=1
        t2 = time.time()
t1 = time.time()
print(f'Total run time: {t1-t0} seconds')
print(len(exceptions_list))
lyric_TF_df.head(3)

Run time for words 0-500: 48.70390295982361 seconds
Run time for words 500-1000: 42.76196813583374 seconds
Run time for words 1000-1500: 52.21132802963257 seconds
Run time for words 1500-2000: 23.71443796157837 seconds
Run time for words 2000-2500: 25.124420166015625 seconds
Run time for words 2500-3000: 26.631723165512085 seconds
Run time for words 3000-3500: 26.81009292602539 seconds
Run time for words 3500-4000: 28.26827073097229 seconds
Run time for words 4000-4500: 35.62701725959778 seconds
Run time for words 4500-5000: 28.63114094734192 seconds
Run time for words 5000-5500: 23.340614080429077 seconds
Run time for words 5500-6000: 26.59325385093689 seconds
Run time for words 6000-6500: 30.357728958129883 seconds
Run time for words 6500-7000: 32.97431492805481 seconds
Run time for words 7000-7500: 30.38162112236023 seconds
Run time for words 7500-8000: 32.84066820144653 seconds
Total run time: 538.6252443790436 seconds
0


Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_loudness,audio_ft_mode,...,tiananmen,seagrams,ragin,bitter,staff,wellve,gunners,eighth,flushing,formby
0,willow,Taylor Swift,pop,8,"['dance', 'pop']",0.392,0.574,7.0,-9.195,1.0,...,0,0,0,0,0,0,0,0,0,0
1,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8,"['indie', 'pop', 'electropop']",0.581,0.584,2.0,-4.928,1.0,...,0,0,0,0,0,0,0,0,0,0
2,WITHOUT YOU,The Kid LAROI,pop,8,['australian'],0.662,0.413,0.0,-7.357,1.0,...,0,0,0,0,0,0,0,0,0,0


In [194]:
# Remove columns of words that only appear once
lyric_TF_df.loc[-1] = lyric_TF_df.sum(numeric_only=True)
lyric_TF_df.index = lyric_TF_df.index + 1  
lyric_TF_df = lyric_TF_df.sort_index()
lyric_TF_df = lyric_TF_df.drop(columns=lyric_TF_df.columns[(lyric_TF_df[0:1] == 1).any()])    
lyric_TF_df.head(3)

Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_loudness,audio_ft_mode,...,sufficient,girly,reek,duffel,apartheid,unrestrained,bitter,staff,wellve,eighth
0,,,,53468.0,,4528.6433,5362.4272,42651.0,-60641.618,5517.0,...,4.0,14.0,6.0,13.0,2.0,2.0,143.0,12.0,2.0,5.0
1,willow,Taylor Swift,pop,8.0,"['dance', 'pop']",0.392,0.574,7.0,-9.195,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8.0,"['indie', 'pop', 'electropop']",0.581,0.584,2.0,-4.928,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [195]:
# Create filtered DataFrame with columns of words that appear at least 4 times
filtered_lyric_TF_df = lyric_TF_df.drop(columns=lyric_TF_df.columns[(lyric_TF_df[0:1]<4).any()])    
filtered_lyric_TF_df.head(3)

Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,...,professed,plottin,sideline,sufficient,girly,reek,duffel,bitter,staff,eighth
0,,,,53468.0,,4528.6433,5362.4272,42651.0,5517.0,665.7547,...,4.0,12.0,6.0,4.0,14.0,6.0,13.0,143.0,12.0,5.0
1,willow,Taylor Swift,pop,8.0,"['dance', 'pop']",0.392,0.574,7.0,1.0,0.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8.0,"['indie', 'pop', 'electropop']",0.581,0.584,2.0,1.0,0.284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [196]:
# Drop columns of words that are used in less than 4 songs
filtered_lyric_TF_df.loc[-1] = filtered_lyric_TF_df.astype(bool).sum(axis=0)
filtered_lyric_TF_df.index = filtered_lyric_TF_df.index + 1
filtered_lyric_TF_df = filtered_lyric_TF_df.sort_index()
filtered_lyric_TF_df = filtered_lyric_TF_df.drop(columns=filtered_lyric_TF_df.columns[(filtered_lyric_TF_df[0:1]<4).any()])    
filtered_lyric_TF_df.head(3)

Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,...,professed,plottin,sideline,sufficient,girly,reek,duffel,bitter,staff,eighth
0,8042,8042,8042,7618.0,8042,8042.0,8042.0,7175.0,5518.0,8042.0,...,4.0,9.0,4.0,5.0,4.0,7.0,4.0,90.0,9.0,6.0
1,,,,53468.0,,4528.6433,5362.4272,42651.0,5517.0,665.7547,...,4.0,12.0,6.0,4.0,14.0,6.0,13.0,143.0,12.0,5.0
2,willow,Taylor Swift,pop,8.0,"['dance', 'pop']",0.392,0.574,7.0,1.0,0.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [198]:
# Save DataFrames to CSV
t0 = time.time()
filtered_lyric_TF_df.to_csv('../../Data/filtered_lyric_TF.csv', index=False)
t1= time.time()
print(f'Run time: {t1-t0} seconds')

t0 = time.time()
lyric_TF_df.to_csv('../../Data/lyric_TF.csv', index=False)
t1= time.time()
print(f'Run time: {t1-t0} seconds')

Run time: 148.65038418769836 seconds
Run time: 270.3675799369812 seconds
