In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import re
import time

In [2]:
# Create DataFrame from CSV file
nlp_df = pd.read_csv('../Data/nlp_df.csv')
nlp_df = nlp_df.drop(['non_alpha_words', 'words'], axis=1)
nlp_df.head(1)

Unnamed: 0,song,song_id,artist,artist_id,category,category_id,genres,audio_ft_danceability,audio_ft_energy,audio_ft_key,...,audio_ft_acousticness,audio_ft_instrumentalness,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,audio_ft_duration_ms,audio_ft_time_signature,lyrics,filtered,language
0,willow,0lx2cLdOt3piJbcaXIV74f,Taylor Swift,06HL4z0CvFAxyc27GXpf02,pop,8,"['dance', 'pop']",0.392,0.574,7.0,...,0.833,0.00179,0.145,0.529,81.112,214707.0,4.0,Im like the water when your ship rolled in th...,"['', 'im', 'like', 'water', 'ship', 'rolled', ...","(1, 'en')"


In [3]:
# Create a list of all words, unique word counts, and filtered words
t0 = time.time()
words_list = []
unique_word_counts = []
filtered_words_list = []
for index, row in nlp_df.iterrows():
    filtered_words = row['filtered']
    filtered_words = filtered_words.replace(',', '').replace("'", '')
    filtered_words = filtered_words.replace('[', '').replace(']', '')
    filtered_words = filtered_words.replace('#', '').replace('&nbsp', '')
    filtered_words = filtered_words.replace('?', '? ').replace(',', '')
    filtered_words = filtered_words.replace('/', ' ')
    filtered_words = filtered_words.replace('\\u200a', '').replace('\\u200b', '')
    filtered_words = filtered_words.replace('\\u2063', '').replace('\u202f', '')
    filtered_words = filtered_words.replace('\\u2028', ' ').replace('\\u2008', ' ')
    while ('\\u200e' in filtered_words) or ('\\xa0' in filtered_words):
        filtered_words = filtered_words.replace('\\u200e', '')
        filtered_words = filtered_words.replace('\\xa0', '')
    filtered_words_list.append(filtered_words)
    unique_words = list(set(filtered_words.strip().split(' ')))
    unique_word_counts.append(len(unique_words))
    words_list.extend(unique_words)
word_columns = list(set(words_list))
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
len(word_columns)

Run time: 1.4278368949890137 seconds


39244

In [4]:
# Add a unique_word_count column and replace the filtered column
nlp_df['unique_word_count'] = unique_word_counts
nlp_df['filtered'] = filtered_words_list
nlp_df.head(1)

Unnamed: 0,song,song_id,artist,artist_id,category,category_id,genres,audio_ft_danceability,audio_ft_energy,audio_ft_key,...,audio_ft_instrumentalness,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,audio_ft_duration_ms,audio_ft_time_signature,lyrics,filtered,language,unique_word_count
0,willow,0lx2cLdOt3piJbcaXIV74f,Taylor Swift,06HL4z0CvFAxyc27GXpf02,pop,8,"['dance', 'pop']",0.392,0.574,7.0,...,0.00179,0.145,0.529,81.112,214707.0,4.0,Im like the water when your ship rolled in th...,im like water ship rolled night rough surface...,"(1, 'en')",87


In [5]:
# Remove songs with less than 25 unique words
nlp_df = nlp_df[nlp_df['unique_word_count']>=25]
len(nlp_df)

8115

In [6]:
# Update word list
words_list = []
for index, row in nlp_df.iterrows():
    filtered_words = row['filtered']
    unique_words = list(set(filtered_words.strip().split(' ')))
    words_list.extend(unique_words)
word_columns = list(set(words_list))
len(word_columns)

39099

# Try to remove non English songs

In [7]:
# Find songs indeces with one language that is not English
one_lang_not_en = []
mult_langs_not_en = []
for index, row in nlp_df.iterrows():
    if (row['language'][1] == '1') & (row['language'][5:7] != 'en'):
        one_lang_not_en.append(index)
    if (row['language'][1] != '1') & (row['language'][5:7] != 'en'):
        mult_langs_not_en.append(index)
print(len(one_lang_not_en))
len(mult_langs_not_en)

105


103

In [8]:
# Remove non English songs
songs_to_keep = [78, 452, 715, 878, 1633, 2150, 2402, 2407, 2427, 2526, 2835, 2979, 3232,
                 3359, 3791, 4368, 4643, 5692, 6328, 6379, 6387, 6523, 6722, 7853]
songs_to_remove = [264, 924, 1610, 1618, 2299, 2368, 3688, 5482, 5656, 5658, 5688, 5695, 
                   5710, 5715, 5719, 5720, 5750, 5792, 6385]
songs_to_remove.extend([item for item in one_lang_not_en if item not in songs_to_keep])
nlp_df = nlp_df.drop(songs_to_remove)
print(len(songs_to_remove))
len(nlp_df)

100


8015

# Examine unique word counts by genre

In [9]:
# Remove the language column
nlp_df = nlp_df.drop('language', axis=1)

In [10]:
# Drop songs with incorrect lyrics from the DataFrame
nlp_df = nlp_df[(nlp_df['unique_word_count']!=3878) & (nlp_df['unique_word_count']!=1153)]
nlp_df = nlp_df[(nlp_df['unique_word_count']!=1000) & (nlp_df['unique_word_count']!=880)]
nlp_df = nlp_df[(nlp_df['unique_word_count']!=842) & (nlp_df['unique_word_count']!=607)]
nlp_df = nlp_df[nlp_df['unique_word_count']!=569]
#nlp_df = nlp_df.drop([4242, 5672, 5666, 2443, 2460, 3510, 1374, 3539, 1237])
len(nlp_df)

8006

In [11]:
# Describe the distribution of unique word counts for each cateogry
word_distributions = {}
blues_unique_words = nlp_df[nlp_df['category']=='blues']['unique_word_count'].describe()
word_distributions['blues'] = blues_unique_words

classical_unique_words = nlp_df[nlp_df['category']=='classical']['unique_word_count'].describe()
word_distributions['classical'] = classical_unique_words

country_unique_words = nlp_df[nlp_df['category']=='country']['unique_word_count'].describe()
word_distributions['country'] = country_unique_words

funk_unique_words = nlp_df[nlp_df['category']=='funk']['unique_word_count'].describe()
word_distributions['funk'] = funk_unique_words

hiphop_unique_words = nlp_df[nlp_df['category']=='hiphop']['unique_word_count'].describe()
word_distributions['hiphop'] = hiphop_unique_words

indie_alt_unique_words = nlp_df[nlp_df['category']=='indie_alt']['unique_word_count'].describe()
word_distributions['indie_alt'] = indie_alt_unique_words

jazz_unique_words = nlp_df[nlp_df['category']=='jazz']['unique_word_count'].describe()
word_distributions['jazz'] = jazz_unique_words

metal_unique_words = nlp_df[nlp_df['category']=='metal']['unique_word_count'].describe()
word_distributions['metal'] = metal_unique_words

pop_unique_words = nlp_df[nlp_df['category']=='pop']['unique_word_count'].describe()
word_distributions['pop'] = pop_unique_words

punk_unique_words = nlp_df[nlp_df['category']=='punk']['unique_word_count'].describe()
word_distributions['punk'] = punk_unique_words

rnb_unique_words = nlp_df[nlp_df['category']=='rnb']['unique_word_count'].describe()
word_distributions['rnb'] = rnb_unique_words

rock_unique_words = nlp_df[nlp_df['category']=='rock']['unique_word_count'].describe()
word_distributions['rock'] = rock_unique_words

romance_unique_words = nlp_df[nlp_df['category']=='romance']['unique_word_count'].describe()
word_distributions['romance'] = romance_unique_words

soul_unique_words = nlp_df[nlp_df['category']=='soul']['unique_word_count'].describe()
word_distributions['soul'] = soul_unique_words
len(word_distributions)

14

In [12]:
# Print the distributions for each genre
word_distributions

{'blues': count    424.000000
 mean      53.707547
 std       19.685094
 min       25.000000
 25%       39.000000
 50%       51.000000
 75%       64.000000
 max      197.000000
 Name: unique_word_count, dtype: float64,
 'classical': count     81.000000
 mean      75.555556
 std       63.477161
 min       25.000000
 25%       44.000000
 50%       56.000000
 75%       85.000000
 max      503.000000
 Name: unique_word_count, dtype: float64,
 'country': count    1257.000000
 mean       74.820207
 std        24.007187
 min        25.000000
 25%        60.000000
 50%        73.000000
 75%        87.000000
 max       264.000000
 Name: unique_word_count, dtype: float64,
 'funk': count    281.000000
 mean      73.259786
 std       38.393733
 min       26.000000
 25%       50.000000
 50%       63.000000
 75%       84.000000
 max      257.000000
 Name: unique_word_count, dtype: float64,
 'hiphop': count    681.000000
 mean     173.947137
 std       75.049573
 min       28.000000
 25%      123.000

In [13]:
# Create a dictionary of mean unique word distributions
categories = list(word_distributions.keys())
means = []
for category in word_distributions:
    means.append(word_distributions[category]['mean'])
mean_unique_word_counts = dict(zip(categories, means))
mean_unique_word_counts = dict(sorted(mean_unique_word_counts.items(), key=lambda item: item[1], reverse=True))
mean_unique_word_counts

{'hiphop': 173.94713656387665,
 'rnb': 85.18508287292818,
 'classical': 75.55555555555556,
 'country': 74.82020684168656,
 'funk': 73.25978647686833,
 'soul': 70.64476885644768,
 'punk': 70.07029876977153,
 'pop': 69.63386396526772,
 'metal': 69.0347533632287,
 'rock': 63.481632653061226,
 'jazz': 60.515306122448976,
 'romance': 59.55913978494624,
 'indie_alt': 57.928462709284624,
 'blues': 53.70754716981132}

In [20]:
# Find songs with the highest and lowest unique word counts
print('Song with the most unique words:')
display(nlp_df[nlp_df['unique_word_count']==max(nlp_df['unique_word_count'])][['song', 'artist', 'category', 'lyrics', 'unique_word_count']])

print('Genres with songs that have the minimum unique word count of 25 words:')
print(set(nlp_df[nlp_df['unique_word_count']==min(nlp_df['unique_word_count'])]['category']), '\n')

min_word_count_df = nlp_df[nlp_df['unique_word_count']==min(nlp_df['unique_word_count'])].head()
print(f'{len(min_word_count_df)} songs have the minimum unique word count of 25 words.')


Song with the most unique words:


Unnamed: 0,song,artist,category,lyrics,unique_word_count
1268,Momentum (feat. Black Thought & Benny The Butc...,Russ,hiphop,Yeah yeah Dont compare me to employees who ...,544


Genres with songs that have the minimum unique word count of 25 words:
{'punk', 'classical', 'indie_alt', 'blues', 'country', 'rock', 'metal', 'romance'} 

5 songs have the minimum unique word count of 25 words.


In [21]:
# Create DataFrame of unique word count distributions
ranges = []
for category in word_distributions:
    ranges.append(word_distributions[category]['max'] - word_distributions[category]['min'])

columns = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'range']
df_rows = []
for index, category in enumerate(word_distributions):
    distribution = list(word_distributions[category])
    distribution.extend([ranges[index]])
    df_rows.append(distribution)
word_count_df = pd.DataFrame(df_rows, index=list(word_distributions.keys()), columns=columns)
word_count_df

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,range
blues,424.0,53.707547,19.685094,25.0,39.0,51.0,64.0,197.0,172.0
classical,81.0,75.555556,63.477161,25.0,44.0,56.0,85.0,503.0,478.0
country,1257.0,74.820207,24.007187,25.0,60.0,73.0,87.0,264.0,239.0
funk,281.0,73.259786,38.393733,26.0,50.0,63.0,84.0,257.0,231.0
hiphop,681.0,173.947137,75.049573,28.0,123.0,166.0,216.0,544.0,516.0
indie_alt,657.0,57.928463,24.795432,25.0,41.0,54.0,68.0,253.0,228.0
jazz,196.0,60.515306,42.296448,26.0,39.0,51.0,65.0,349.0,323.0
metal,892.0,69.034753,26.674618,25.0,51.0,64.0,82.0,220.0,195.0
pop,691.0,69.633864,30.84948,26.0,52.5,64.0,78.0,360.0,334.0
punk,569.0,70.070299,30.317486,25.0,51.0,66.0,82.0,315.0,290.0


In [22]:
# Save DataFrame to CSV
word_count_df.to_csv('../Data/word_count.csv')

# Conclusions about unique word distributions by genre
- The hiphop genre has the highest mean unique word count of 176 words
- The blues genre has the lowest mean unique word count of 54 words
- Genres with the highest average unique word counts:
    - Hiphop: 174 unique words
    - R&B: 85 unique words
    - Classical: 76 unique words
- Genres with the lowest average unique word counts:
    - Blues: 54 unique words
    - Indie-alt: 58 unique words
    - Jazz: 61 unique words
- The song with the highest unique word count is in the romance category:
    - Song name: Momentum 
    - Artist: Russ
    - Unique word count: 544 words
- The indie-alt genre had the most songs (8) with the minimum unique word count of 25 words
    - The rock genre had the second most songs (5) with the minimum unique word count
- The hiphop, R&B, funk, soul, pop, and jazz genres had the least songs (0) with the minimum unique word count of 25 words
    - The classical and punk genres had the second least songs (1) with the minimum unique word count