# Clean Combined Dataset

Remove as many "non-vibe" words as possible from the playlist names, then count how often each remaining word occurs.

In [1]:
import pandas as pd
import os
import json
import copy
import datetime

import re


## Load combined dataset

In [2]:
# Build the path to the pickled tracks/playlists DataFrame
# (resolved against the current working directory).
dataset_dir = os.path.join(
    '..', '..', 'datasets', 'tracks_playlist_dataset',
)
df_file_path = os.path.join(dataset_dir, 'tracks_playlists_df.pkl')

# NOTE(review): unpickling can execute arbitrary code -- only load this
# file if it comes from a trusted source.
df = pd.read_pickle(df_file_path)

# Preview the first rows.
df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,playlist_names
0,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,0.442,0.632,1,...,1,0.0295,0.426,0.00419,0.0735,0.196,78.899,4,acoustic,"high, high, AUTUMN, Vampire Diaries, sleep, i ..."
1,1KHdq8NK9QxnGjdXb55NiG,Landon Pigg,The Boy Who Never,Falling in Love at a Coffee Shop,58,244986,False,0.489,0.561,4,...,1,0.0274,0.2,4.6e-05,0.179,0.238,83.457,3,acoustic,"Say You Won't Let Go, mellow, Dance, Chillin, ..."
2,2qLMf6TuEC3ruGJg4SMMN6,Jason Mraz;Colbie Caillat,We Sing. We Dance. We Steal Things.,Lucky,68,189613,False,0.625,0.414,0,...,1,0.0369,0.294,0.0,0.151,0.669,130.088,4,acoustic,"Wedding, #boostyourrun, go to, Acoustic, 😍😍😍, ..."
3,3S0OXQeoh0w6AY8WQVckRW,Jason Mraz,We Sing. We Dance. We Steal Things.,I'm Yours,75,242946,False,0.703,0.444,11,...,1,0.0417,0.559,0.0,0.0973,0.712,150.96,4,acoustic,"tb, Catchy Songs, #boostyourrun, go to, Atlas,..."
4,5TvE3pk05pyFIGdSY9j4DJ,A Great Big World;Christina Aguilera,Is There Anybody Out There? - Track by Track C...,Say Something,70,229400,False,0.407,0.147,2,...,1,0.0355,0.857,3e-06,0.0913,0.0765,141.284,3,acoustic,"~Rando~, go to, Solitude, Acoustic, happy, yo,..."


In [3]:
# Number of rows in the loaded DataFrame.
len(df)

7560

## Analyze raw words in playlist names

In [4]:
# Pick one track and split its comma-joined playlist names into a list.
track_id = '3S0OXQeoh0w6AY8WQVckRW'

# Boolean row mask; named `track_mask` so the built-in `filter` is not
# shadowed for the rest of the kernel session.
track_mask = df['track_id'] == track_id
row = df[track_mask].iloc[0]  # first matching row

# Splitting on ',' alone keeps the space that follows each comma.
playlist_names = row['playlist_names'].split(',')

In [5]:
# First ten raw playlist names for the selected track.
playlist_names[0:10]

['tb',
 ' Catchy Songs',
 ' #boostyourrun',
 ' go to',
 ' Atlas',
 ' throwback',
 ' Acoustic',
 ' ((chris))',
 ' throw backs',
 ' Throwbacks ']

In [6]:
# Last ten raw playlist names for the selected track.
playlist_names[-10:]

[' Shower',
 ' throwback ',
 ' Stuff I like',
 ' Classics',
 ' good times',
 ' Throwback',
 ' Songs that never fail to make white people beyond turnt',
 ' kareoke',
 ' I love You',
 ' Lake']

In [7]:
# Same inspection for a second track.
track_id = '5TvE3pk05pyFIGdSY9j4DJ'

# Boolean row mask; named `track_mask` so the built-in `filter` is not
# shadowed for the rest of the kernel session.
track_mask = df['track_id'] == track_id
row = df[track_mask].iloc[0]  # first matching row

# Splitting on ',' alone keeps the space that follows each comma.
playlist_names = row['playlist_names'].split(',')

In [8]:
# First ten raw playlist names for the second track.
playlist_names[0:10]

['~Rando~',
 ' go to',
 ' Solitude',
 ' Acoustic',
 ' happy',
 ' yo',
 ' my heart',
 ' Isis',
 ' Top Hits',
 ' Mya']

In [9]:
# Last ten raw playlist names for the second track.
playlist_names[-10:]

[' Depressing songs',
 ' Easy Listening',
 ' GRAD',
 ' L.o.v.e',
 ' Ballads',
 ' Inside Out: So Emotional',
 ' Slow',
 ' feels',
 ' Sleep',
 ' sad times']

## Clean playlist names

In [10]:
# Articles
articles = ["a", "an", "the"]

# Common prepositions.
# A previous, commented-out version of this list additionally contained
# "opposite", "past", "save", "toward", "towards", "under", "underneath",
# "unlike", "until", "up" and "upon"; those words are not filtered here.
prepositions = [
    "about", "above", "across", "after", "against",
    "along", "among", "around", "at", "before",
    "behind", "below", "beneath", "beside", "besides",
    "between", "beyond", "but", "by", "concerning",
    "considering", "despite", "down", "during", "except",
    "excepting", "excluding", "following", "for", "from",
    "in", "inside", "into", "like", "minus",
    "near", "of", "off", "on", "onto",
    "outside", "over", "per", "plus", "regarding",
    "round", "since", "than", "through", "to",
    "versus", "via", "with", "within", "without",
]

# Pronouns, flattened from one tuple per grammatical category. Words that
# belong to several categories (e.g. "her", "that", "which") are repeated,
# so the resulting list contains duplicates.
# NOTE(review): clean_playlist_names compares single space-separated tokens,
# so the two-word entry "no one" presumably never matches -- confirm.
pronouns = [
    word
    for group in (
        # Personal
        ("i", "you", "he", "she", "it", "we", "they",
         "me", "him", "her", "us", "them"),
        # Possessive
        ("my", "mine", "your", "yours", "his", "her", "hers",
         "its", "our", "ours", "their", "theirs"),
        # Reflexive
        ("myself", "yourself", "himself", "herself", "itself",
         "ourselves", "yourselves", "themselves"),
        # Demonstrative
        ("this", "that", "these", "those"),
        # Relative
        ("who", "whom", "whose", "which", "that"),
        # Interrogative
        ("what", "which", "who", "whom", "whose"),
        # Indefinite
        ("anybody", "anyone", "anything", "each", "either",
         "everybody", "everyone", "everything",
         "neither", "nobody", "no one", "nothing", "one",
         "somebody", "someone", "something",
         "both", "few", "many", "several", "all", "any",
         "most", "none", "some"),
    )
    for word in group
]


In [11]:
# Words that merely mean "music" / "songs" / "playlist" and are removed
# from playlist names.
# A previous, commented-out version of this list additionally contained
# "latest", "classic", "classics" and "oldies" among the genre meta-words;
# those words are not filtered here.
music_stopwords = [
    word
    for group in (
        # General music terms
        ("music", "song", "songs", "track", "tracks", "tune", "tunes",
         "melody", "melodies", "rhythm", "harmony", "lyrics"),
        # Album / playlist words
        ("playlist", "mix", "compilation", "collection", "set",
         "jam", "jams", "record", "records", "album", "albums",
         "single", "singles", "ep", "lp"),
        # Performance terms
        ("band", "bands", "group", "groups", "orchestra", "choir",
         "ensemble", "performance", "performances", "concert", "gig",
         "show", "live"),
        # Listening context
        ("listen", "listening", "play", "played", "plays", "playing",
         "sound", "sounds", "audio"),
        # Time/context in music
        ("remix", "remixes", "cover", "covers", "version", "versions",
         "original", "edit", "edits", "demo", "demos"),
        # Streaming platform common words
        ("radio", "station", "stations", "session", "sessions"),
        # Music role terms
        ("dj", "producer", "production", "artist", "artists",
         "musician", "musicians"),
        # Genre meta-words (not actual genres)
        ("hit", "hits", "chart", "charts", "top", "best", "greatest",
         "favorites", "favourite", "new"),
    )
    for word in group
]

In [12]:
# Generic "feeling" words to exclude from playlist names.
# A previous, commented-out candidate list was much broader (for example
# "sentiments", "temper", "outlook", "spirit", "tone", "ambience",
# "atmosphere", "energy", "aura", "humor", "air"); none of those extra
# words are filtered here.
emotion_words = [
    "emotions", "emotion", "feelings", "feeling",
    "attitude", "attitudes", "vibe", "vibes",
    "feel", "feels", "headspace", "character",
    "mood", "moody",
]

In [13]:
def clean_playlist_names(track_id):
    """Return the cleaned, lower-cased words from one track's playlist names.

    Looks up the first row of the module-level ``df`` whose ``track_id``
    column equals ``track_id``, splits its ``playlist_names`` string on
    commas and tokenises every name: lower-case it, delete every character
    that is neither a word character nor whitespace (punctuation, emojis),
    delete digits and underscores, then split on runs of whitespace.
    Deleted characters are removed without inserting a space, so ``"r&b"``
    becomes ``"rb"``.

    A token is dropped when it is at most one character long or appears in
    the module-level ``articles``, ``prepositions``, ``pronouns``,
    ``music_stopwords`` or ``emotion_words`` collections.

    Parameters
    ----------
    track_id : str
        Value to match against ``df['track_id']``.

    Returns
    -------
    list of str
        Surviving words in order of appearance. Duplicates are kept so that
        callers can count occurrences. Empty when the row's
        ``playlist_names`` value is not a string (e.g. missing / NaN).

    Raises
    ------
    IndexError
        If no row of ``df`` has the given ``track_id``.
    """
    # Named `track_mask` so the built-in `filter` is not shadowed.
    track_mask = df['track_id'] == track_id
    row = df[track_mask].iloc[0]

    raw_names = row['playlist_names']
    if not isinstance(raw_names, str):
        # No playlist names recorded for this track: nothing to tokenise.
        return []

    # One set gives O(1) lookups instead of five list scans per token. It is
    # rebuilt on every call so later edits to the word lists take effect.
    excluded = set().union(
        articles, prepositions, pronouns, music_stopwords, emotion_words
    )

    playlist_words = []
    for name in raw_names.split(','):
        # convert to lower case
        text = name.lower()

        # remove symbols and emojis
        text = re.sub(r"[^\w\s]", "", text, flags=re.UNICODE)

        # remove all digits
        text = re.sub(r'\d+', '', text)

        # `\w` matches '_' too, so underscores are removed separately
        text = text.replace("_", "")

        # split() without an argument breaks on any run of whitespace (tabs,
        # newlines, non-breaking spaces), not just a single ' ', and never
        # yields empty strings.
        for word in text.split():
            if len(word) > 1 and word not in excluded:
                playlist_words.append(word)

    return playlist_words

In [14]:
# NOTE: `track_id` is not assigned in this cell. When the notebook is run top
# to bottom it still holds the value set in the earlier exploration cell
# ('5TvE3pk05pyFIGdSY9j4DJ').
playlist_words = clean_playlist_names(track_id=track_id)

In [15]:
# First twenty cleaned words.
playlist_words[:20]

['rando',
 'go',
 'solitude',
 'acoustic',
 'happy',
 'yo',
 'heart',
 'isis',
 'mya',
 'hayley',
 'chill',
 'chilly',
 'other',
 'breathe',
 'jens',
 'fallen',
 'run',
 'sad',
 'quiet',
 'pure']

In [16]:
# Last twenty cleaned words.
playlist_words[-20:]

['jared',
 'cry',
 'love',
 'together',
 'confidence',
 'ds',
 'let',
 'go',
 'depressing',
 'easy',
 'grad',
 'love',
 'ballads',
 'out',
 'so',
 'emotional',
 'slow',
 'sleep',
 'sad',
 'times']

In [17]:
# Clean the playlist names of one track, passing its id explicitly.
playlist_words = clean_playlist_names(track_id='5TvE3pk05pyFIGdSY9j4DJ')

In [18]:
# First twenty cleaned words.
playlist_words[:20]

['rando',
 'go',
 'solitude',
 'acoustic',
 'happy',
 'yo',
 'heart',
 'isis',
 'mya',
 'hayley',
 'chill',
 'chilly',
 'other',
 'breathe',
 'jens',
 'fallen',
 'run',
 'sad',
 'quiet',
 'pure']

In [19]:
# Last twenty cleaned words.
playlist_words[-20:]

['jared',
 'cry',
 'love',
 'together',
 'confidence',
 'ds',
 'let',
 'go',
 'depressing',
 'easy',
 'grad',
 'love',
 'ballads',
 'out',
 'so',
 'emotional',
 'slow',
 'sleep',
 'sad',
 'times']

In [20]:
# Total number of cleaned words for this track, duplicates included.
len(playlist_words)

10444

In [21]:
# Number of distinct cleaned words for this track.
len(set(playlist_words))

2102

In [22]:
# TODO: remove 's' from plural forms of words

## Get word bin counts for each track

In [23]:
# for each track_id
track_id = '3S0OXQeoh0w6AY8WQVckRW'

# get clean playlist words
playlist_words = clean_playlist_names(track_id=track_id)

# get unique playlist words
unique_words = set(playlist_words)

# create a dictionary with each unique word as a key with value = 0
word_bins = {}
for word in unique_words:
    word_bins[word] = 0

# go through the clean playlist words and tabulate using the dictionary
for word in playlist_words:
    word_bins[word] += 1

# convert into a list of words sorted by bin count
sorted_items = sorted(word_bins.items(), key=lambda x: x[1], reverse=True)


In [24]:
# Twenty most frequent words for this track.
sorted_items[:20]

[('chill', 881),
 ('wedding', 682),
 ('love', 659),
 ('throwback', 656),
 ('good', 577),
 ('throwbacks', 341),
 ('happy', 327),
 ('summer', 285),
 ('pop', 266),
 ('car', 250),
 ('party', 246),
 ('beach', 207),
 ('old', 201),
 ('road', 188),
 ('oldies', 165),
 ('sing', 152),
 ('dinner', 146),
 ('trip', 145),
 ('shower', 145),
 ('mellow', 140)]

In [25]:
# Last twenty entries of the sorted list (lowest counts).
sorted_items[-20:]

[('hiking', 1),
 ('guard', 1),
 ('isabella', 1),
 ('annas', 1),
 ('brain', 1),
 ('british', 1),
 ('machine', 1),
 ('flora', 1),
 ('robert', 1),
 ('demons', 1),
 ('native', 1),
 ('mystic', 1),
 ('garrett', 1),
 ('billboard', 1),
 ('uh', 1),
 ('beatriz', 1),
 ('contry', 1),
 ('pumpkin', 1),
 ('mice', 1),
 ('breath', 1)]

In [26]:
# Number of distinct words for this track.
len(sorted_items)

2666

In [27]:
# Figure out how many unique words there are in all playlists for all tracks
global_words = []    # distinct words, in first-seen order
seen_words = set()   # the same words, for O(1) membership tests

track_ids = df['track_id'].unique()
total_rows = len(track_ids)
start_time = datetime.datetime.now()
t = 0  # tracks processed so far
p = 0  # next whole-percent threshold at which progress is printed

for track_id in track_ids:
    # get clean playlist words
    playlist_words = clean_playlist_names(track_id=track_id)

    # Record each word the first time it is seen. Checking against the set
    # avoids scanning the ever-growing list for every word, and walking the
    # words in order keeps `global_words` reproducible across runs.
    for w in playlist_words:
        if w not in seen_words:
            seen_words.add(w)
            global_words.append(w)

    t += 1
    perc_complete = t*100/total_rows
    if perc_complete >= p:
        print(f'{perc_complete:.2f}%, {datetime.datetime.now()-start_time}')
        p += 1

0.02%, 0:00:00.013000
1.01%, 0:00:02.246733
2.01%, 0:00:02.917110
3.01%, 0:00:03.317543
4.01%, 0:00:07.852257
5.00%, 0:00:11.423606
6.02%, 0:00:14.588035
7.01%, 0:00:16.362815
8.01%, 0:00:16.831348
9.01%, 0:00:17.112353
10.00%, 0:00:17.233705
11.00%, 0:00:17.416267
12.02%, 0:00:17.629667
13.01%, 0:00:17.767971
14.01%, 0:00:19.196945
15.01%, 0:00:22.334762
16.00%, 0:00:23.770836
17.00%, 0:00:24.439082
18.02%, 0:00:24.606354
19.01%, 0:00:26.006368
20.01%, 0:00:27.006629
21.01%, 0:00:27.579693
22.00%, 0:00:27.839629
23.00%, 0:00:27.944414
24.02%, 0:00:28.045510
25.01%, 0:00:28.890524
26.01%, 0:00:29.090356
27.01%, 0:00:29.246164
28.00%, 0:00:29.636769
29.00%, 0:00:32.740201
30.01%, 0:00:39.602707
31.01%, 0:00:45.308816
32.01%, 0:00:45.693456
33.01%, 0:00:45.823037
34.00%, 0:00:46.227992
35.00%, 0:00:46.675416
36.01%, 0:00:48.327212
37.01%, 0:00:49.346959
38.01%, 0:00:50.645086
39.01%, 0:00:50.917998
40.00%, 0:00:51.249263
41.00%, 0:00:51.712976
42.01%, 0:00:55.669815
43.01%, 0:01:00.49073

In [28]:
# Number of distinct cleaned words across all tracks.
len(global_words)

9723

In [33]:
# Word counts for the whole dataset:
#   data["global"][word]             -> occurrences across all tracks
#   data["track_id"][track_id][word] -> occurrences for that track
data = {
    "global": dict.fromkeys(global_words, 0),
    "track_id": {},
}

track_ids = df['track_id'].unique()
total_rows = len(track_ids)
start_time = datetime.datetime.now()
t = 0  # tracks processed so far
p = 0  # next whole-percent threshold at which progress is printed

for track_id in track_ids:
    # get clean playlist words
    playlist_words = clean_playlist_names(track_id=track_id)

    # Tally per-track and global counts in one pass. Per-track keys are
    # inserted in first-seen order, so the saved JSON does not depend on
    # set ordering. `.get(word, 0)` keeps the global tally working even if
    # `global_words` was built before the stop-word lists were last edited.
    track_counts = {}
    for word in playlist_words:
        track_counts[word] = track_counts.get(word, 0) + 1
        data["global"][word] = data["global"].get(word, 0) + 1
    data["track_id"][track_id] = track_counts

    t += 1
    perc_complete = t*100/total_rows
    if perc_complete >= p:
        print(f'{perc_complete:.2f}%, {datetime.datetime.now()-start_time}')
        p += 1




0.02%, 0:00:00.015910
1.01%, 0:00:01.476617
2.01%, 0:00:01.877379
3.01%, 0:00:02.118952
4.01%, 0:00:04.914080
5.00%, 0:00:07.219510
6.02%, 0:00:09.034866
7.01%, 0:00:09.985443
8.01%, 0:00:10.236704
9.01%, 0:00:10.403715
10.00%, 0:00:10.485625
11.00%, 0:00:10.589262
12.02%, 0:00:10.722454
13.01%, 0:00:10.798887
14.01%, 0:00:11.708025
15.01%, 0:00:13.622438
16.00%, 0:00:14.440988
17.00%, 0:00:14.798887
18.02%, 0:00:14.889885
19.01%, 0:00:15.724543
20.01%, 0:00:16.257983
21.01%, 0:00:16.547114
22.00%, 0:00:16.691810
23.00%, 0:00:16.746438
24.02%, 0:00:16.810184
25.01%, 0:00:17.259328
26.01%, 0:00:17.359446
27.01%, 0:00:17.476785
28.00%, 0:00:17.697829
29.00%, 0:00:19.782915
30.01%, 0:00:24.232955
31.01%, 0:00:27.839646
32.01%, 0:00:28.041192
33.01%, 0:00:28.103587
34.00%, 0:00:28.354712
35.00%, 0:00:28.573689
36.01%, 0:00:29.605037
37.01%, 0:00:30.157107
38.01%, 0:00:31.073474
39.01%, 0:00:31.261362
40.00%, 0:00:31.415240
41.00%, 0:00:31.658611
42.01%, 0:00:33.981577
43.01%, 0:00:36.68668

In [34]:
# Write the global and per-track word counts to one JSON file
# in the current working directory.
with open("word_count_data.json", mode="w") as json_out:
    json.dump(data, fp=json_out)