# Clean Combined Dataset

Remove as many "non-vibe" words as possible from playlist names and get word occurrence counts for each word

In [1]:
import copy
import datetime
import json
import os
import re
from collections import Counter

import pandas as pd


## Load combined dataset

In [2]:
# Folder holding the combined tracks/playlists dataset, given relative to the
# notebook's working directory (two levels up, then datasets/tracks_playlist_dataset).
dataset_dir = os.path.join('..','..','datasets','tracks_playlist_dataset')

# Pickled DataFrame with track-level columns plus a `playlist_names` column
# (see the preview below).
df_file_path = os.path.join(dataset_dir,'tracks_playlists_df.pkl')

# NOTE(review): unpickling can execute arbitrary code; only load this file when it
# comes from a trusted source.
df = pd.read_pickle(df_file_path)
# Preview the first rows.
df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,playlist_names
0,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,0.442,0.632,1,...,1,0.0295,0.426,0.00419,0.0735,0.196,78.899,4,acoustic,"high, high, AUTUMN, Vampire Diaries, sleep, i ..."
1,1KHdq8NK9QxnGjdXb55NiG,Landon Pigg,The Boy Who Never,Falling in Love at a Coffee Shop,58,244986,False,0.489,0.561,4,...,1,0.0274,0.2,4.6e-05,0.179,0.238,83.457,3,acoustic,"Say You Won't Let Go, mellow, Dance, Chillin, ..."
2,2qLMf6TuEC3ruGJg4SMMN6,Jason Mraz;Colbie Caillat,We Sing. We Dance. We Steal Things.,Lucky,68,189613,False,0.625,0.414,0,...,1,0.0369,0.294,0.0,0.151,0.669,130.088,4,acoustic,"Wedding, #boostyourrun, go to, Acoustic, 😍😍😍, ..."
3,3S0OXQeoh0w6AY8WQVckRW,Jason Mraz,We Sing. We Dance. We Steal Things.,I'm Yours,75,242946,False,0.703,0.444,11,...,1,0.0417,0.559,0.0,0.0973,0.712,150.96,4,acoustic,"tb, Catchy Songs, #boostyourrun, go to, Atlas,..."
4,5TvE3pk05pyFIGdSY9j4DJ,A Great Big World;Christina Aguilera,Is There Anybody Out There? - Track by Track C...,Say Something,70,229400,False,0.407,0.147,2,...,1,0.0355,0.857,3e-06,0.0913,0.0765,141.284,3,acoustic,"~Rando~, go to, Solitude, Acoustic, happy, yo,..."


In [3]:
len(df)

7560

## Analyze raw words in playlist names

In [4]:
# Look up a single track and list the raw playlist names it appears in.
track_id = '3S0OXQeoh0w6AY8WQVckRW'
# Named `track_mask` rather than `filter` so this cell does not shadow the
# builtin `filter()` in the notebook namespace.
track_mask = df['track_id'] == track_id
row = df[track_mask].iloc[0]
# `playlist_names` is one comma-separated string; splitting on ',' keeps the
# whitespace around each name (visible in the outputs below).
playlist_names = row['playlist_names']
playlist_names = playlist_names.split(',')

In [5]:
playlist_names[0:10]

['tb',
 ' Catchy Songs',
 ' #boostyourrun',
 ' go to',
 ' Atlas',
 ' throwback',
 ' Acoustic',
 ' ((chris))',
 ' throw backs',
 ' Throwbacks ']

In [6]:
playlist_names[-10:]

[' Shower',
 ' throwback ',
 ' Stuff I like',
 ' Classics',
 ' good times',
 ' Throwback',
 ' Songs that never fail to make white people beyond turnt',
 ' kareoke',
 ' I love You',
 ' Lake']

In [7]:
# Repeat the raw-name inspection for a second track.
track_id = '5TvE3pk05pyFIGdSY9j4DJ'
# Named `track_mask` rather than `filter` so this cell does not shadow the
# builtin `filter()` in the notebook namespace.
track_mask = df['track_id'] == track_id
row = df[track_mask].iloc[0]
# `playlist_names` is one comma-separated string; splitting on ',' keeps the
# whitespace around each name (visible in the outputs below).
playlist_names = row['playlist_names']
playlist_names = playlist_names.split(',')

In [8]:
playlist_names[0:10]

['~Rando~',
 ' go to',
 ' Solitude',
 ' Acoustic',
 ' happy',
 ' yo',
 ' my heart',
 ' Isis',
 ' Top Hits',
 ' Mya']

In [9]:
playlist_names[-10:]

[' Depressing songs',
 ' Easy Listening',
 ' GRAD',
 ' L.o.v.e',
 ' Ballads',
 ' Inside Out: So Emotional',
 ' Slow',
 ' feels',
 ' Sleep',
 ' sad times']

## Clean playlist names

In [10]:
# Grammatical stop-word lists used to strip filler words from playlist names.
# Each one is a plain list of lower-case words that is tested with `in`.

# Articles
articles = [
    "a", "an", "the"
]

# Common prepositions.
# NOTE: an earlier draft of this list also contained "opposite", "past", "save",
# "toward", "towards", "under", "underneath", "unlike", "until", "up" and "upon".
# They are not excluded any more, so they survive cleaning -- presumably because
# they can describe a vibe (TODO confirm).
prepositions = [
    "about", "above", "across", "after", "against", "along", "among",
    "around", "at", "before", "behind", "below", "beneath", "beside",
    "besides", "between", "beyond", "but", "by", "concerning", "considering",
    "despite", "down", "during", "except", "excepting", "excluding",
    "following", "for", "from", "in", "inside", "into", "like", "minus",
    "near", "of", "off", "on", "onto", "outside", "over",
    "per", "plus", "regarding", "round", "since", "than", "through",
    "to", "versus", "via", "with", "within", "without"
]

# Pronouns (personal, possessive, reflexive, demonstrative, relative, interrogative, indefinite).
# Each word is listed once, under the first category it belongs to.
pronouns = [
    # Personal
    "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
    # Possessive ("her" is already listed under Personal)
    "my", "mine", "your", "yours", "his", "hers", "its", "our", "ours", "their", "theirs",
    # Reflexive
    "myself", "yourself", "himself", "herself", "itself", "ourselves", "yourselves", "themselves",
    # Demonstrative
    "this", "that", "these", "those",
    # Relative / interrogative ("that" is already listed under Demonstrative)
    "who", "whom", "whose", "which", "what",
    # Indefinite
    # NOTE: "no one" contains a space, so it can never equal a single
    # whitespace-separated token; it is kept only for completeness.
    "anybody", "anyone", "anything", "each", "either", "everybody", "everyone", "everything",
    "neither", "nobody", "no one", "nothing", "one", "somebody", "someone", "something",
    "both", "few", "many", "several", "all", "any", "most", "none", "some"
]


In [11]:
# Words that only say "this is music / a playlist" and therefore carry no vibe;
# they are removed from playlist names.
# Relative to an earlier draft of this list, "latest", "classic", "classics" and
# "oldies" are no longer excluded, and "spotify"/"spotifys" were added.
# Punctuation is stripped before matching, which is presumably why the possessive
# "spotify's" is listed as "spotifys".
music_stopwords = [
    # General music terms
    "music", "song", "songs", "track", "tracks", "tune", "tunes",
    "melody", "melodies", "rhythm", "harmony", "lyrics",
    
    # Album / playlist words
    "playlist", "mix", "compilation", "collection", "set", "jam", "jams",
    "record", "records", "album", "albums", "single", "singles", "ep", "lp",
    
    # Performance terms
    "band", "bands", "group", "groups", "orchestra", "choir", "ensemble",
    "performance", "performances", "concert", "gig", "show", "live",
    
    # Listening context
    "listen", "listening", "play", "played", "plays", "playing",
    "sound", "sounds", "audio",
    
    # Time/context in music
    "remix", "remixes", "cover", "covers", "version", "versions",
    "original", "edit", "edits", "demo", "demos",
    
    # Streaming platform common words
    "radio", "station", "stations", "session", "sessions",
    
    # Music role terms
    "dj", "producer", "production", "artist", "artists", "musician", "musicians",
    
    # Genre meta-words (not actual genres)
    "hit", "hits", "chart", "charts", "top", "best", "greatest", "favorites", "favourite",
    "new",

    "spotify", "spotifys"
]

In [12]:
# Generic words that merely mean "feeling" or "mood" without naming a specific one;
# they are removed from playlist names.
# This list replaces a broader earlier draft that also contained terms such as
# "sentiments", "passions", "spirit", "atmosphere", "energy" and "aura"; those
# words are NOT excluded here and therefore survive cleaning.
emotion_words = [
    "emotions", "emotion",
    "feelings", "feeling",
    "attitude", "attitudes", 
    "vibe", "vibes", 
    "feel", "feels", "headspace",
    "character", "mood", "moody"
]

In [13]:
def clean_playlist_names(track_id: str, min_word_length: int = 3) -> list:
    """Return the cleaned words found in all playlist names of one track.

    The first row of the global ``df`` whose ``track_id`` matches is used. Its
    comma-separated ``playlist_names`` string is lower-cased, stripped of
    punctuation/emojis, digits and underscores, and split into words. Words that
    are too short or appear in one of the stop-word lists (``articles``,
    ``prepositions``, ``pronouns``, ``music_stopwords``, ``emotion_words``) are
    dropped. Duplicates are kept, so the result can be tallied directly.

    Parameters
    ----------
    track_id : str
        Value to look up in ``df['track_id']``.
    min_word_length : int, default 3
        Words shorter than this are discarded (the default reproduces the
        original "drop words of length <= 2" rule).

    Returns
    -------
    list of str
        Cleaned words in playlist order; empty when the track has no
        playlist-name string (e.g. a missing value).

    Raises
    ------
    IndexError
        If no row of ``df`` has the given ``track_id``.
    """
    # `track_mask` rather than `filter`, so the builtin is not shadowed.
    track_mask = df['track_id'] == track_id
    row = df[track_mask].iloc[0]

    raw_names = row['playlist_names']
    if not isinstance(raw_names, str):
        # Missing value (NaN/None): nothing to clean instead of an AttributeError.
        return []

    # One set lookup per word instead of five linear list scans. Built per call
    # so edits to the stop-word lists take effect without re-defining the function.
    stop_words = frozenset().union(
        articles, prepositions, pronouns, music_stopwords, emotion_words
    )

    playlist_words = []
    for name in raw_names.split(','):
        # Delete (not replace) symbols/emojis, digits and underscores in one pass,
        # so that e.g. "l.o.v.e" collapses to "love".
        cleaned = re.sub(r"[^\w\s]|[\d_]", "", name.lower())

        # split() breaks on any whitespace run; split(' ') left tabs, newlines and
        # non-breaking spaces glued into tokens (e.g. "chill\n" != "chill").
        for word in cleaned.split():
            if len(word) >= min_word_length and word not in stop_words:
                playlist_words.append(word)

    return playlist_words

In [14]:
# NOTE(review): `track_id` is whatever the last exploration cell left behind
# ('5TvE3pk05pyFIGdSY9j4DJ' when the notebook is run top to bottom), so this
# produces the same result as the explicit call a few cells further down.
playlist_words = clean_playlist_names(track_id=track_id)

In [15]:
playlist_words[:20]

['rando',
 'solitude',
 'acoustic',
 'happy',
 'heart',
 'isis',
 'mya',
 'hayley',
 'chill',
 'chilly',
 'other',
 'breathe',
 'jens',
 'fallen',
 'run',
 'sad',
 'quiet',
 'pure',
 'love',
 'chill']

In [16]:
playlist_words[-20:]

['potential',
 'hmmmm',
 'alternative',
 'jared',
 'cry',
 'love',
 'together',
 'confidence',
 'let',
 'depressing',
 'easy',
 'grad',
 'love',
 'ballads',
 'out',
 'emotional',
 'slow',
 'sleep',
 'sad',
 'times']

In [17]:
# Clean the playlist names of one track, passing the id explicitly so the cell
# does not depend on a `track_id` variable set elsewhere.
playlist_words = clean_playlist_names(track_id='5TvE3pk05pyFIGdSY9j4DJ')

In [18]:
playlist_words[:20]

['rando',
 'solitude',
 'acoustic',
 'happy',
 'heart',
 'isis',
 'mya',
 'hayley',
 'chill',
 'chilly',
 'other',
 'breathe',
 'jens',
 'fallen',
 'run',
 'sad',
 'quiet',
 'pure',
 'love',
 'chill']

In [19]:
playlist_words[-20:]

['potential',
 'hmmmm',
 'alternative',
 'jared',
 'cry',
 'love',
 'together',
 'confidence',
 'let',
 'depressing',
 'easy',
 'grad',
 'love',
 'ballads',
 'out',
 'emotional',
 'slow',
 'sleep',
 'sad',
 'times']

In [20]:
len(playlist_words)

10097

In [21]:
len(set(playlist_words))

1984

In [22]:
# TODO: remove 's' from plural forms of words

## Get word bin counts for each track

In [23]:
# Tally how often each cleaned word occurs in the playlist names of one track.
track_id = '3S0OXQeoh0w6AY8WQVckRW'

# get clean playlist words
playlist_words = clean_playlist_names(track_id=track_id)

# get unique playlist words
unique_words = set(playlist_words)

# Counter replaces the hand-rolled "initialise every key to 0, then increment"
# dictionary; it is a dict subclass, so `word_bins[word]` still works.
word_bins = Counter(playlist_words)

# List of (word, count) pairs sorted by descending count. Ties keep the order in
# which the words were first seen, so the result is reproducible across kernel
# restarts (iterating a set of strings, as before, is not).
sorted_items = word_bins.most_common()


In [24]:
sorted_items[:20]

[('chill', 881),
 ('wedding', 682),
 ('love', 659),
 ('throwback', 656),
 ('good', 577),
 ('throwbacks', 341),
 ('happy', 327),
 ('summer', 285),
 ('pop', 266),
 ('car', 250),
 ('party', 246),
 ('beach', 207),
 ('old', 201),
 ('road', 188),
 ('oldies', 165),
 ('sing', 152),
 ('dinner', 146),
 ('trip', 145),
 ('shower', 145),
 ('mellow', 140)]

In [25]:
sorted_items[-20:]

[('woooo', 1),
 ('carr', 1),
 ('releases', 1),
 ('romanticonas', 1),
 ('julia', 1),
 ('jamie', 1),
 ('mind', 1),
 ('carrie', 1),
 ('try', 1),
 ('rhianna', 1),
 ('fluffy', 1),
 ('drugs', 1),
 ('zombie', 1),
 ('germany', 1),
 ('roots', 1),
 ('monster', 1),
 ('facu', 1),
 ('yosemite', 1),
 ('dutch', 1),
 ('pipe', 1)]

In [26]:
len(sorted_items)

2510

In [27]:
# Figure out how many unique words there are in all playlists for all tracks
track_ids = df['track_id'].unique()  # computed once, reused for the loop and the total
total_rows = len(track_ids)

global_words = []   # unique words, in the order they are first seen
seen_words = set()  # O(1) membership test; `w in global_words` on the list was O(n)

start_time = datetime.datetime.now()
t = 0
p = 0
for track_id in track_ids:
    # get clean playlist words
    playlist_words = clean_playlist_names(track_id=track_id)

    for w in playlist_words:
        if w not in seen_words:
            seen_words.add(w)
            global_words.append(w)

    # progress report: print once each time another whole percent is reached
    t += 1
    perc_complete = t*100/total_rows
    if perc_complete >= p:
        print(f'{perc_complete:.2f}%, {datetime.datetime.now()-start_time}')
        p += 1

0.02%, 0:00:00.025998
1.01%, 0:00:02.170972
2.01%, 0:00:02.812826
3.01%, 0:00:03.211339
4.01%, 0:00:07.701607
5.00%, 0:00:11.265274
6.02%, 0:00:14.290696
7.01%, 0:00:15.973813
8.01%, 0:00:16.422080
9.01%, 0:00:16.691721
10.00%, 0:00:16.822822
11.00%, 0:00:16.978660
12.02%, 0:00:17.178914
13.01%, 0:00:17.313546
14.01%, 0:00:18.723872
15.01%, 0:00:21.780191
16.00%, 0:00:23.116883
17.00%, 0:00:23.710405
18.02%, 0:00:23.866474
19.01%, 0:00:25.255869
20.01%, 0:00:26.218463
21.01%, 0:00:26.720802
22.00%, 0:00:26.973535
23.00%, 0:00:27.057361
24.02%, 0:00:27.139159
25.01%, 0:00:27.889536
26.01%, 0:00:28.069822
27.01%, 0:00:28.238911
28.00%, 0:00:28.617941
29.00%, 0:00:31.592401
30.01%, 0:00:38.270855
31.01%, 0:00:43.887788
32.01%, 0:00:44.266251
33.01%, 0:00:44.389590
34.00%, 0:00:44.786827
35.00%, 0:00:45.216602
36.01%, 0:00:46.841511
37.01%, 0:00:47.738942
38.01%, 0:00:48.991409
39.01%, 0:00:49.249579
40.00%, 0:00:49.542096
41.00%, 0:00:50.011321
42.01%, 0:00:53.798842
43.01%, 0:00:58.45225

In [28]:
len(global_words)

9276

In [29]:
# Build the structure that is saved to JSON:
#   data["global"][word]              -> occurrences of `word` across all tracks
#   data["track_id"][track_id][word]  -> occurrences of `word` for that one track
data = {
    # start every known word at 0 so the key order follows `global_words`
    "global": dict.fromkeys(global_words, 0),
    "track_id":{}
}
global_counts = data["global"]

track_ids = df['track_id'].unique()  # computed once, reused for the loop and the total
total_rows = len(track_ids)
start_time = datetime.datetime.now()
t = 0
p = 0

for track_id in track_ids:
    # get clean playlist words
    playlist_words = clean_playlist_names(track_id=track_id)

    # tally this track's words in first-seen order (plain dict so it serialises
    # exactly like before)
    track_word_counts = {}
    for word in playlist_words:
        track_word_counts[word] = track_word_counts.get(word, 0) + 1
    data["track_id"][track_id] = track_word_counts

    # fold into the global tally; .get() tolerates a word missing from
    # `global_words` (e.g. a stale list after editing the cleaning rules)
    # instead of raising KeyError
    for word, count in track_word_counts.items():
        global_counts[word] = global_counts.get(word, 0) + count

    # progress report: print once each time another whole percent is reached
    t += 1
    perc_complete = t*100/total_rows
    if perc_complete >= p:
        print(f'{perc_complete:.2f}%, {datetime.datetime.now()-start_time}')
        p += 1




0.02%, 0:00:00.012398
1.01%, 0:00:01.537522
2.01%, 0:00:01.959491
3.01%, 0:00:02.194529
4.01%, 0:00:04.988541
5.00%, 0:00:07.241069
6.02%, 0:00:09.058288
7.01%, 0:00:10.045802
8.01%, 0:00:10.303630
9.01%, 0:00:10.463037
10.00%, 0:00:10.544289
11.00%, 0:00:10.648051
12.02%, 0:00:10.774266
13.01%, 0:00:10.867698
14.01%, 0:00:11.775327
15.01%, 0:00:13.680056
16.00%, 0:00:14.495590
17.00%, 0:00:14.853865
18.02%, 0:00:14.946602
19.01%, 0:00:15.808863
20.01%, 0:00:16.352659
21.01%, 0:00:16.640161
22.00%, 0:00:16.786814
23.00%, 0:00:16.852774
24.02%, 0:00:16.909767
25.01%, 0:00:17.363418
26.01%, 0:00:17.466891
27.01%, 0:00:17.566813
28.00%, 0:00:17.799689
29.00%, 0:00:19.885303
30.01%, 0:00:24.361873
31.01%, 0:00:27.993468
32.01%, 0:00:28.211191
33.01%, 0:00:28.294918
34.00%, 0:00:28.545528
35.00%, 0:00:28.778635
36.01%, 0:00:29.816474
37.01%, 0:00:30.361245
38.01%, 0:00:31.250346
39.01%, 0:00:31.452864
40.00%, 0:00:31.596990
41.00%, 0:00:31.851783
42.01%, 0:00:34.184873
43.01%, 0:00:36.91844

In [30]:
# Persist the global and per-track word counts together in one JSON file
# (written to the notebook's current working directory).
with open("word_count_data.json", "w") as json_file:
    json.dump(data, json_file)