# Clean Combined Dataset

Remove as many "non-vibe" words from playlist names and get word occurence counts for each word

In [1]:
import pandas as pd
import os
import json
import copy
import datetime

import re


## Load combined dataset

In [2]:
dataset_dir = os.path.join('..','..','datasets','tracks_playlist_dataset')

df_file_path = os.path.join(dataset_dir,'tracks_playlists_df.pkl')

df = pd.read_pickle(df_file_path)
df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,playlist_names
0,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,0.442,0.632,1,...,1,0.0295,0.426,0.00419,0.0735,0.196,78.899,4,acoustic,"high, high, AUTUMN, Vampire Diaries, sleep, i ..."
1,1KHdq8NK9QxnGjdXb55NiG,Landon Pigg,The Boy Who Never,Falling in Love at a Coffee Shop,58,244986,False,0.489,0.561,4,...,1,0.0274,0.2,4.6e-05,0.179,0.238,83.457,3,acoustic,"Say You Won't Let Go, mellow, Dance, Chillin, ..."
2,2qLMf6TuEC3ruGJg4SMMN6,Jason Mraz;Colbie Caillat,We Sing. We Dance. We Steal Things.,Lucky,68,189613,False,0.625,0.414,0,...,1,0.0369,0.294,0.0,0.151,0.669,130.088,4,acoustic,"Wedding, #boostyourrun, go to, Acoustic, 😍😍😍, ..."
3,3S0OXQeoh0w6AY8WQVckRW,Jason Mraz,We Sing. We Dance. We Steal Things.,I'm Yours,75,242946,False,0.703,0.444,11,...,1,0.0417,0.559,0.0,0.0973,0.712,150.96,4,acoustic,"tb, Catchy Songs, #boostyourrun, go to, Atlas,..."
4,5TvE3pk05pyFIGdSY9j4DJ,A Great Big World;Christina Aguilera,Is There Anybody Out There? - Track by Track C...,Say Something,70,229400,False,0.407,0.147,2,...,1,0.0355,0.857,3e-06,0.0913,0.0765,141.284,3,acoustic,"~Rando~, go to, Solitude, Acoustic, happy, yo,..."


In [3]:
len(df)

7560

## Analyze raw words in playlist names

In [4]:
track_id = '3S0OXQeoh0w6AY8WQVckRW'
filter = df['track_id'] == track_id
row = df[filter].iloc[0]
playlist_names = row['playlist_names']
playlist_names = playlist_names.split(',')

In [5]:
playlist_names[0:10]

['tb',
 ' Catchy Songs',
 ' #boostyourrun',
 ' go to',
 ' Atlas',
 ' throwback',
 ' Acoustic',
 ' ((chris))',
 ' throw backs',
 ' Throwbacks ']

In [6]:
playlist_names[-10:]

[' Shower',
 ' throwback ',
 ' Stuff I like',
 ' Classics',
 ' good times',
 ' Throwback',
 ' Songs that never fail to make white people beyond turnt',
 ' kareoke',
 ' I love You',
 ' Lake']

In [15]:
track_id = '5TvE3pk05pyFIGdSY9j4DJ'
filter = df['track_id'] == track_id
row = df[filter].iloc[0]
playlist_names = row['playlist_names']
playlist_names = playlist_names.split(',')

In [16]:
playlist_names[0:10]

['~Rando~',
 ' go to',
 ' Solitude',
 ' Acoustic',
 ' happy',
 ' yo',
 ' my heart',
 ' Isis',
 ' Top Hits',
 ' Mya']

In [17]:
playlist_names[-10:]

[' Depressing songs',
 ' Easy Listening',
 ' GRAD',
 ' L.o.v.e',
 ' Ballads',
 ' Inside Out: So Emotional',
 ' Slow',
 ' feels',
 ' Sleep',
 ' sad times']

## Clean playlist names

In [7]:
# Articles
articles = [
    "a", "an", "the"
]

# Common Prepositions
# prepositions = [
#     "about", "above", "across", "after", "against", "along", "among",
#     "around", "at", "before", "behind", "below", "beneath", "beside",
#     "besides", "between", "beyond", "but", "by", "concerning", "considering",
#     "despite", "down", "during", "except", "excepting", "excluding",
#     "following", "for", "from", "in", "inside", "into", "like", "minus",
#     "near", "of", "off", "on", "onto", "opposite", "outside", "over", "past",
#     "per", "plus", "regarding", "round", "save", "since", "than", "through",
#     "to", "toward", "towards", "under", "underneath", "unlike", "until",
#     "up", "upon", "versus", "via", "with", "within", "without"
# ]

prepositions = [
    "about", "above", "across", "after", "against", "along", "among",
    "around", "at", "before", "behind", "below", "beneath", "beside",
    "besides", "between", "beyond", "but", "by", "concerning", "considering",
    "despite", "down", "during", "except", "excepting", "excluding",
    "following", "for", "from", "in", "inside", "into", "like", "minus",
    "near", "of", "off", "on", "onto", "outside", "over",
    "per", "plus", "regarding", "round", "since", "than", "through",
    "to", "versus", "via", "with", "within", "without"
]

# Pronouns (personal, possessive, reflexive, demonstrative, relative, interrogative, indefinite)
pronouns = [
    # Personal
    "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
    # Possessive
    "my", "mine", "your", "yours", "his", "her", "hers", "its", "our", "ours", "their", "theirs",
    # Reflexive
    "myself", "yourself", "himself", "herself", "itself", "ourselves", "yourselves", "themselves",
    # Demonstrative
    "this", "that", "these", "those",
    # Relative
    "who", "whom", "whose", "which", "that",
    # Interrogative
    "what", "which", "who", "whom", "whose",
    # Indefinite
    "anybody", "anyone", "anything", "each", "either", "everybody", "everyone", "everything",
    "neither", "nobody", "no one", "nothing", "one", "somebody", "someone", "something",
    "both", "few", "many", "several", "all", "any", "most", "none", "some"
]


In [8]:
# remove synonyms for music/songs
# music_stopwords = [
#     # General music terms
#     "music", "song", "songs", "track", "tracks", "tune", "tunes",
#     "melody", "melodies", "rhythm", "harmony", "lyrics",
    
#     # Album / playlist words
#     "playlist", "mix", "compilation", "collection", "set", "jam", "jams",
#     "record", "records", "album", "albums", "single", "singles", "ep", "lp",
    
#     # Performance terms
#     "band", "bands", "group", "groups", "orchestra", "choir", "ensemble",
#     "performance", "performances", "concert", "gig", "show", "live",
    
#     # Listening context
#     "listen", "listening", "play", "played", "plays", "playing",
#     "sound", "sounds", "audio",
    
#     # Time/context in music
#     "remix", "remixes", "cover", "covers", "version", "versions",
#     "original", "edit", "edits", "demo", "demos",
    
#     # Streaming platform common words
#     "radio", "station", "stations", "session", "sessions",
    
#     # Music role terms
#     "dj", "producer", "production", "artist", "artists", "musician", "musicians",
    
#     # Genre meta-words (not actual genres)
#     "hit", "hits", "chart", "charts", "top", "best", "greatest", "favorites", "favourite",
#     "new", "latest", "classic", "classics", "oldies"
# ]

music_stopwords = [
    # General music terms
    "music", "song", "songs", "track", "tracks", "tune", "tunes",
    "melody", "melodies", "rhythm", "harmony", "lyrics",
    
    # Album / playlist words
    "playlist", "mix", "compilation", "collection", "set", "jam", "jams",
    "record", "records", "album", "albums", "single", "singles", "ep", "lp",
    
    # Performance terms
    "band", "bands", "group", "groups", "orchestra", "choir", "ensemble",
    "performance", "performances", "concert", "gig", "show", "live",
    
    # Listening context
    "listen", "listening", "play", "played", "plays", "playing",
    "sound", "sounds", "audio",
    
    # Time/context in music
    "remix", "remixes", "cover", "covers", "version", "versions",
    "original", "edit", "edits", "demo", "demos",
    
    # Streaming platform common words
    "radio", "station", "stations", "session", "sessions",
    
    # Music role terms
    "dj", "producer", "production", "artist", "artists", "musician", "musicians",
    
    # Genre meta-words (not actual genres)
    "hit", "hits", "chart", "charts", "top", "best", "greatest", "favorites", "favourite",
    "new"
]

In [9]:
def clean_playlist_names(track_id):
    # find track_id in DataFrame
    filter = df['track_id'] == track_id
    row = df[filter].iloc[0]

    # get playlist names 
    playlist_names = row['playlist_names']

    # convert to a list
    playlist_names = playlist_names.split(',')

    # separate into distinct words
    playlist_words = []
    for name in playlist_names:
        # convert to lower case
        n = name.lower()

        # remove symbols and emojis
        n = re.sub(r"[^\w\s]", "", n, flags=re.UNICODE)

        # split based on spaces
        n = n.split(' ')
        
        for word in n:
            # exclude articles, prepositions, pronouns
            exc0 = word == ''
            exc1 = word in articles
            exc2 = word in prepositions
            exc3 = word in pronouns

            # exclude music stop words
            exc4 = word in music_stopwords

            word_ok = not (exc0 or exc1 or exc2 or exc3)
            
            if word_ok:
                playlist_words.append(word)

    return playlist_words

In [10]:
playlist_words = clean_playlist_names(track_id=track_id)

In [11]:
playlist_words[:20]

['tb',
 'catchy',
 'songs',
 'boostyourrun',
 'go',
 'atlas',
 'throwback',
 'acoustic',
 'chris',
 'throw',
 'backs',
 'throwbacks',
 'lm',
 'mb',
 'mya',
 'main',
 '2000s',
 'hits',
 'roadtrip',
 'roadtrip']

In [12]:
playlist_words[-20:]

['good',
 'ukulele',
 'roadtrip',
 'shower',
 'throwback',
 'stuff',
 'classics',
 'good',
 'times',
 'throwback',
 'songs',
 'never',
 'fail',
 'make',
 'white',
 'people',
 'turnt',
 'kareoke',
 'love',
 'lake']

In [18]:
playlist_words = clean_playlist_names(track_id='5TvE3pk05pyFIGdSY9j4DJ')

In [19]:
playlist_words[:20]

['rando',
 'go',
 'solitude',
 'acoustic',
 'happy',
 'yo',
 'heart',
 'isis',
 'top',
 'hits',
 'mya',
 'hayley',
 'chill',
 'playlist',
 'chilly',
 'other',
 'breathe',
 'jens',
 'fallen',
 'run']

In [20]:
playlist_words[-20:]

['together',
 'confidence',
 'ds',
 'let',
 'go',
 'depressing',
 'songs',
 'easy',
 'listening',
 'grad',
 'love',
 'ballads',
 'out',
 'so',
 'emotional',
 'slow',
 'feels',
 'sleep',
 'sad',
 'times']

In [21]:
len(playlist_words)

14333

In [22]:
len(set(playlist_words))

2246

In [13]:
# TODO: remove 's' from plural forms of words

## Get word bin counts for each track

In [None]:
# for each track_id

# get clean playlist words

# get unique playlist words

# create a dictionary with each unique word as a key with value = 0

# go through the clean playlist words and tabulate using the dictionary

# convert into a list of words sorted by bin count