# Tableau Datasets

In [1]:
import os
import json
import random
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [2]:
dataset_dir = os.path.join('..','..','datasets','tracks_playlist_dataset')

word_bin_filepath = os.path.join(dataset_dir,'word_count_df.pkl')
word_bin_df = pd.read_pickle(word_bin_filepath)

word_bin_df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,...,issa,justin,ish,rockalt,tropical,depressing,lounge,david,wow,jukebox
0,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,0.442,0.632,1,...,0,0,0,0,0,0,0,0,0,0
1,1KHdq8NK9QxnGjdXb55NiG,Landon Pigg,The Boy Who Never,Falling in Love at a Coffee Shop,58,244986,False,0.489,0.561,1,...,0,0,0,0,0,0,1,0,0,0
2,2qLMf6TuEC3ruGJg4SMMN6,Jason Mraz;Colbie Caillat,We Sing. We Dance. We Steal Things.,Lucky,68,189613,False,0.625,0.414,4,...,1,0,0,1,0,0,2,1,3,1
3,3S0OXQeoh0w6AY8WQVckRW,Jason Mraz,We Sing. We Dance. We Steal Things.,I'm Yours,75,242946,False,0.703,1.0,14,...,1,2,0,0,3,3,2,0,3,1
4,5TvE3pk05pyFIGdSY9j4DJ,A Great Big World;Christina Aguilera,Is There Anybody Out There? - Track by Track C...,Say Something,70,229400,False,0.407,1.0,6,...,1,1,2,1,0,32,1,0,0,0


## Word-TrackID Table

Dataset 1: Rows of different words, columns of track_ids, each cell value = word frequency count for each track 

In [3]:
word_list = list(word_bin_df.columns[20:])
cols = ['track_id'] + word_list
# cols

In [25]:
word_list[:5]

['rock', 'oldies', 'classic', 'party', 'summer']

In [4]:
sub_df = word_bin_df[cols]
sub_df.head()

Unnamed: 0,track_id,rock,oldies,classic,party,summer,good,chill,old,classics,...,issa,justin,ish,rockalt,tropical,depressing,lounge,david,wow,jukebox
0,7k9GuJYLp2AzqokyEdwEw2,0,0,0,0,0,8,14,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1KHdq8NK9QxnGjdXb55NiG,7,2,1,5,12,36,142,3,1,...,0,0,0,0,0,0,1,0,0,0
2,2qLMf6TuEC3ruGJg4SMMN6,23,37,3,76,114,211,481,45,15,...,1,0,0,1,0,0,2,1,3,1
3,3S0OXQeoh0w6AY8WQVckRW,78,165,12,246,285,577,881,201,58,...,1,2,0,0,3,3,2,0,3,1
4,5TvE3pk05pyFIGdSY9j4DJ,32,23,6,40,49,152,453,44,14,...,1,1,2,1,0,32,1,0,0,0


In [5]:
# df_t = sub_df.set_index('track_id').T.reset_index().rename(columns={'index': 'playlist_word'})

df_t = sub_df.set_index('track_id').T
df_t.columns.name = "playlist_word"
df_t.reset_index()
df_t.head()


playlist_word,7k9GuJYLp2AzqokyEdwEw2,1KHdq8NK9QxnGjdXb55NiG,2qLMf6TuEC3ruGJg4SMMN6,3S0OXQeoh0w6AY8WQVckRW,5TvE3pk05pyFIGdSY9j4DJ,2DHDuADAHoUW6n0z80RLQF,5JDcQAztvZTIkrWoZihgvC,6xJOhSm4SvZwzy3uhWz26O,7lLKxcNeJtDTWVRKHovLEC,4oa14QBfWRDfJy2agySy0L,...,4fxF8ljwryMZX5c9EKrLFE,7ojJ4XvqBhBcteM0zjMebT,3AEZUABDXNtecAOSC1qTfo,5MIrspiQcYqNVB5XARdf5l,3DTHLA4DaMpgHrK1fRP2bF,3hPbdPIXZPpyywSxnKNtYh,5VsfM32WcLbMlznpWpprkH,5NetSTs4dgiR6GLrZOQcwJ,1Rvl8qsKJurfFTyWLBI9ib,4HnMrFXdsJYVuXRmPCevxH
rock,0,7,23,78,32,4,18,1,0,8,...,6,13,2,0,0,0,0,0,14,0
oldies,0,2,37,165,23,0,22,0,0,9,...,48,43,1,0,0,0,0,1,78,0
classic,0,1,3,12,6,0,1,0,0,1,...,9,9,0,0,0,0,0,0,17,0
party,0,5,76,246,40,4,15,0,2,14,...,137,17,85,0,0,0,0,60,23,0
summer,0,12,114,285,49,33,47,5,4,21,...,78,28,45,0,0,1,0,64,51,0


In [6]:
df_t.to_csv('word_trackID.csv')

## Word-Artist Table

Dataset 2: Rows of different words, columns of artists

In [7]:
total_rows = len(word_bin_df)
artists = [x.split(';') for x in word_bin_df['artists']]
artists[:5]

[['Ross Copperman'],
 ['Landon Pigg'],
 ['Jason Mraz', 'Colbie Caillat'],
 ['Jason Mraz'],
 ['A Great Big World', 'Christina Aguilera']]

In [11]:
n_mult_artists = sum([1 if len(x) > 1 else 0 for x in artists])
n_mult_artists/total_rows

0.170018281535649

In [16]:
# get word values as a list
word_features = [int(x) for x in word_bin_df.iloc[0][word_list].values]

[0,
 0,
 0,
 0,
 0,
 8,
 14,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 2,
 0,
 8,
 0,
 4,
 2,
 2,
 3,
 3,
 0,
 2,
 7,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 24,
 2,
 0,
 0,
 0,
 4,
 1,
 0,
 2,
 11,
 6,
 0,
 0,
 0,
 1,
 0,
 0,
 4,
 0,
 1,
 0,
 0,
 0,
 3,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 2,
 1,
 0,
 0,
 2,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 

In [19]:
unique_artists = []
for a in artists:
    for x in a:
        unique_artists.append(x)
unique_artists = set(unique_artists)
len(unique_artists)

3358

In [21]:
# new dictionary with keys for each individual artist
artist_data = {}

for a in unique_artists:
    artist_data[a] = []

# for each row in word_bin_df
for i,row in word_bin_df.iterrows():
    # get the artists
    artists_list = row['artists']
    artists_list = artists_list.split(';')

    # get word values as a list
    word_features = [int(x) for x in row[word_list].values]
    # add word features vector for each artist to the dictionary
    for a in artists_list:
        artist_data[a].append(word_features)

In [23]:
np.array(artist_data['Jason Mraz']).shape

(7, 657)

In [24]:
np.sum(np.array(artist_data['Jason Mraz']), axis=0).shape

(657,)

In [26]:
np.sum(np.array(artist_data['Jason Mraz']), axis=0)

array([ 127,  224,   20,  385,  537,  951, 1722,  273,   83,   22,   26,
        810,   33,  146,  322,  231,  386,    4, 1449,  247,  112,  197,
        395,  140,  174, 1345,  411,   65,   95,    0,   12,  214,  539,
          9,   11,  171,   34,   18,   30,   93,   94,  226,  152,  175,
          0,   74,  137,   60,  301,    1,   34,  237,  101,   77,   51,
        233,  290,   36,   16,   94,   75,  197,  123,  290,    4,  170,
          5,   47,    9,  289,    2,   69,  275,   72,   93,  102,    6,
         52,   73,  220,  116,  196,  217,   62,    9,   30,   83,   19,
         20,   54,   33,   52,   52,   67,    8,   19,   12,  180,   14,
        292,   78,    3,  253,   24,  181,   23,   42,   87,   41,    0,
         57,    0,   40,   32,    7,   19,   33,    2,   27,   44,   47,
          7,    5,   32,   22,   32,   56,    0,   41,    1,   26,   17,
         31,  200,   59,   38,    7,   33,   81,    1,   11,   22,    3,
         26,  110,   38,   47,    0,    7,   31,   

In [27]:
# new dataframe dictionary
artist_df_data = {
    'artist':[x for x in artist_data.keys()]
}
for word in word_list:
    artist_df_data[word] = []

for artist in artist_data.keys():
    word_vals = np.sum(np.array(artist_data[artist]), axis=0)
    for i,word in enumerate(word_list):
        artist_df_data[word].append(word_vals[i])

artist_df = pd.DataFrame(data=artist_df_data)

In [28]:
artist_df.head()

Unnamed: 0,artist,rock,oldies,classic,party,summer,good,chill,old,classics,...,issa,justin,ish,rockalt,tropical,depressing,lounge,david,wow,jukebox
0,Willyecho,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Kansas,4703,1897,2908,221,186,414,203,817,1253,...,0,1,1,5,0,3,3,4,3,5
2,George Davidson,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,He Is Legend,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Steve Vai,39,0,3,0,1,1,5,3,3,...,0,0,0,0,0,0,0,0,0,0


In [29]:
filter = artist_df['artist'] == 'Jason Mraz'
artist_df[filter]

Unnamed: 0,artist,rock,oldies,classic,party,summer,good,chill,old,classics,...,issa,justin,ish,rockalt,tropical,depressing,lounge,david,wow,jukebox
3020,Jason Mraz,127,224,20,385,537,951,1722,273,83,...,2,2,2,1,3,3,5,1,6,3


In [30]:
keyword = 'tropical'
filter = artist_df[keyword] > 0
filter_df = artist_df[filter][['artist',keyword]]
filter_df.sort_values(by=keyword,ascending=False).head()

Unnamed: 0,artist,tropical
3315,Kygo,264
2195,Robin Schulz,130
1868,Conrad Sewell,91
1273,Parson James,84
2755,Thomas Jack,60


In [31]:
keyword = 'tropical'
filter = word_bin_df[keyword] > 0
filter_df = word_bin_df[filter][['track_name', 'artists', 'album_name',keyword]]
filter_df.sort_values(by=keyword,ascending=False).head()

Unnamed: 0,track_name,artists,album_name,tropical
2504,Firestone,Kygo;Conrad Sewell,Cloud Nine,91
2527,Stole the Show,Kygo;Parson James,Cloud Nine,84
2046,Rivers,Thomas Jack,Rivers,57
2514,The Ocean (feat. Shy Martin),Mike Perry;shy martin,The Ocean (feat. Shy Martin),47
3057,Sun Goes Down (feat. Jasmine Thompson) - Radio...,Robin Schulz;Jasmine Thompson,Prayer,44


In [32]:
artist_df.to_csv('artist_features.csv')

## Track-Features Table

Dataset 3: Rows of track_ids, columns of artist numerical features

In [63]:
track_df = word_bin_df[word_bin_df.columns[:20]]
track_df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,0.442,0.632,1,-6.77,1,0.0295,0.426,0.00419,0.0735,0.196,78.899,4,acoustic
1,1KHdq8NK9QxnGjdXb55NiG,Landon Pigg,The Boy Who Never,Falling in Love at a Coffee Shop,58,244986,False,0.489,0.561,1,-7.933,1,0.0274,0.2,4.6e-05,0.179,0.238,83.457,3,acoustic
2,2qLMf6TuEC3ruGJg4SMMN6,Jason Mraz;Colbie Caillat,We Sing. We Dance. We Steal Things.,Lucky,68,189613,False,0.625,0.414,4,-8.7,1,0.0369,0.294,0.0,0.151,0.669,130.088,4,acoustic
3,3S0OXQeoh0w6AY8WQVckRW,Jason Mraz,We Sing. We Dance. We Steal Things.,I'm Yours,75,242946,False,0.703,1.0,14,-9.331,3,0.0417,0.559,0.0,0.0973,0.712,150.96,4,acoustic
4,5TvE3pk05pyFIGdSY9j4DJ,A Great Big World;Christina Aguilera,Is There Anybody Out There? - Track by Track C...,Say Something,70,229400,False,0.407,1.0,6,-8.822,1,0.0355,0.857,3e-06,0.0913,0.0765,141.284,3,acoustic


In [64]:
track_df.to_csv('track_features.csv')