In [16]:
import pandas as pd
pd.set_option('display.max_columns', 50)
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
from tqdm import tqdm

In [17]:
pop = pd.read_csv('data/pop_music.csv').drop('Unnamed: 0', axis=1)
print(pop.shape)
pop.head()

(36451, 22)


Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,track_id,artist_id,genre,is_pop
0,0.465,['Three Dog Night'],0.629,166707,0.376,0,0.0,7,0.235,-13.934,1,It Ain't Easy,26,1970-01-01,0.0337,76.848,0.347,1970,2hKxnhi2Eb3443AQbOnqNl,4FAEZeJcsYYBkNq2D3KGTV,"['album rock', 'art rock', 'blues rock', 'bril...",True
1,0.166,['Martha Reeves & The Vandellas'],0.417,176573,0.753,0,0.0,9,0.269,-10.204,0,I Should Be Proud,33,1970,0.126,178.408,0.794,1970,2nxSAQBvF6gDIwZmG6B9nO,1Pe5hlKMCTULjosqZ6KanP,"['brill building pop', 'classic girl group', '...",True
2,0.84,['Roberto Luti'],0.526,158387,0.501,0,0.0,0,0.139,-8.088,1,Celoso,39,1970-08-28,0.0367,113.546,0.762,1970,5FD1IZKwXVSL3zGwlNvLoF,33Cme6gMFRML19GrfipNMH,['pop romantico'],True
3,0.908,['Dionne Warwick'],0.691,179040,0.157,0,7e-06,1,0.0898,-13.711,1,Raindrops Keep Falling on My Head,36,1970,0.0475,89.844,0.378,1970,5o5fCEeIGC8igGmbXQZL8Q,2JSjCHK79gdaiPWdKiNUNp,"['adult standards', 'brill building pop', 'dis...",True
4,0.908,['Johnny Mathis'],0.609,149627,0.38,0,0.00152,5,0.0728,-10.527,1,Raindrops Keep Fallin' On My Head,31,1970-02-25,0.0343,109.514,0.612,1970,6BZaceyM6nWzq0AUFcaEX1,21LGsW7bziR4Ledx7WZ1Wf,"['adult standards', 'brill building pop', 'eas...",True


# Preprocessing #

In [18]:
# create new data frame with only necessary features for modeling
pop_model = pop.drop(['artists', 'name', 'popularity', 'release_date', 'year', 'artist_id', 'genre', 'is_pop'], axis=1)
pop_model.set_index('track_id', inplace=True)
print(pop_model.shape)
pop_model.head()

(36451, 13)


Unnamed: 0_level_0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2hKxnhi2Eb3443AQbOnqNl,0.465,0.629,166707,0.376,0,0.0,7,0.235,-13.934,1,0.0337,76.848,0.347
2nxSAQBvF6gDIwZmG6B9nO,0.166,0.417,176573,0.753,0,0.0,9,0.269,-10.204,0,0.126,178.408,0.794
5FD1IZKwXVSL3zGwlNvLoF,0.84,0.526,158387,0.501,0,0.0,0,0.139,-8.088,1,0.0367,113.546,0.762
5o5fCEeIGC8igGmbXQZL8Q,0.908,0.691,179040,0.157,0,7e-06,1,0.0898,-13.711,1,0.0475,89.844,0.378
6BZaceyM6nWzq0AUFcaEX1,0.908,0.609,149627,0.38,0,0.00152,5,0.0728,-10.527,1,0.0343,109.514,0.612


In [19]:
# create list for different types of features for access later
num_features = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness',
                'speechiness', 'tempo', 'valence']
cat_features = ['explicit', 'mode', 'key']

### Using Gower Distance ###

Because the data contains a mixture of both numerical and categorical features, we will be using Gower Distance to calculate the similarity between two observations. Gower distance is on the range of 0 (identical) to 1 (maximally different). This will be very useful for including the categorical variables in the model.

In [20]:
import gower

In [6]:
# return the top 10 songs that are most similar to 'Hollaback Girl' by Gwen Stefani
sd = gower.gower_topn(pop_model.iloc[18544:18545,:], pop_model.iloc[:,:], n = 10)

# print the mean values for the features for these tracks
print(pop_model.iloc[sd['index']].describe().loc[['mean']])
# DataFrame that lists the names of these tracks
pop.iloc[sd['index']]

      acousticness  danceability  duration_ms  energy  explicit  \
mean       0.24402        0.8669     225546.3    0.82       1.0   

      instrumentalness   key  liveness  loudness  mode  speechiness     tempo  \
mean          0.000019  10.1   0.08532   -3.9557   0.0      0.14992  103.8573   

      valence  
mean   0.8087  


Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,track_id,artist_id,genre,is_pop
18544,0.35,['Gwen Stefani'],0.926,199853,0.916,1,6e-06,10,0.0234,-2.221,0,Hollaback Girl,68,2004-11-23,0.0929,110.007,0.904,2004,0LzrhCZFXW94Y8nwtTuRlw,4yiQZ8tQPux8cPriYMWUFP,"['dance pop', 'europop', 'hip pop', 'pop', 'po...",True
18440,0.206,['Nelly'],0.956,228240,0.745,1,0.0,11,0.0615,-4.753,0,Hot In Herre,72,2002-06-25,0.12,107.075,0.912,2002,04KTF78FFg8sOHC1BADqbY,2gBjLmx6zQnFGQJCAQpRgw,"['dance pop', 'hip hop', 'pop', 'pop rap', 'ra...",True
15065,0.29,"['Clipse', 'Pharrell Williams']",0.705,261213,0.837,1,0.0,10,0.0571,-4.64,0,I'm Good (feat. Pharrell Williams),42,2009-12-08,0.0962,82.022,0.938,2009,4dna9pp213lAcnSrbj3xGn,2J257euzcjnDLipsyJH3F2,"['alternative hip hop', 'gangster rap', 'hardc...",True
25487,0.304,['A$AP Rocky'],0.897,187105,0.743,1,4.4e-05,10,0.137,-5.446,0,Babushka Boi,74,2019-08-28,0.222,134.979,0.905,2019,643PW82aBMUa1FiWi5VQY7,13ubrt8QOOCPljQ2FL1Kca,"['east coast hip hop', 'hip hop', 'pop rap', '...",True
19088,0.096,"['Travis Porter', 'Tyga']",0.905,211293,0.782,1,0.0,10,0.0623,-3.604,0,Ayy Ladies (feat. Tyga),71,2012-05-28,0.164,96.055,0.798,2012,4P6BuLsqtg5uISdE77ypI9,6z1cicLMt9XArxN10q7m8a,"['atl hip hop', 'dirty south rap', 'pop rap', ...",True
6281,0.251,"['Pharrell Williams', 'Kanye West']",0.845,236573,0.801,1,0.0,10,0.0951,-5.619,0,Number One,48,2006-01-01,0.129,107.012,0.666,2006,3r4U9UpHaFurMtq1oCZoAp,2RdwBSPQiwcmiDo9kixcl8,"['pop', 'pop rap']",True
24790,0.351,['DMX'],0.904,244013,0.728,1,0.000137,10,0.146,-4.773,0,How's It Goin' Down,54,2007-06-12,0.275,90.458,0.847,2007,4tkSJRlbhuVxYjvuIQaMcj,1HwM5zlC5qNWhJtM00yXzG,"['east coast hip hop', 'gangster rap', 'hardco...",True
4902,0.295,['Logic'],0.762,213013,0.919,1,0.0,9,0.126,-3.378,0,Now,52,2014-10-21,0.069,109.971,0.603,2014,4iEWnkDewJ72copeODWuOw,4xRYI6VqpkE3UwrDrAZL8L,"['conscious hip hop', 'dmv rap', 'hip hop', 'p...",True
28733,0.233,"['Lil Jon', 'LMFAO']",0.876,250747,0.984,1,0.0,10,0.0505,-2.018,0,Outta Your Mind,56,2010-01-01,0.274,99.976,0.642,2010,1Oenqmtbzt331Pgv0ODfS2,7sfl4Xt5KmfyDs2T3SVSMK,"['atl hip hop', 'crunk', 'dance pop', 'dirty s...",True
21536,0.0642,"['Calvin Harris', 'Pharrell Williams', 'Katy P...",0.893,223413,0.745,1,0.0,11,0.0943,-3.105,0,"Feels (feat. Pharrell Williams, Katy Perry & B...",77,2017-06-30,0.0571,101.018,0.872,2017,5bcTCxgc7xVfSaMV3RuVke,7CajNmpbOovFoOoasH2HaY,"['dance pop', 'edm', 'electro house', 'house',...",True


In [7]:
# return the top 10 songs that are most similar to 'Electric Feel' by MGMT
sd = gower.gower_topn(pop_model.iloc[18717:18718,:], pop_model.iloc[:,:], n = 10)

# print the mean values for the features for these tracks
print(pop_model.iloc[sd['index']].describe().loc[['mean']])
# DataFrame that lists the names of these tracks
pop.iloc[sd['index']]

      acousticness  danceability  duration_ms  energy  explicit  \
mean      0.032594        0.7479     242834.3    0.75       0.0   

      instrumentalness  key  liveness  loudness  mode  speechiness    tempo  \
mean           0.11365  0.9    0.3143   -6.6027   1.0      0.04879  101.484   

      valence  
mean   0.5964  


Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,track_id,artist_id,genre,is_pop
18717,0.0714,['MGMT'],0.763,229640,0.807,0,0.28,1,0.348,-3.714,1,Electric Feel,78,2007-12-14,0.035,103.038,0.559,2007,3FtYbEfBqAlGO46NUDQSAt,0SwO7SWeDHJijQ3XNS7xEE,"['alternative dance', 'indie pop', 'indie rock...",True
27056,0.0154,['Ariana Grande'],0.725,197600,0.796,0,0.0,1,0.354,-3.684,1,Baby I,56,2013-01-01,0.0326,101.976,0.517,2013,6EIsMa5lbvljYxqCkjZVDi,66CXWjxzNUsdJxJ2JdwvnR,"['dance pop', 'pop', 'post-teen pop']",True
27224,0.0147,['Glass Animals'],0.699,281407,0.67,0,0.22,1,0.356,-6.219,1,Life Itself,63,2016-08-26,0.0347,111.988,0.556,2016,32zkKx35Et6A515oZKxDkD,4yvcSjfu4PC0CYQyLy4wSq,"['gauze pop', 'indietronica', 'modern rock', '...",True
21131,0.0308,"['Hot Chelle Rae', 'New Boyz']",0.742,188200,0.784,0,0.0,1,0.354,-5.458,1,I Like It Like That (feat. New Boyz),65,2011-11-25,0.0552,101.002,0.65,2011,6eFahAdQgABBj1XOM99cBG,6jTnHxhb6cDCaCu4rdvsQ0,"['dance pop', 'neo mellow', 'neon pop punk', '...",True
2853,0.0502,['Lady Gaga'],0.752,263360,0.84,0,1e-06,1,0.227,-5.047,1,Paper Gangsta,46,2008-01-01,0.0786,98.969,0.574,2008,0LtZYj18X8mpZ2M6A7nA8t,1HY2Jd0NmPuamShAr6KMms,"['dance pop', 'pop']",True
18209,0.0202,['Sugar Ray'],0.835,244640,0.648,0,0.225,1,0.308,-8.15,1,Fly,65,1997-06-20,0.0488,99.701,0.674,1997,3uPfVXcjnpOjyzI3jb3js4,4uN3DsfENc7dp0OLO0FEIb,"['alternative metal', 'alternative rock', 'fun...",True
26634,0.0235,['Sugar Ray'],0.836,244560,0.644,0,0.194,1,0.297,-8.11,1,Fly,52,2005-12-20,0.0522,99.721,0.652,2005,5fuGP7Dze9tkadS9WoPbar,4uN3DsfENc7dp0OLO0FEIb,"['alternative metal', 'alternative rock', 'fun...",True
24131,0.0814,['SNAP!'],0.757,341963,0.708,0,0.206,1,0.224,-15.232,1,"The Power - 12"" Version",52,1990,0.0455,108.928,0.554,1990,0c4s0e7Wi7EOzfjr7XNIwz,2FrKQPjJe4pVMZOgm0ESOx,"['diva house', 'eurodance', 'europop', 'german...",True
6909,0.000245,"['Ed Sheeran', 'Chris Stapleton', 'Bruno Mars']",0.631,209120,0.796,0,0.0115,1,0.38,-5.35,1,BLOW (with Chris Stapleton & Bruno Mars),69,2019-07-05,0.0401,92.469,0.596,2019,6EbduYTr62sSzvl36wWiFM,6eUKZXaKkcviH0Ku9w2n3V,"['pop', 'uk pop']",True
23123,0.0181,"['Justin Bieber', 'Jaden']",0.739,227853,0.807,0,0.0,0,0.295,-5.063,1,Never Say Never,69,2011-01-01,0.0652,97.048,0.632,2011,5GYbkDveRD2I8M5ZJ14hWn,1uNFoZAHBGtllmzznpCI3s,"['canadian pop', 'pop', 'post-teen pop']",True


After verifying these results by listening to the tracks, gower distance is a good measure of distance to use.

### Calcuate Matrix of Similarities Between All Tracks ###

In [8]:
# create a list of cat_features indicators
cat_ind = [True if x in cat_features else False for x in pop_model.columns]

gdf = gower.gower_matrix(pop_model, cat_features=cat_ind)

In [20]:
gdf_model = pd.DataFrame(gdf, index=pop_model.index)
print(gdf_model.shape)
gdf_model.head()

(36451, 36451)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,...,36426,36427,36428,36429,36430,36431,36432,36433,36434,36435,36436,36437,36438,36439,36440,36441,36442,36443,36444,36445,36446,36447,36448,36449,36450
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
2hKxnhi2Eb3443AQbOnqNl,0.0,0.318137,0.19026,0.154133,0.166769,0.181071,0.200302,0.182654,0.285563,0.178499,0.179381,0.099288,0.247378,0.211589,0.275625,0.105691,0.150809,0.245769,0.269703,0.133423,0.167526,0.140402,0.2207,0.204026,0.139491,...,0.219884,0.11993,0.145708,0.201619,0.172243,0.167616,0.364336,0.31634,0.330993,0.399585,0.184455,0.24842,0.364312,0.310168,0.317192,0.278278,0.157054,0.134117,0.417235,0.187655,0.254883,0.259855,0.087099,0.192624,0.144887
2nxSAQBvF6gDIwZmG6B9nO,0.318137,0.0,0.285791,0.376135,0.322953,0.262695,0.359025,0.297781,0.197825,0.213617,0.214328,0.322283,0.216653,0.280581,0.234321,0.268567,0.222398,0.296164,0.280324,0.267091,0.307812,0.276694,0.361751,0.230545,0.420548,...,0.231186,0.335621,0.297543,0.384401,0.268204,0.338057,0.255465,0.193491,0.291956,0.233176,0.249471,0.214301,0.270365,0.370385,0.356578,0.187212,0.305455,0.217161,0.258285,0.261984,0.29649,0.362888,0.295891,0.251158,0.29226
5FD1IZKwXVSL3zGwlNvLoF,0.19026,0.285791,0.0,0.177559,0.12184,0.15154,0.183514,0.050899,0.260644,0.214327,0.191458,0.122024,0.232669,0.245654,0.198692,0.2074,0.199922,0.272331,0.270165,0.150099,0.183621,0.191044,0.213177,0.072795,0.23835,...,0.20521,0.206402,0.187215,0.166264,0.186243,0.194684,0.336977,0.201502,0.337658,0.362718,0.207924,0.101529,0.387745,0.299675,0.337173,0.280025,0.114712,0.235124,0.384617,0.200402,0.376386,0.230142,0.166541,0.212572,0.113681
5o5fCEeIGC8igGmbXQZL8Q,0.154133,0.376135,0.177559,0.0,0.136289,0.217068,0.14548,0.157355,0.266697,0.23665,0.229464,0.152851,0.258652,0.222938,0.261645,0.237329,0.193632,0.215215,0.235955,0.190081,0.111255,0.185194,0.175165,0.236647,0.166942,...,0.259546,0.154454,0.172679,0.237689,0.130795,0.215111,0.389808,0.341288,0.362838,0.434126,0.225895,0.251881,0.413174,0.258192,0.317121,0.31238,0.197843,0.247111,0.378224,0.223334,0.366739,0.302505,0.195651,0.170708,0.194413
6BZaceyM6nWzq0AUFcaEX1,0.166769,0.322953,0.12184,0.136289,0.0,0.171277,0.089381,0.125636,0.249391,0.216901,0.21,0.107462,0.23727,0.229925,0.218925,0.199799,0.189995,0.252812,0.18697,0.168858,0.144752,0.107802,0.192878,0.181544,0.207853,...,0.222117,0.172853,0.167104,0.224464,0.180992,0.206575,0.347236,0.299211,0.339936,0.393707,0.20891,0.19579,0.406034,0.297222,0.322903,0.279194,0.18027,0.230234,0.415719,0.203587,0.3779,0.302954,0.178595,0.220434,0.191961


The result is a matrix of similarity values for all tracks in the original dataframe. This will be used to cluster the tracks. Clustering was done in another notebook on Google Cloud for the added memory and processing power. The modeling process can be found in the KMedoids_Models notebook.

# Modeling Analysis #

Below are the results of the best model from the KMedoids_Models notebook.

In [52]:
import pickle 
results = pickle.load(open("data/model_results.p", "rb"))
results

{'k': array([3, 1, 3, ..., 3, 2, 3]),
 'medoids': [33715, 31645, 25321, 16834],
 'v_ratio': 10483.229974915155,
 's_score': 0.22875765}