# kMeans in DataBricks
## First, a test with example code
using procedure from here: https://stackoverflow.com/questions/47585723/kmeans-clustering-in-pyspark  
note also DataBricks ml-docs at http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.clustering.KMeans

In [0]:
df = spark.createDataFrame([[0, 33.3, -17.5],
                            [1, 40.4, -20.5],
                            [2, 28.0, -23.9],
                            [3, 29.5, -19.0],
                            [4, 32.8, -18.8]
                           ], ['other', 'lat', 'long'])
df.show()

In [0]:
from pyspark.ml.feature import VectorAssembler

new_df = VectorAssembler(inputCols=['lat', 'long'], outputCol='features').transform(df)
new_df.display()

other,lat,long,features
0,33.3,-17.5,"List(1, 2, List(), List(33.3, -17.5))"
1,40.4,-20.5,"List(1, 2, List(), List(40.4, -20.5))"
2,28.0,-23.9,"List(1, 2, List(), List(28.0, -23.9))"
3,29.5,-19.0,"List(1, 2, List(), List(29.5, -19.0))"
4,32.8,-18.8,"List(1, 2, List(), List(32.8, -18.8))"


In [0]:
from pyspark.ml.clustering import KMeans

# Note we say how many clusters we want here: k=2
model = KMeans(k=2, seed=1).fit(new_df)

In [0]:
transformed = model.transform(new_df)
transformed.display()

other,lat,long,features,prediction
0,33.3,-17.5,"List(1, 2, List(), List(33.3, -17.5))",0
1,40.4,-20.5,"List(1, 2, List(), List(40.4, -20.5))",0
2,28.0,-23.9,"List(1, 2, List(), List(28.0, -23.9))",1
3,29.5,-19.0,"List(1, 2, List(), List(29.5, -19.0))",1
4,32.8,-18.8,"List(1, 2, List(), List(32.8, -18.8))",0


## Now with our data!

In [0]:
# Load data from our AWS RDS database

for line in map(lambda line: line.split('=', 2), sc.textFile('/FileStore/rds.txt').collect()):
  (len(line) > 1) and globals().update({line[0]: line[1]})

df_joined = spark.read.jdbc(table='joined',
  url=f'jdbc:mysql://{rds_host}/{rds_db}?user={rds_user}&password={rds_pwd}')

print(end=f'\nLoaded df_joined ({df_joined.count():,} rows): '); df_joined.printSchema()

In [0]:
# Show me some data just for info

df_joined.display()

track_id,artist_name,track_name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
2b8fOow8UzyDFAE27YhOZM,Maroon 5,Memories,100,0.764,0.32,11,-7.209,1,0.0546,0.8370000000000001,0.0,0.0822,0.575,91.019,189486,4
21jGcNKet2qwijlDFuPiPb,Post Malone,Circles,99,0.695,0.762,0,-3.497,1,0.0395,0.192,0.00244,0.0863,0.5529999999999999,120.042,215280,4
3eekarcy7kvN4yt5ZFzltW,Travis Scott,HIGHEST IN THE ROOM,98,0.598,0.427,7,-8.764,0,0.0317,0.0546,5.83e-06,0.21,0.0605,76.469,175721,4
3ZCTVFBt2Brf31RLEnCkWJ,Billie Eilish,everything i wanted,98,0.7040000000000001,0.225,6,-14.454,0,0.0994,0.902,0.657,0.106,0.243,120.006,245426,4
2XU0oxnq2qxCpomAAuJY8K,Tones and I,Dance Monkey,98,0.8240000000000001,0.588,6,-6.4,0,0.0924,0.6920000000000001,0.000104,0.149,0.513,98.027,209438,4
1HfMVBKM75vxSfsQ5VefZ5,Selena Gomez,Lose You To Love Me,98,0.505,0.34,4,-9.005,1,0.0438,0.5760000000000001,0.0,0.21,0.0916,101.993,206459,4
7aiClxsDWFRQ0Kzk5KI5ku,blackbear,hot girl bummer,97,0.778,0.5589999999999999,6,-7.109,0,0.0776,0.128,0.0,0.3989999999999999,0.6779999999999999,129.989,185093,1
6cy3ki60hLwimwIje7tALf,The Black Eyed Peas,RITMO (Bad Boys For Life),97,0.721,0.716,10,-7.037000000000001,0,0.0657,0.0334,0.00084,0.237,0.667,104.994,221714,4
6WrI0LAC5M1Rw2MnX2ZvEg,Dua Lipa,Don't Start Now,97,0.794,0.7929999999999999,11,-4.521,0,0.0842,0.0125,0.0,0.0952,0.677,123.941,183290,4
696DnlkuDOXcMAnKlTgXXK,Arizona Zervas,ROXANNE,97,0.621,0.601,6,-5.6160000000000005,0,0.148,0.0522,0.0,0.46,0.457,116.735,163636,5


In [0]:
# And some statistics?

df_joined.summary().toPandas().set_index('summary').rename_axis('column').T.astype({'count':int})

column,count,mean,stddev,min,25%,50%,75%,max
track_id,15626,,,001UkMQHw4zXfFNdKpwXAF,,,,7zzm71Fx6YaLDv6zkEBP6S
artist_name,15626,2371.923076923077,1251.4178932673703,$NOT,3030.0,3030.0,3030.0,須田景凪
track_name,15626,inf,,!,21.0,239.0,1997.0,달라달라 (DALLA DALLA)
popularity,15626,49.09823371304237,24.273115612133985,0,43.0,58.0,65.0,100
danceability,15626,0.5945621720209918,0.1959560776402525,0.0,0.4579999999999999,0.622,0.745,0.983
energy,15626,0.5345365907461905,0.2719488952746124,2.02E-5,0.337,0.588,0.75,1.0
key,15626,5.183348265710994,3.59410813165091,0,2.0,5.0,8.0,11
loudness,15626,-10.063024638423146,7.372410034467787,-45.136,-11.838,-7.166,-5.222,2.036
mode,15626,0.6332394726737489,0.4819357911020608,0,0.0,1.0,1.0,1
speechiness,15626,0.1097870664277483,0.1182884020001622,0.0,0.0399,0.0562,0.125,0.951


In [0]:
# Assemble a list of feature columns

feature_columns = [_ for _ in df_joined.columns if _ not in ['track_id', 'artist_name', 'track_name']]

print(feature_columns)

In [0]:
# Compute the vector column need for pySpark-kMeans

from pyspark.ml.feature import VectorAssembler
if ('features' in df_joined.columns): df_joined = df_joined.drop('features')

df_joined = VectorAssembler(inputCols=feature_columns, outputCol='features').transform(df_joined)

df_joined.display()

track_id,artist_name,track_name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,features
2b8fOow8UzyDFAE27YhOZM,Maroon 5,Memories,100,0.764,0.32,11,-7.209,1,0.0546,0.8370000000000001,0.0,0.0822,0.575,91.019,189486,4,"List(1, 14, List(), List(100.0, 0.764, 0.32, 11.0, -7.209, 1.0, 0.0546, 0.8370000000000001, 0.0, 0.0822, 0.575, 91.01899999999999, 189486.0, 4.0))"
21jGcNKet2qwijlDFuPiPb,Post Malone,Circles,99,0.695,0.762,0,-3.497,1,0.0395,0.192,0.00244,0.0863,0.5529999999999999,120.042,215280,4,"List(1, 14, List(), List(99.0, 0.695, 0.762, 0.0, -3.497, 1.0, 0.0395, 0.192, 0.00244, 0.0863, 0.5529999999999999, 120.042, 215280.0, 4.0))"
3eekarcy7kvN4yt5ZFzltW,Travis Scott,HIGHEST IN THE ROOM,98,0.598,0.427,7,-8.764,0,0.0317,0.0546,5.83e-06,0.21,0.0605,76.469,175721,4,"List(1, 14, List(), List(98.0, 0.598, 0.42700000000000005, 7.0, -8.764, 0.0, 0.0317, 0.0546, 5.83E-6, 0.21, 0.0605, 76.469, 175721.0, 4.0))"
3ZCTVFBt2Brf31RLEnCkWJ,Billie Eilish,everything i wanted,98,0.7040000000000001,0.225,6,-14.454,0,0.0994,0.902,0.657,0.106,0.243,120.006,245426,4,"List(1, 14, List(), List(98.0, 0.7040000000000001, 0.225, 6.0, -14.454, 0.0, 0.0994, 0.902, 0.657, 0.106, 0.243, 120.006, 245426.0, 4.0))"
2XU0oxnq2qxCpomAAuJY8K,Tones and I,Dance Monkey,98,0.8240000000000001,0.588,6,-6.4,0,0.0924,0.6920000000000001,0.000104,0.149,0.513,98.027,209438,4,"List(1, 14, List(), List(98.0, 0.8240000000000001, 0.588, 6.0, -6.4, 0.0, 0.0924, 0.6920000000000001, 1.0400000000000001E-4, 0.149, 0.513, 98.027, 209438.0, 4.0))"
1HfMVBKM75vxSfsQ5VefZ5,Selena Gomez,Lose You To Love Me,98,0.505,0.34,4,-9.005,1,0.0438,0.5760000000000001,0.0,0.21,0.0916,101.993,206459,4,"List(1, 14, List(), List(98.0, 0.505, 0.34, 4.0, -9.005, 1.0, 0.0438, 0.5760000000000001, 0.0, 0.21, 0.0916, 101.993, 206459.0, 4.0))"
7aiClxsDWFRQ0Kzk5KI5ku,blackbear,hot girl bummer,97,0.778,0.5589999999999999,6,-7.109,0,0.0776,0.128,0.0,0.3989999999999999,0.6779999999999999,129.989,185093,1,"List(1, 14, List(), List(97.0, 0.778, 0.5589999999999999, 6.0, -7.109, 0.0, 0.0776, 0.128, 0.0, 0.39899999999999997, 0.6779999999999999, 129.989, 185093.0, 1.0))"
6cy3ki60hLwimwIje7tALf,The Black Eyed Peas,RITMO (Bad Boys For Life),97,0.721,0.716,10,-7.037000000000001,0,0.0657,0.0334,0.00084,0.237,0.667,104.994,221714,4,"List(1, 14, List(), List(97.0, 0.721, 0.716, 10.0, -7.037000000000001, 0.0, 0.0657, 0.0334, 8.4E-4, 0.237, 0.667, 104.994, 221714.0, 4.0))"
6WrI0LAC5M1Rw2MnX2ZvEg,Dua Lipa,Don't Start Now,97,0.794,0.7929999999999999,11,-4.521,0,0.0842,0.0125,0.0,0.0952,0.677,123.941,183290,4,"List(1, 14, List(), List(97.0, 0.794, 0.7929999999999999, 11.0, -4.521, 0.0, 0.0842, 0.0125, 0.0, 0.0952, 0.677, 123.941, 183290.0, 4.0))"
696DnlkuDOXcMAnKlTgXXK,Arizona Zervas,ROXANNE,97,0.621,0.601,6,-5.6160000000000005,0,0.148,0.0522,0.0,0.46,0.457,116.735,163636,5,"List(1, 14, List(), List(97.0, 0.621, 0.601, 6.0, -5.6160000000000005, 0.0, 0.14800000000000002, 0.0522, 0.0, 0.46, 0.457, 116.735, 163636.0, 5.0))"


In [0]:
# Scale the features to value range 0..1

from pyspark.ml.feature import MinMaxScaler
if ('scaledFeatures' in df_joined.columns): df_joined = df_joined.drop('scaledFeatures')

df_joined = MinMaxScaler(inputCol='features', outputCol='scaledFeatures') \
  .fit(df_joined).transform(df_joined)

df_joined.display()

track_id,artist_name,track_name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,features,scaledFeatures
2b8fOow8UzyDFAE27YhOZM,Maroon 5,Memories,100,0.764,0.32,11,-7.209,1,0.0546,0.8370000000000001,0.0,0.0822,0.575,91.019,189486,4,"List(1, 14, List(), List(100.0, 0.764, 0.32, 11.0, -7.209, 1.0, 0.0546, 0.8370000000000001, 0.0, 0.0822, 0.575, 91.01899999999999, 189486.0, 4.0))","List(1, 14, List(), List(1.0, 0.7772126144455749, 0.3199862637225272, 1.0, 0.8040150936996524, 1.0, 0.057413249211356474, 0.8403614457831327, 0.0, 0.0822, 0.5813953488372091, 0.3856769972626886, 0.11948836433408548, 0.8))"
21jGcNKet2qwijlDFuPiPb,Post Malone,Circles,99,0.695,0.762,0,-3.497,1,0.0395,0.192,0.00244,0.0863,0.5529999999999999,120.042,215280,4,"List(1, 14, List(), List(99.0, 0.695, 0.762, 0.0, -3.497, 1.0, 0.0395, 0.192, 0.00244, 0.0863, 0.5529999999999999, 120.042, 215280.0, 4.0))","List(1, 14, List(), List(0.99, 0.7070193285859614, 0.7619951923028845, 0.0, 0.8827058424489104, 1.0, 0.041535226077812834, 0.1927710843373494, 0.0024448897795591178, 0.0863, 0.5591506572295246, 0.5086568530241782, 0.13690893876594473, 0.8))"
3eekarcy7kvN4yt5ZFzltW,Travis Scott,HIGHEST IN THE ROOM,98,0.598,0.427,7,-8.764,0,0.0317,0.0546,5.83e-06,0.21,0.0605,76.469,175721,4,"List(1, 14, List(), List(98.0, 0.598, 0.42700000000000005, 7.0, -8.764, 0.0, 0.0317, 0.0546, 5.83E-6, 0.21, 0.0605, 76.469, 175721.0, 4.0))","List(1, 14, List(), List(0.98, 0.6083418107833164, 0.4269884251661884, 0.6363636363636364, 0.7710506232510811, 0.0, 0.03333333333333333, 0.05481927710843374, 5.8416833667334665E-6, 0.21, 0.06117290192113244, 0.3240239324062068, 0.11019185324412105, 0.8))"
3ZCTVFBt2Brf31RLEnCkWJ,Billie Eilish,everything i wanted,98,0.7040000000000001,0.225,6,-14.454,0,0.0994,0.902,0.657,0.106,0.243,120.006,245426,4,"List(1, 14, List(), List(98.0, 0.7040000000000001, 0.225, 6.0, -14.454, 0.0, 0.0994, 0.902, 0.657, 0.106, 0.243, 120.006, 245426.0, 4.0))","List(1, 14, List(), List(0.98, 0.7161749745676502, 0.22498434468376263, 0.5454545454545454, 0.650428220130586, 0.0, 0.10452155625657204, 0.9056224899598394, 0.658316633266533, 0.106, 0.24570273003033363, 0.5085043093585538, 0.1572687370454992, 0.8))"
2XU0oxnq2qxCpomAAuJY8K,Tones and I,Dance Monkey,98,0.8240000000000001,0.588,6,-6.4,0,0.0924,0.6920000000000001,0.000104,0.149,0.513,98.027,209438,4,"List(1, 14, List(), List(98.0, 0.8240000000000001, 0.588, 6.0, -6.4, 0.0, 0.0924, 0.6920000000000001, 1.0400000000000001E-4, 0.149, 0.513, 98.027, 209438.0, 4.0))","List(1, 14, List(), List(0.98, 0.8382502543234996, 0.5879916774318841, 0.5454545454545454, 0.8211650979394557, 0.0, 0.0971608832807571, 0.6947791164658635, 1.0420841683366733E-4, 0.149, 0.518705763397371, 0.4153721641708828, 0.13296340895936484, 0.8))"
1HfMVBKM75vxSfsQ5VefZ5,Selena Gomez,Lose You To Love Me,98,0.505,0.34,4,-9.005,1,0.0438,0.5760000000000001,0.0,0.21,0.0916,101.993,206459,4,"List(1, 14, List(), List(98.0, 0.505, 0.34, 4.0, -9.005, 1.0, 0.0438, 0.5760000000000001, 0.0, 0.21, 0.0916, 101.993, 206459.0, 4.0))","List(1, 14, List(), List(0.98, 0.5137334689725331, 0.3399866677306882, 0.36363636363636365, 0.7659416603069618, 1.0, 0.046056782334384865, 0.5783132530120483, 0.0, 0.21, 0.09261880687563194, 0.4321773913338248, 0.13095147241472233, 0.8))"
7aiClxsDWFRQ0Kzk5KI5ku,blackbear,hot girl bummer,97,0.778,0.5589999999999999,6,-7.109,0,0.0776,0.128,0.0,0.3989999999999999,0.6779999999999999,129.989,185093,1,"List(1, 14, List(), List(97.0, 0.778, 0.5589999999999999, 6.0, -7.109, 0.0, 0.0776, 0.128, 0.0, 0.39899999999999997, 0.6779999999999999, 129.989, 185093.0, 1.0))","List(1, 14, List(), List(0.97, 0.7914547304170906, 0.5589910916200507, 0.5454545454545454, 0.8061349953362164, 0.0, 0.08159831756046268, 0.12851405622489961, 0.0, 0.39899999999999997, 0.6855409504550048, 0.5508055153009771, 0.11652145018819272, 0.2))"
6cy3ki60hLwimwIje7tALf,The Black Eyed Peas,RITMO (Bad Boys For Life),97,0.721,0.716,10,-7.037000000000001,0,0.0657,0.0334,0.00084,0.237,0.667,104.994,221714,4,"List(1, 14, List(), List(97.0, 0.721, 0.716, 10.0, -7.037000000000001, 0.0, 0.0657, 0.0334, 8.4E-4, 0.237, 0.667, 104.994, 221714.0, 4.0))","List(1, 14, List(), List(0.97, 0.7334689725330621, 0.7159942630841143, 0.9090909090909092, 0.8076613245145425, 0.0, 0.0690851735015773, 0.03353413654618474, 8.416833667334669E-4, 0.237, 0.6744186046511627, 0.44489360079322704, 0.14125428946357138, 0.8))"
6WrI0LAC5M1Rw2MnX2ZvEg,Dua Lipa,Don't Start Now,97,0.794,0.7929999999999999,11,-4.521,0,0.0842,0.0125,0.0,0.0952,0.677,123.941,183290,4,"List(1, 14, List(), List(97.0, 0.794, 0.7929999999999999, 11.0, -4.521, 0.0, 0.0842, 0.0125, 0.0, 0.0952, 0.677, 123.941, 183290.0, 4.0))","List(1, 14, List(), List(0.97, 0.8077314343845372, 0.7929958185155339, 1.0, 0.8609980496904943, 0.0, 0.08853838065194533, 0.012550200803212853, 0.0, 0.0952, 0.6845298281092012, 0.5251781794760972, 0.11530375244062964, 0.8))"
696DnlkuDOXcMAnKlTgXXK,Arizona Zervas,ROXANNE,97,0.621,0.601,6,-5.6160000000000005,0,0.148,0.0522,0.0,0.46,0.457,116.735,163636,5,"List(1, 14, List(), List(97.0, 0.621, 0.601, 6.0, -5.6160000000000005, 0.0, 0.14800000000000002, 0.0522, 0.0, 0.46, 0.457, 116.735, 163636.0, 5.0))","List(1, 14, List(), List(0.97, 0.6317395727365209, 0.6009919400371887, 0.5454545454545454, 0.8377851267701178, 0.0, 0.1556256572029443, 0.052409638554216875, 0.0, 0.46, 0.46208291203235585, 0.49464402240696953, 0.10202996900712721, 1.0))"


In [0]:
# Perform a principal component analysis
# Note: Actually, with only 14 features, we opted to skip this step 

# from pyspark.ml.feature import PCA
# if ('pcaFeatures' in df_joined.columns): df_joined = df_joined.drop('pcaFeatures')
#
# import pandas as pd
# pd.DataFrame([sum(map(abs, _)) for _ in PCA(k=3, inputCol='scaledFeatures').fit(df_joined).pc.toArray()],
#   index=feature_columns).sort_values(0, ascending=False).rename(columns={0: 'PCA feature importance'})
#
# df_joined = PCA(k=10, inputCol='scaledFeatures', outputCol='pcaFeatures').fit(df_joined).transform(df_joined)
#
# df_joined.display()

In [0]:
# Initialize the kMeans model

from pyspark.ml.clustering import KMeans

model = KMeans(k=50, seed=42, featuresCol='features').fit(df_joined)

print(model, *model.extractParamMap().items(), sep='\n')

In [0]:
# Where are our cluster centers located?
# (These can be intepreted as something like feature-weights)

feature_weights = spark.createDataFrame(schema=['cluster', *feature_columns],
  data=[[i] + _.tolist() for i, _ in enumerate(model.clusterCenters())])

feature_weights.display()

cluster,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,51.019067796610166,0.6334671610169491,0.5174297796610172,5.271186440677966,-10.6143093220339,0.6165254237288136,0.1321875,0.4351521237288135,0.2096146715042373,0.1716144067796608,0.5195756779661019,123.90557838983052,139984.13983050847,3.836864406779661
1,13.40909090909091,0.3096909090909092,0.1998068181818181,5.727272727272728,-19.47540909090909,0.6363636363636364,0.0445590909090909,0.8272969545454546,0.6207977272727272,0.1688909090909091,0.1890181818181818,114.45718181818182,657678.9545454546,3.772727272727273
2,12.788732394366198,0.4407464788732392,0.1964882394366196,4.753521126760564,-23.08083802816901,0.795774647887324,0.1049042253521127,0.8553246732394364,0.6291991362676059,0.1876711267605633,0.5233330985915498,113.71642253521124,47783.42957746479,3.661971830985916
3,25.82828282828283,0.393939393939394,0.3389912121212119,5.343434343434344,-16.088777777777775,0.7474747474747475,0.0556454545454545,0.6491069686868687,0.4017554032323232,0.2167141414141412,0.3061747474747474,117.67563636363636,410562.393939394,3.8282828282828287
4,51.08931419457736,0.5844063795853273,0.5634974641148325,5.244019138755981,-9.05039074960127,0.6395534290271132,0.0961189792663475,0.3777385127910687,0.1100066389792664,0.2113972886762358,0.4628481658692189,119.12345295055827,229426.06698564597,3.9027113237639552
5,17.113636363636363,0.3537863636363637,0.1997372727272727,5.0227272727272725,-19.307409090909097,0.5909090909090909,0.0507636363636363,0.8101688636363633,0.6050410218181818,0.1537886363636363,0.2207954545454545,107.32747727272732,561123.4090909091,3.863636363636364
6,9.6,0.2698,0.1192,2.1,-21.7566,0.8,0.03876,0.9539,0.8008000000000002,0.11634,0.10652,109.4173,1038710.9,3.8
7,29.619909502262445,0.5335022624434392,0.3183603438914029,5.30316742081448,-16.738760180995477,0.6742081447963801,0.1188457013574661,0.68480243438914,0.4353766956561086,0.1628158371040724,0.4605321266968322,113.82868778280547,94655.371040724,3.7511312217194575
8,33.84065934065934,0.4417505494505494,0.3883847802197801,4.653846153846154,-14.006950549450552,0.6538461538461539,0.0685802197802197,0.5939697607142859,0.3211116706593404,0.2084532967032966,0.3336769230769232,112.3703021978021,332564.9615384616,3.895604395604396
9,55.77696078431372,0.6123860294117651,0.5903690686274511,5.08578431372549,-8.235870098039213,0.6311274509803921,0.1006591911764705,0.3337077044362749,0.0975262345588234,0.1960861519607841,0.4818344362745105,119.76491789215682,213437.4607843137,3.911764705882353


In [0]:
# Display those feature-weights as something like a heatmap
# (we need to normalize the weights for heatmap-display)

feature_weights = feature_weights.toPandas().set_index('cluster')

for feature in feature_columns: feature_min = feature_weights[feature].min(); \
  feature_weights[feature] = (feature_weights[feature] - feature_min) / (feature_weights[feature].max() - feature_min)

!pip install jinja2
display(feature_weights.style.background_gradient(cmap='coolwarm', vmin=0, vmax=1) \
        .format('{:.3%}').set_properties(**{'width':'111px', 'text-align':'center'}))

Unnamed: 0_level_0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,91.060%,93.953%,78.392%,60.259%,81.424%,9.524%,55.091%,19.930%,16.805%,49.650%,86.740%,99.555%,8.072%,67.373%
1,23.933%,19.662%,15.867%,68.660%,29.358%,14.205%,3.636%,79.042%,67.330%,47.840%,23.913%,78.842%,43.947%,54.545%
2,22.826%,49.733%,15.214%,50.723%,8.173%,51.816%,39.070%,83.267%,68.363%,60.325%,87.454%,77.218%,1.683%,32.394%
3,46.099%,38.993%,43.266%,61.590%,49.257%,40.420%,10.146%,52.182%,40.415%,79.634%,46.180%,85.897%,26.822%,65.657%
4,91.185%,82.696%,87.460%,59.758%,90.614%,14.957%,33.912%,11.276%,4.565%,76.099%,75.958%,89.071%,14.270%,80.542%
5,30.545%,29.780%,15.854%,55.682%,30.345%,3.480%,7.279%,76.460%,65.394%,37.799%,29.953%,63.212%,37.256%,72.727%
6,17.134%,10.509%,0.000%,1.842%,15.954%,52.813%,0.231%,98.126%,89.449%,12.902%,8.233%,67.794%,70.351%,60.000%
7,52.866%,71.016%,39.205%,60.848%,45.438%,23.133%,47.257%,57.562%,44.546%,43.801%,75.518%,77.464%,4.931%,50.226%
8,60.399%,49.963%,52.989%,48.887%,61.490%,18.329%,17.741%,43.870%,30.505%,74.142%,51.407%,74.267%,21.417%,79.121%
9,99.552%,89.116%,92.750%,56.843%,95.400%,12.969%,36.578%,4.639%,3.032%,65.920%,79.567%,90.477%,13.162%,82.353%


In [0]:
# Do the kMeans cluster prediction

model.setPredictionCol('cluster')
if ('cluster' in df_joined.columns): df_joined = df_joined.drop('cluster')

df_joined = model.transform(df_joined)

df_joined.display()

track_id,artist_name,track_name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,features,scaledFeatures,cluster
2b8fOow8UzyDFAE27YhOZM,Maroon 5,Memories,100,0.764,0.32,11,-7.209,1,0.0546,0.8370000000000001,0.0,0.0822,0.575,91.019,189486,4,"List(1, 14, List(), List(100.0, 0.764, 0.32, 11.0, -7.209, 1.0, 0.0546, 0.8370000000000001, 0.0, 0.0822, 0.575, 91.01899999999999, 189486.0, 4.0))","List(1, 14, List(), List(1.0, 0.7772126144455749, 0.3199862637225272, 1.0, 0.8040150936996524, 1.0, 0.057413249211356474, 0.8403614457831327, 0.0, 0.0822, 0.5813953488372091, 0.3856769972626886, 0.11948836433408548, 0.8))",12
21jGcNKet2qwijlDFuPiPb,Post Malone,Circles,99,0.695,0.762,0,-3.497,1,0.0395,0.192,0.00244,0.0863,0.5529999999999999,120.042,215280,4,"List(1, 14, List(), List(99.0, 0.695, 0.762, 0.0, -3.497, 1.0, 0.0395, 0.192, 0.00244, 0.0863, 0.5529999999999999, 120.042, 215280.0, 4.0))","List(1, 14, List(), List(0.99, 0.7070193285859614, 0.7619951923028845, 0.0, 0.8827058424489104, 1.0, 0.041535226077812834, 0.1927710843373494, 0.0024448897795591178, 0.0863, 0.5591506572295246, 0.5086568530241782, 0.13690893876594473, 0.8))",9
3eekarcy7kvN4yt5ZFzltW,Travis Scott,HIGHEST IN THE ROOM,98,0.598,0.427,7,-8.764,0,0.0317,0.0546,5.83e-06,0.21,0.0605,76.469,175721,4,"List(1, 14, List(), List(98.0, 0.598, 0.42700000000000005, 7.0, -8.764, 0.0, 0.0317, 0.0546, 5.83E-6, 0.21, 0.0605, 76.469, 175721.0, 4.0))","List(1, 14, List(), List(0.98, 0.6083418107833164, 0.4269884251661884, 0.6363636363636364, 0.7710506232510811, 0.0, 0.03333333333333333, 0.05481927710843374, 5.8416833667334665E-6, 0.21, 0.06117290192113244, 0.3240239324062068, 0.11019185324412105, 0.8))",45
3ZCTVFBt2Brf31RLEnCkWJ,Billie Eilish,everything i wanted,98,0.7040000000000001,0.225,6,-14.454,0,0.0994,0.902,0.657,0.106,0.243,120.006,245426,4,"List(1, 14, List(), List(98.0, 0.7040000000000001, 0.225, 6.0, -14.454, 0.0, 0.0994, 0.902, 0.657, 0.106, 0.243, 120.006, 245426.0, 4.0))","List(1, 14, List(), List(0.98, 0.7161749745676502, 0.22498434468376263, 0.5454545454545454, 0.650428220130586, 0.0, 0.10452155625657204, 0.9056224899598394, 0.658316633266533, 0.106, 0.24570273003033363, 0.5085043093585538, 0.1572687370454992, 0.8))",28
2XU0oxnq2qxCpomAAuJY8K,Tones and I,Dance Monkey,98,0.8240000000000001,0.588,6,-6.4,0,0.0924,0.6920000000000001,0.000104,0.149,0.513,98.027,209438,4,"List(1, 14, List(), List(98.0, 0.8240000000000001, 0.588, 6.0, -6.4, 0.0, 0.0924, 0.6920000000000001, 1.0400000000000001E-4, 0.149, 0.513, 98.027, 209438.0, 4.0))","List(1, 14, List(), List(0.98, 0.8382502543234996, 0.5879916774318841, 0.5454545454545454, 0.8211650979394557, 0.0, 0.0971608832807571, 0.6947791164658635, 1.0420841683366733E-4, 0.149, 0.518705763397371, 0.4153721641708828, 0.13296340895936484, 0.8))",22
1HfMVBKM75vxSfsQ5VefZ5,Selena Gomez,Lose You To Love Me,98,0.505,0.34,4,-9.005,1,0.0438,0.5760000000000001,0.0,0.21,0.0916,101.993,206459,4,"List(1, 14, List(), List(98.0, 0.505, 0.34, 4.0, -9.005, 1.0, 0.0438, 0.5760000000000001, 0.0, 0.21, 0.0916, 101.993, 206459.0, 4.0))","List(1, 14, List(), List(0.98, 0.5137334689725331, 0.3399866677306882, 0.36363636363636365, 0.7659416603069618, 1.0, 0.046056782334384865, 0.5783132530120483, 0.0, 0.21, 0.09261880687563194, 0.4321773913338248, 0.13095147241472233, 0.8))",22
7aiClxsDWFRQ0Kzk5KI5ku,blackbear,hot girl bummer,97,0.778,0.5589999999999999,6,-7.109,0,0.0776,0.128,0.0,0.3989999999999999,0.6779999999999999,129.989,185093,1,"List(1, 14, List(), List(97.0, 0.778, 0.5589999999999999, 6.0, -7.109, 0.0, 0.0776, 0.128, 0.0, 0.39899999999999997, 0.6779999999999999, 129.989, 185093.0, 1.0))","List(1, 14, List(), List(0.97, 0.7914547304170906, 0.5589910916200507, 0.5454545454545454, 0.8061349953362164, 0.0, 0.08159831756046268, 0.12851405622489961, 0.0, 0.39899999999999997, 0.6855409504550048, 0.5508055153009771, 0.11652145018819272, 0.2))",24
6cy3ki60hLwimwIje7tALf,The Black Eyed Peas,RITMO (Bad Boys For Life),97,0.721,0.716,10,-7.037000000000001,0,0.0657,0.0334,0.00084,0.237,0.667,104.994,221714,4,"List(1, 14, List(), List(97.0, 0.721, 0.716, 10.0, -7.037000000000001, 0.0, 0.0657, 0.0334, 8.4E-4, 0.237, 0.667, 104.994, 221714.0, 4.0))","List(1, 14, List(), List(0.97, 0.7334689725330621, 0.7159942630841143, 0.9090909090909092, 0.8076613245145425, 0.0, 0.0690851735015773, 0.03353413654618474, 8.416833667334669E-4, 0.237, 0.6744186046511627, 0.44489360079322704, 0.14125428946357138, 0.8))",35
6WrI0LAC5M1Rw2MnX2ZvEg,Dua Lipa,Don't Start Now,97,0.794,0.7929999999999999,11,-4.521,0,0.0842,0.0125,0.0,0.0952,0.677,123.941,183290,4,"List(1, 14, List(), List(97.0, 0.794, 0.7929999999999999, 11.0, -4.521, 0.0, 0.0842, 0.0125, 0.0, 0.0952, 0.677, 123.941, 183290.0, 4.0))","List(1, 14, List(), List(0.97, 0.8077314343845372, 0.7929958185155339, 1.0, 0.8609980496904943, 0.0, 0.08853838065194533, 0.012550200803212853, 0.0, 0.0952, 0.6845298281092012, 0.5251781794760972, 0.11530375244062964, 0.8))",24
696DnlkuDOXcMAnKlTgXXK,Arizona Zervas,ROXANNE,97,0.621,0.601,6,-5.6160000000000005,0,0.148,0.0522,0.0,0.46,0.457,116.735,163636,5,"List(1, 14, List(), List(97.0, 0.621, 0.601, 6.0, -5.6160000000000005, 0.0, 0.14800000000000002, 0.0522, 0.0, 0.46, 0.457, 116.735, 163636.0, 5.0))","List(1, 14, List(), List(0.97, 0.6317395727365209, 0.6009919400371887, 0.5454545454545454, 0.8377851267701178, 0.0, 0.1556256572029443, 0.052409638554216875, 0.0, 0.46, 0.46208291203235585, 0.49464402240696953, 0.10202996900712721, 1.0))",15


In [0]:
# What sizes are the clusters we found?

cluster_counts = df_joined.groupBy('cluster').count()

cluster_counts.orderBy('count', ascending=False).display()

cluster,count
12,927
39,884
22,857
9,818
13,816
24,803
45,730
35,723
15,708
40,701


In [0]:
# What sizes are the clusters we found? (eye-candy)

cluster_counts.orderBy('cluster').display()

cluster,count
0,468
1,22
2,142
3,99
4,625
5,44
6,10
7,221
8,183
9,818


In [0]:
# Well, let's actually test our clustering. Choose a title we are coming from:

#df_joined.where("track_name like '%Rule Britannia%'").orderBy('artist_name', 'track_name').display()

source_track_id = '3OBr2Y0n4S0BWwA7SxKfwU'
source_details = df_joined.where(f"track_id == '{source_track_id}'")
source_cluster = source_details.select('cluster').head()[0]
source_details.drop(*feature_columns, 'features', 'scaledFeatures', 'pcaFeatures').display()

track_id,artist_name,track_name,cluster
3OBr2Y0n4S0BWwA7SxKfwU,Ludwig van Beethoven,"Beethoven: 5 Variations on ""Rule Britannia"" in D Major, WoO 79: Theme. Tempo moderato",2


In [0]:
# Which features are important for the cluster we are looking at?

source_top_weights = feature_weights.loc[source_cluster].nlargest(10)

print(f'Source title belongs to cluster {source_cluster}.',
      'Top weights for that cluster are:', source_top_weights, sep='\n')

In [0]:
# So, what can we recommend from this cluster?

df_joined.where(f'cluster == {source_cluster}') \
  .orderBy(*source_top_weights.index, ascending=False).display()

track_id,artist_name,track_name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,features,scaledFeatures,cluster
7xSZbk9JBRXkI6oHrRF0uf,Johann Sebastian Bach,"Aria mit 30 Veränderungen, BWV 988 ""Goldberg Variations"": Var. 6 Canone alla Seconda a 1 Clav.",26,0.341,0.2739999999999999,7,-23.929,1,0.0475,0.989,0.934,0.1159999999999999,0.977,117.104,50973,4,"List(1, 14, List(), List(26.0, 0.341, 0.27399999999999997, 7.0, -23.929000000000002, 1.0, 0.0475, 0.9890000000000001, 0.934, 0.11599999999999999, 0.977, 117.104, 50973.0, 4.0))","List(1, 14, List(), List(0.26, 0.3468972533062055, 0.27398533450375695, 0.6363636363636364, 0.4495675400661409, 1.0, 0.04994742376445847, 0.9929718875502009, 0.9358717434869739, 0.11599999999999999, 0.9878665318503537, 0.49620759497961847, 0.025940406426040227, 0.8))",2
0WZCZ7u8yQFNR60tyi3f5F,Johann Sebastian Bach,Partita n. 6 in Mi Minore: Tempo di gavotta,0,0.421,0.483,7,-15.425999999999998,1,0.0443,0.988,0.905,0.17,0.968,120.679,50233,3,"List(1, 14, List(), List(0.0, 0.42100000000000004, 0.483, 7.0, -15.425999999999998, 1.0, 0.0443, 0.988, 0.905, 0.17, 0.968, 120.679, 50233.0, 3.0))","List(1, 14, List(), List(0.0, 0.42828077314343854, 0.48298955638903907, 0.6363636363636364, 0.6298227762231833, 1.0, 0.046582544689800214, 0.9919678714859438, 0.906813627254509, 0.17, 0.9787664307381191, 0.511356028440919, 0.02544063031223175, 0.6000000000000001))",2
5RyJw8bU1s4jHGYvRnHcdA,Chris Schweizer,Scorpion (ASOT 942),38,0.6729999999999999,0.994,1,-8.529,0,0.0596,9.26e-05,0.659,0.141,0.964,137.858,48000,4,"List(1, 14, List(), List(38.0, 0.6729999999999999, 0.9940000000000001, 1.0, -8.529, 0.0, 0.0596, 9.259999999999999E-5, 0.659, 0.141, 0.9640000000000001, 137.858, 48000.0, 4.0))","List(1, 14, List(), List(0.38, 0.6846388606307222, 0.9939998787975518, 0.09090909090909091, 0.7760323920970067, 0.0, 0.06267087276550999, 9.297188755020079E-5, 0.6603206412825651, 0.141, 0.9747219413549039, 0.5841490182120187, 0.023932522120158336, 0.8))",2
49k3QQpaay4HrNegKre24I,Ludwig van Beethoven,"Beethoven: 6 Variations on an Original Theme in G Major, WoO 77: Variation II",2,0.3229999999999999,0.139,7,-21.1,1,0.0395,0.988,0.8009999999999999,0.128,0.962,75.154,51973,4,"List(1, 14, List(), List(2.0, 0.32299999999999995, 0.139, 7.0, -21.1, 1.0, 0.0395, 0.988, 0.8009999999999999, 0.128, 0.9620000000000001, 75.154, 51973.0, 4.0))","List(1, 14, List(), List(0.02, 0.3285859613428281, 0.1389826074486705, 0.6363636363636364, 0.5095395573645383, 1.0, 0.041535226077812834, 0.9919678714859438, 0.8026052104208415, 0.128, 0.9726996966632961, 0.3184518512868753, 0.02661577955280844, 0.8))",2
6DN9DnsD2Cpm3Js2GZjuGP,Ludwig van Beethoven,"Beethoven: 7 Variations on ""God Save the King"" in C Major, WoO 78: Variation II",1,0.327,0.166,0,-28.857,1,0.0422,0.989,0.773,0.139,0.961,102.219,50867,4,"List(1, 14, List(), List(1.0, 0.327, 0.166, 0.0, -28.857, 1.0, 0.0422, 0.9890000000000001, 0.773, 0.139, 0.961, 102.219, 50867.0, 4.0))","List(1, 14, List(), List(0.01, 0.33265513733468977, 0.1659831528596878, 0.0, 0.34509878741626393, 1.0, 0.04437434279705574, 0.9929718875502009, 0.7745490981963927, 0.139, 0.9716885743174922, 0.4331350265680217, 0.025868816874602796, 0.8))",2
0xmkgfnEnekpv7JBZfP65W,Ludwig van Beethoven,"Beethoven: 7 Variations on ""God Save the King"" in C Major, WoO 78: Variation I",1,0.3229999999999999,0.171,0,-30.54,1,0.0662,0.996,0.871,0.153,0.96,208.959,48107,4,"List(1, 14, List(), List(1.0, 0.32299999999999995, 0.171, 0.0, -30.54, 1.0, 0.0662, 0.996, 0.871, 0.153, 0.96, 208.959, 48107.0, 4.0))","List(1, 14, List(), List(0.01, 0.3285859613428281, 0.17098325386172802, 0.0, 0.3094208428728908, 1.0, 0.06961093585699264, 1.0, 0.8727454909819639, 0.153, 0.9706774519716883, 0.8854269951440267, 0.024004787044722534, 0.8))",2
6Mm9zn5aZAq2qdVl3a6ZP7,Ludwig van Beethoven,"Beethoven: Piano Sonata No. 12 in A-Flat Major, Op. 26: I. (c) Variation II -",4,0.273,0.0876,8,-32.648,1,0.0318,0.989,0.855,0.104,0.96,135.659,47293,4,"List(1, 14, List(), List(4.0, 0.273, 0.0876, 8.0, -32.648, 1.0, 0.0318, 0.9890000000000001, 0.855, 0.10400000000000001, 0.96, 135.659, 47293.0, 4.0))","List(1, 14, List(), List(0.04, 0.2777212614445575, 0.08758156914769678, 0.7272727272727273, 0.26473331637412023, 1.0, 0.03343848580441641, 0.9929718875502009, 0.8567134268537073, 0.10400000000000001, 0.9706774519716883, 0.5748311426368019, 0.02345503331953321, 0.8))",2
0mNSNnU3nbQzNqNrV7PjH1,Armin van Buuren,"A State Of Trance (ASOT 943) - Coming Up, Pt. 1",31,0.521,0.797,11,-7.962999999999999,0,0.267,0.0267,0.0,0.341,0.96,86.03399999999999,43560,3,"List(1, 14, List(), List(31.0, 0.521, 0.797, 11.0, -7.962999999999999, 0.0, 0.267, 0.0267, 0.0, 0.341, 0.96, 86.03399999999999, 43560.0, 3.0))","List(1, 14, List(), List(0.31, 0.5300101729399798, 0.7969958993171663, 1.0, 0.7880310353599593, 0.0, 0.28075709779179814, 0.026807228915662652, 0.0, 0.341, 0.9706774519716883, 0.36455393689777027, 0.020933865437307478, 0.6000000000000001))",2
43hoHY4j9wLbe2ZAtzD8pr,Ludwig van Beethoven,"Beethoven: 12 Variations on Haibel's ""Menuet à la Viganò"" in C Major, WoO 68: Variation I",1,0.492,0.204,0,-28.804,1,0.0438,0.995,0.912,0.176,0.953,128.489,46080,1,"List(1, 14, List(), List(1.0, 0.49200000000000005, 0.204, 0.0, -28.804000000000002, 1.0, 0.0438, 0.995, 0.912, 0.17600000000000002, 0.953, 128.489, 46080.0, 1.0))","List(1, 14, List(), List(0.01, 0.5005086469989828, 0.2039839204751936, 0.0, 0.3462223352836428, 1.0, 0.046056782334384865, 0.9989959839357431, 0.9138276553106212, 0.17600000000000002, 0.9635995955510614, 0.5444495292332986, 0.02263580571676337, 0.2))",2
1AgfVr7Cp2BQR5u9bkgOhI,Ludwig van Beethoven,"Beethoven: 12 Variations on a Russian Dance from Wranitzky's ""The Forest Maiden"" in A Major, WoO 71: Variation VI",1,0.4639999999999999,0.0615,9,-31.856,1,0.0452,0.993,0.909,0.159,0.932,147.725,47587,4,"List(1, 14, List(), List(1.0, 0.46399999999999997, 0.0615, 9.0, -31.855999999999998, 1.0, 0.0452, 0.993, 0.909, 0.159, 0.932, 147.725, 47587.0, 4.0))","List(1, 14, List(), List(0.01, 0.47202441505595116, 0.06148104191704673, 0.8181818181818182, 0.2815229373357077, 1.0, 0.047528916929547846, 0.996987951807229, 0.9108216432865731, 0.159, 0.9423660262891809, 0.6259586945652081, 0.023653593018803064, 0.8))",2


In [0]:
# According to our project requirement, dump the data back to our RDS instance.
# We chose to write to a separate table, creating it on the fly

!pip install pymysql sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.dialects.mysql import VARCHAR, TINYINT, MEDIUMINT

dtypes = dict(track_id=VARCHAR(25), artist_name=VARCHAR(60), track_name=VARCHAR(250))
dtypes.update({}.fromkeys([col for col, dtype in df_joined.dtypes if dtype == 'int'], TINYINT))
dtypes['duration_ms'] = MEDIUMINT

df_joined.drop('features', 'scaledFeatures', 'pcaFeatures').toPandas().set_index('track_id') \
  .to_sql(name='clustered', if_exists='replace', dtype=dtypes, method='multi',
    con=create_engine(f'mysql+pymysql://{rds_user}:{rds_pwd}@{rds_host}/{rds_db}'))