# DS 4300 HW 5 Spotify

### Sampling and Preprocessing

In [1]:
import pandas as pd
import numpy as np
import itertools
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Load the raw track table and strip apostrophes from the track names
# (e.g. "I'm Yours" becomes "Im Yours").
data = pd.read_csv('spotify.csv')
data.track_name = data.track_name.str.replace("'","")

# Sample the data: keep the n most popular songs of each genre.
n = 20
sample = (
    data
    .groupby('track_genre', group_keys=False)
    .apply(lambda genre_rows: genre_rows.sort_values(by='popularity', ascending=False).head(n))
)

# Min-max scale every column between the song-description columns and the
# trailing genre column. The slice is positional, so it relies on the CSV's
# column order.
numeric_part = sample.iloc[:, 5:-1]
scaler = MinMaxScaler()
scaled = pd.DataFrame(scaler.fit_transform(numeric_part), columns=numeric_part.columns)

# Song-description columns (positions 1-4 plus the last column); reset_index()
# gives them the same 0..N-1 row labels as `scaled`, so the index-on-index
# merge below lines the two frames up row by row.
song_data = sample.iloc[:, [1,2,3,4,-1]].reset_index()
scaled_data = scaled[['valence', 'tempo', 'popularity']]
df = song_data.merge(scaled_data, left_index=True, right_index=True)

# Drop rows with nulls, then keep one row per (artists, track_name) pair.
df = df.dropna().drop_duplicates(subset=['artists', 'track_name'])

df.head(5)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,index,track_id,artists,album_name,track_name,track_genre,valence,tempo,popularity
0,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,acoustic,0.170582,0.572393,0.793103
1,7,1EzrEOXmMH3G43AXT1y7pA,Jason Mraz,We Sing. We Dance. We Steal Things.,Im Yours,acoustic,0.727273,0.720377,0.770115
3,60,08MFgEQeVLF37EyZ7jcwLc,Zack Tabudlo,Pano,Pano,acoustic,0.423902,0.834327,0.712644
4,10,4mzP5mHkRvGxdhdGdAH7EJ,Zack Tabudlo,Episode,Give Me Your Forever,acoustic,0.307457,0.476744,0.701149
5,6,6Vc5wAMmXdKIAM7WUoEb7N,A Great Big World;Christina Aguilera,Is There Anybody Out There?,Say Something,acoustic,0.078141,0.674203,0.701149


In [3]:
# Show every sampled song whose artists field is exactly 'The Strokes'.
df.loc[df['artists'] == 'The Strokes']

Unnamed: 0,index,track_id,artists,album_name,track_name,track_genre,valence,tempo,popularity
769,38015,5ruzrDWcT0vuJIOMW7gMnW,The Strokes,The New Abnormal,The Adults Are Talking,garage,0.662921,0.78718,0.758621
777,38086,57Xjny5yNzAcsxnusKmAfA,The Strokes,Room On Fire,Reptilia,garage,0.786517,0.754014,0.712644
779,38074,7kzKAuUzOITUauHAhoMoxA,The Strokes,Is This It,Last Nite,garage,0.783453,0.496485,0.701149


### Similarity Scores

In [4]:
# Assemble features: one-hot encode the genre, then drop the identifier /
# free-text columns by name so the selection does not depend on column
# positions. What remains is valence, tempo, popularity and the genre dummies.
id_columns = ['index', 'track_id', 'artists', 'album_name', 'track_name']
encoded_df = pd.get_dummies(df, columns=['track_genre'])
features = encoded_df.drop(columns=id_columns)

# Compute pairwise cosine similarity between all songs. With a single
# argument the matrix is compared against itself. Rows and columns are
# labelled with track_id, in the same order as `df`.
cosine_sim = cosine_similarity(features)
cosine_sim_df = pd.DataFrame(cosine_sim, index=df.track_id, columns=df.track_id)
cosine_sim_df.head(5)

track_id,5vjLSffimiIP26QG5WcN2K,1EzrEOXmMH3G43AXT1y7pA,08MFgEQeVLF37EyZ7jcwLc,4mzP5mHkRvGxdhdGdAH7EJ,6Vc5wAMmXdKIAM7WUoEb7N,0IktbUcnAGrvD03AWnz3Q8,5MYPzdIWgx3pMLRGlq2fVq,4E6cwWJWZw2zWf7VFbH7wf,5p9XWUdvbUzmPCukOmwoU3,5SuOikwiRyPMVoIQDJUgSV,...,0H4BGX7L8UfoD0g4wgXd0D,0jOSpB2zK2O0SNxj8ZT5lp,35WtFQK3iG2WPjstuuKaVc,6BxW8Umq3QIbipmo2c7WYz,40lKptao1hxVqA7fd1OOGp,0YxrPdW9NpNFtqh7Fw0rva,0qf2iUDfPDtrLeJFmOvVn6,72B1Omo2kmEst4YKO8vOB6,50lW1fKoDtyKaiR2bR7ksl,3MRqm3VNWLIjRiLRQezul7
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5vjLSffimiIP26QG5WcN2K,1.0,0.93762,0.972169,0.991455,0.993087,0.940001,0.968264,0.956787,0.934105,0.917756,...,0.465823,0.465138,0.453934,0.458667,0.43374,0.413454,0.459715,0.397236,0.47386,0.44477
1EzrEOXmMH3G43AXT1y7pA,0.93762,1.0,0.979727,0.9628,0.91713,0.998443,0.987811,0.995705,0.9909,0.984305,...,0.500504,0.465861,0.456087,0.477513,0.464688,0.461665,0.535378,0.408717,0.509158,0.452082
08MFgEQeVLF37EyZ7jcwLc,0.972169,0.979727,1.0,0.975316,0.971303,0.976122,0.977803,0.981881,0.959015,0.942563,...,0.515375,0.503249,0.488115,0.500938,0.467398,0.445318,0.525434,0.409622,0.534134,0.477902
4mzP5mHkRvGxdhdGdAH7EJ,0.991455,0.9628,0.975316,1.0,0.976345,0.968945,0.990629,0.981285,0.969203,0.958573,...,0.439302,0.425154,0.416743,0.426957,0.412818,0.403286,0.448992,0.37594,0.443357,0.411154
6Vc5wAMmXdKIAM7WUoEb7N,0.993087,0.91713,0.971303,0.976345,1.0,0.917902,0.94683,0.936859,0.904459,0.88288,...,0.467243,0.476727,0.461768,0.463446,0.425,0.393863,0.451228,0.383171,0.483875,0.449056


In [5]:
def unique_combinations(elements):
    """Return all unordered pairs of items from ``elements``.

    Each pair is a 2-tuple and appears once, with its members in the order
    they occur in ``elements``. Fewer than two items yields an empty list.
    """
    pairs = itertools.combinations(elements, 2)
    return [pair for pair in pairs]

# One row per unordered pair of sampled tracks.
track_ids = df.track_id.values
track_pairs = unique_combinations(track_ids)
sim_df = pd.DataFrame(track_pairs, columns=['track1', 'track2'])

# Look up all pair scores in one vectorised step instead of one chained
# pandas lookup per pair. np.triu_indices(k=1) walks the strict upper triangle
# row by row, which is the same order itertools.combinations produces.
first_pos, second_pos = np.triu_indices(len(track_ids), k=1)
# Select by label so the matrix is guaranteed to follow `track_ids` order.
sim_matrix = cosine_sim_df.loc[track_ids, track_ids].to_numpy()
# Same element the per-pair lookup cosine_sim_df[track1][track2] reads:
# column = track1, row = track2.
sim_df['simscore'] = sim_matrix[second_pos, first_pos]
sim_df.head(5)

Unnamed: 0,track1,track2,simscore
0,5vjLSffimiIP26QG5WcN2K,1EzrEOXmMH3G43AXT1y7pA,0.93762
1,5vjLSffimiIP26QG5WcN2K,08MFgEQeVLF37EyZ7jcwLc,0.972169
2,5vjLSffimiIP26QG5WcN2K,4mzP5mHkRvGxdhdGdAH7EJ,0.991455
3,5vjLSffimiIP26QG5WcN2K,6Vc5wAMmXdKIAM7WUoEb7N,0.993087
4,5vjLSffimiIP26QG5WcN2K,0IktbUcnAGrvD03AWnz3Q8,0.940001


In [6]:
def attach_song_details(pairs, songs, side):
    """Join song metadata onto one side of a track-pair table.

    Parameters
    ----------
    pairs : DataFrame
        Must contain a ``'track' + side`` column holding track ids.
    songs : DataFrame
        Must contain ``track_id``, ``artists``, ``album_name``,
        ``track_name`` and ``track_genre`` columns.
    side : str
        ``'1'`` or ``'2'``: which member of the pair to describe.

    Returns
    -------
    DataFrame
        ``pairs`` merged with ``songs`` on the track id (pandas' default
        inner join). The helper ``track_id`` key is dropped and the metadata
        columns are renamed to ``artists<side>``, ``album<side>``,
        ``song<side>`` and ``genre<side>``.
    """
    renamed = {'track_name': 'song' + side,
               'track_genre': 'genre' + side,
               'album_name': 'album' + side,
               'artists': 'artists' + side}
    merged = pd.merge(pairs, songs, left_on='track' + side, right_on='track_id')
    return merged.drop('track_id', axis=1).rename(columns=renamed)


# Song metadata keyed by track_id, selected by name rather than by position.
song_df = df[['track_id', 'artists', 'album_name', 'track_name', 'track_genre']]

# Start from the bare pair columns so re-running this cell does not merge a
# second set of metadata columns onto an already-enriched sim_df.
sim_df = sim_df[['track1', 'track2', 'simscore']]
for side in ('1', '2'):
    sim_df = attach_song_details(sim_df, song_df, side)
sim_df.head(5)

Unnamed: 0,track1,track2,simscore,artists1,album1,song1,genre1,artists2,album2,song2,genre2
0,5vjLSffimiIP26QG5WcN2K,1EzrEOXmMH3G43AXT1y7pA,0.93762,Chord Overstreet,Hold On,Hold On,acoustic,Jason Mraz,We Sing. We Dance. We Steal Things.,Im Yours,acoustic
1,5vjLSffimiIP26QG5WcN2K,08MFgEQeVLF37EyZ7jcwLc,0.972169,Chord Overstreet,Hold On,Hold On,acoustic,Zack Tabudlo,Pano,Pano,acoustic
2,1EzrEOXmMH3G43AXT1y7pA,08MFgEQeVLF37EyZ7jcwLc,0.979727,Jason Mraz,We Sing. We Dance. We Steal Things.,Im Yours,acoustic,Zack Tabudlo,Pano,Pano,acoustic
3,5vjLSffimiIP26QG5WcN2K,4mzP5mHkRvGxdhdGdAH7EJ,0.991455,Chord Overstreet,Hold On,Hold On,acoustic,Zack Tabudlo,Episode,Give Me Your Forever,acoustic
4,1EzrEOXmMH3G43AXT1y7pA,4mzP5mHkRvGxdhdGdAH7EJ,0.9628,Jason Mraz,We Sing. We Dance. We Steal Things.,Im Yours,acoustic,Zack Tabudlo,Episode,Give Me Your Forever,acoustic


In [7]:
# Write the preprocessed song table to disk as CSV. The DataFrame's row
# labels are written as the first, unnamed column.
output_path = 'spotify_test.csv'
df.to_csv(output_path)