In [15]:
# Imports
import math
from typing import Optional, Tuple

import numpy as np
import pandas as pd
from scipy import stats
from scipy.sparse import csr_matrix
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler


# Category number -> display name offered to the listener.
# 0 means the whole catalogue; the remaining numbers are looked up by select_data.
CATEGORY_DICT = {
  0: "All",
  1: "Acoustic",
  2: "Chill",
  3: "Dance",
  4: "Happy",
  5: "Loud",
  6: "A Capella",
  7: "Alternative",
  8: "Blues",
  9: "Classical",
  10: "Country",
  11: "Dance",
  12: "Electronic",
  13: "Folk",
  14: "Hip-Hop",
  15: "Indie",
  16: "Jazz",
  17: "Movie",
  18: "Opera",
  19: "Pop",
  20: "R&B",
  21: "Rap",
  22: "Reggae",
  23: "Reggaeton",
  24: "Rock",
  25: "Ska",
  26: "Soul",
  27: "Soundtrack",
  28: "World",
}

# CSV locations: the full cleaned catalogue followed by one file per mood subset
path_all = "/content/SpotifyFeatures_Cleaned.csv"
path_acoustic = "/content/SpotifyFeatures_Cleaned_Acoustic.csv"
path_chill = "/content/SpotifyFeatures_Cleaned_Chill.csv"
path_dance = "/content/SpotifyFeatures_Cleaned_Dance.csv"
path_happy = "/content/SpotifyFeatures_Cleaned_Happy.csv"
path_loud = "/content/SpotifyFeatures_Cleaned_Loud.csv"

def get_data(csv_path:str)->pd.DataFrame:
  """
    Read the csv at csv_path into a pd.DataFrame.

    The bookkeeping columns "Unnamed: 0" and "cluster" are removed when the
    file contains them; every other column is returned as read.
  """
  frame = pd.read_csv(csv_path)
  # Only ask pandas to drop the bookkeeping columns that are actually present.
  unwanted = [name for name in ("Unnamed: 0", "cluster") if name in frame.columns]
  if unwanted:
    frame = frame.drop(columns=unwanted)
  return frame

def select_data(category:int)->pd.DataFrame:
  """
    Pick the data set that belongs to a category number from CATEGORY_DICT.

    Categories 1 to 5 return the matching pre-loaded subset (acoustic, chill,
    dance, happy, loud). Categories strictly between 5 and 29 return the rows
    of data_all whose "genre" equals the CATEGORY_DICT name for that number.
    Any other value returns data_all unchanged.
  """
  # Genre categories: filter the full data set by the genre name.
  if 5 < category < 29:
    return data_all[data_all["genre"] == CATEGORY_DICT[category]]

  # Subset categories: lambdas defer the global lookup so only the requested
  # data set has to exist when the function is called.
  subset_lookup = {
    1: lambda: data_acoustic,
    2: lambda: data_chill,
    3: lambda: data_dance,
    4: lambda: data_happy,
    5: lambda: data_loud,
  }
  fetch = subset_lookup.get(category)
  if fetch is None:
    return data_all
  return fetch()

def recommend_listener(input_song:pd.DataFrame, category:int, num_songs_to_rec:int,
                       n_components:int=5, gmm_fraction:float=0.70,
                       random_state:Optional[int]=42)->Tuple[pd.DataFrame, pd.DataFrame]:
  """
  Recommend songs similar to the listener's input song with two methods:
  nearest neighbours on the scaled audio features, and nearest neighbours on
  Gaussian-mixture membership probabilities.

  Inputs:
    input_song: a pd.DataFrame of the listener input song, the order of the columns should be
                'track_id', 'popularity', 'acousticness', 'danceability', 'liveness',
                'loudness', 'speechiness', 'tempo', 'valence', 'genre', 'artist_name',
                'track_name'. Only the first row is used to look up neighbours.
    category: an integer representing the category the listener chose, see what each number
              corresponds to in the CATEGORY_DICT
    num_songs_to_rec: an integer representing the number of songs the listener wants to be
                      recommended; values above 100 are capped at 100
    n_components: number of Gaussian mixture components (reduced automatically when the
                  GMM subset has fewer rows than this)
    gmm_fraction: leading fraction of the category data (0 < gmm_fraction <= 1) used to fit
                  the mixture and to draw the GMM recommendations from
    random_state: seed passed to GaussianMixture so repeated calls give the same GMM
                  recommendations; pass None for unseeded behaviour

  Returns:
    (df_rec, df_rec_gmm): the feature-space neighbours and the GMM-space neighbours.
    Both frames hold the metadata columns followed by the min-max scaled feature
    columns, so feature values are scaled, not raw. Fewer than the requested number
    of rows are returned when the category holds fewer songs.

  Raises:
    ValueError: if num_songs_to_rec < 1, gmm_fraction is outside (0, 1], or the
                category has too few songs to build the models.
    KeyError: if input_song is missing one of the numeric feature columns.
  """
  max_recs = 100

  if num_songs_to_rec < 1:
    raise ValueError("num_songs_to_rec must be at least 1.")
  if not 0 < gmm_fraction <= 1:
    raise ValueError("gmm_fraction must be in the interval (0, 1].")

  # Select the right data set
  data = select_data(category)
  if len(data) == 0:
    raise ValueError("No songs are available for category %r." % (category,))

  # Find k
  if num_songs_to_rec > max_recs:
    print("We can only recommend you 100 songs.")
    k = max_recs
  else:
    k = num_songs_to_rec

  # Min max scale data: every non-object column except popularity is a model feature
  num_data = data.select_dtypes(exclude=['object']).drop(['popularity'], axis=1)
  feature_cols = num_data.columns
  scaler = MinMaxScaler().fit(num_data)
  # Dropping and re-adding moves the scaled features after the metadata columns
  data = data.drop(feature_cols, axis=1)
  data[feature_cols] = scaler.transform(num_data)
  features = data[feature_cols]

  # Min max scale input song, selecting the features by name so the column
  # order always matches what the scaler was fit on
  input_features = pd.DataFrame(
    scaler.transform(input_song[feature_cols]),
    columns=feature_cols,
    index=input_song.index,
  )

  # GMM modeling on the leading part of the data
  n_sub = int(len(data) * gmm_fraction)
  if n_sub < 1:
    raise ValueError("Category %r has too few songs to fit the mixture model." % (category,))
  data_sub = data.iloc[:n_sub]
  sub_features = features.iloc[:n_sub]
  # A mixture cannot have more components than samples
  gmm = GaussianMixture(
    n_components=min(n_components, n_sub),
    random_state=random_state,
  ).fit(sub_features)

  # Nearest neighbours in the space of component membership probabilities;
  # n_neighbors is clamped because sklearn rejects k larger than the fitted sample count
  knn_gmm = NearestNeighbors(n_neighbors=min(k, n_sub))
  knn_gmm.fit(gmm.predict_proba(sub_features))
  gmm_neighbors = knn_gmm.kneighbors(gmm.predict_proba(input_features), return_distance=False)
  df_rec_gmm = data_sub.iloc[gmm_neighbors[0]]

  # KNN modeling directly on the scaled features of the whole category
  knn = NearestNeighbors(n_neighbors=min(k, len(data)))
  knn.fit(features)
  neighbors = knn.kneighbors(input_features, return_distance=False)

  # Return a dataframe of rec for each method
  df_rec = data.iloc[neighbors[0]]
  return df_rec, df_rec_gmm




# Main

In [13]:
# Read the full catalogue and the five subset files into DataFrames
(data_all, data_acoustic, data_chill,
 data_dance, data_happy, data_loud) = map(
  get_data,
  (path_all, path_acoustic, path_chill, path_dance, path_happy, path_loud),
)

In [3]:
# Use the last row of the full data set as a stand-in for the listener's song;
# the copy keeps later edits from touching data_all
input_song = data_all.iloc[-1:].copy()
input_song

Unnamed: 0,track_id,popularity,acousticness,danceability,liveness,loudness,speechiness,tempo,valence,genre,artist_name,track_name
142682,7zzZmpw8L66ZPjH1M6qmOs,67.0,0.217,0.664,0.118,-6.0,0.029,115.049,0.221,Blues,Moonstar88,Migraine


In [16]:
# Ask for 8 recommendations in category 8 ("Blues" in CATEGORY_DICT);
# a holds the feature-space neighbours, b the GMM-based neighbours
category = 8
num_songs_to_rec = 8
a, b = recommend_listener(
  input_song=input_song,
  category=category,
  num_songs_to_rec=num_songs_to_rec,
)
a

Unnamed: 0,track_id,popularity,genre,artist_name,track_name,acousticness,danceability,liveness,loudness,speechiness,tempo,valence
142682,7zzZmpw8L66ZPjH1M6qmOs,67.0,Blues,Moonstar88,Migraine,0.220303,0.662378,0.102351,0.725347,0.006593,0.420858,0.197855
140498,7sUW7X7blDODuD4kX0RRLu,51.0,Blues,NEEDTOBREATHE,Bridges Burn,0.194923,0.65138,0.116913,0.756513,0.006928,0.333123,0.182842
61006,3KZRILv6sbLbpHiCgaMwAp,50.0,Blues,Alina Baraz,Electric (Electric Mantis Remix),0.128933,0.671176,0.121073,0.677682,0.038887,0.465186,0.208579
27997,1XO1QUeaqLjEtoA2zcHz9m,50.0,Blues,Ruelle,Bad Dream,0.106598,0.618388,0.092989,0.702962,0.012068,0.45166,0.219303
69182,3llC7WjMtxqDwaM1Gjzvrs,51.0,Blues,The Japanese House,Face Like Thunder,0.22741,0.621687,0.16684,0.693458,0.018885,0.451503,0.295442
94729,5ADZl5Ty5O9EGy8ABSqXCg,52.0,Blues,Papa Roach,Periscope (feat. Skylar Grey),0.287309,0.584296,0.195964,0.696739,0.007822,0.464116,0.228954
1885,06KTcf9q6ef0rEYPgjOKl3,55.428571,Blues,Foreign Air,In the Shadows,0.158374,0.595293,0.09403,0.628618,0.013298,0.427117,0.117426
96227,5FKaV2J0xiwRENDsl0bMn4,53.333333,Blues,Danny Brown,25 Bucks (feat. Purity Ring),0.324872,0.727263,0.129395,0.671797,0.06727,0.426942,0.250402


In [8]:
# Show the second frame returned by recommend_listener (the GMM-based recommendations)
b

Unnamed: 0,track_id,popularity,genre,artist_name,track_name,acousticness,danceability,liveness,loudness,speechiness,tempo,valence
142682,7zzZmpw8L66ZPjH1M6qmOs,67.0,Blues,Moonstar88,Migraine,0.217871,0.652726,0.109388,0.859646,0.007213,0.404213,0.221
26172,1RCtHLyq1xIbgGMrYRrKJ2,59.0,Dance,The Church,Under the Milky Way,0.055019,0.560262,0.116456,0.72242,0.004985,0.504165,0.519
51137,2nKDawaFrMjOH9Y2Vl136C,25.0,Reggae,Ballyhoo!,The Quest,0.198794,0.802172,0.085759,0.841531,0.030229,0.342953,0.697
24041,1K0zyNBy7EnLi1EIvTQH3n,30.0,Ska,Descendents,Ace,0.057328,0.254919,0.154827,0.741886,0.024501,0.63999,0.468
119394,6W2VbtvMrDXm5vYeB7amkO,70.0,Pop,Kenny Loggins,Footloose,0.082228,0.525857,0.067785,0.837645,0.037866,0.686068,0.494
7404,0PCMtuN7aHuNpHwzUTZEre,48.0,Country,Hunter Brothers,Want,0.20281,0.623696,0.092222,0.880019,0.007743,0.341774,0.318
39264,29dfnKJ6WvYcJpgQ0MKzX7,62.0,Dance,Mike Posner,Song About You,0.180722,0.665627,0.09101,0.915011,0.020153,0.270427,0.641
13369,0jTl2gEoJHI86UHjfd29wU,44.0,Folk,Johnnyswim,Let It Matter,0.108433,0.532308,0.125544,0.83696,0.03235,0.212728,0.285
