# Spotify Music Recommender System

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
# import json

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

import spotipy
from spotipy.oauth2 import SpotifyOAuth, SpotifyClientCredentials

import pinecone
from tqdm._tqdm_notebook import tqdm_notebook

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


## 2. Read Data

Let's get the merged and cleaned data from the previous notebook in order to do some feature engineering and preprocessing in this notebook.

In [38]:
all_song_ft = pd.read_pickle("./data/all_song_ft.pkl")
final_song_ft = pd.read_pickle("./data/final_song_ft.pkl")

In [3]:
display(final_song_ft.shape)
final_song_ft.head()

(450000, 2408)

Unnamed: 0,track_id,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,zikir,zillertal,zim,zither,zolo,zouglou,zouk,zuliana,zurich,zydeco
0,1qFqi7NahjuFxGV3H2FNmb,0.16,0.468214,0.337,0.454545,0.740103,1.0,0.052317,0.696787,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4iUxT3nhVmAsUj7Jm5ZeJy,0.19,0.162462,0.267,0.818182,0.663011,1.0,0.03862,0.928715,0.979,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,18Gd6vSbxKS4mrEvjz14oS,0.15,0.383451,0.297,0.181818,0.667875,0.0,0.033368,0.39257,1.2e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,32QnXosZq7A11knnBAEqk7,0.59,0.717457,0.315,0.0,0.746283,1.0,0.035015,0.74498,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0JBjiB2QZSYPVUEnUmSroj,0.38,0.507568,0.928,0.0,0.835934,1.0,0.03275,0.002912,1e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 5. Create User Music Taste Vector 

### 5.1 Get user data via Spotify API

In [44]:
# Set up client id 
cid = 'b48ee249a013440f95bf48fa3fa5757b'
secret = '041860d39b5c4e51ba41e4845bbef3c9'
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [5]:
user_playlists = sp.user_playlists('1w36quu8tj0kp810d9s7gngke')

In [46]:
# Each 'item' in 'items' is a track, each track is a dictionary
song_lst = sp.playlist_tracks('https://open.spotify.com/playlist/3URf6gpqnl83P7A28iHfTf?si=e78625adf8fa42a9')['items']

In [59]:
song_lst[0]['track']['id']

# [song for song in song_lst]

'3m7IYU7ySuFMwdm7OCShQN'

In [74]:
# Define a function that takes in a list of dict where each dict is a song that a user listened to
# and returns a summarized vector for the songs that exist in the original Spotify dataset
# song_lst = sp.playlist_tracks(<playlist_url>)['items']
def song_vec(song_lst):
    # Store all track_ids in a list 
    lst = [] 
    for song in song_lst:
        track_id = song['track']['id']
        lst.append(track_id)
        
    # Create a df for all songs in the playlist that exist in the Spotify dataset     
    song_df = all_song_ft[all_song_ft['track_id'].isin(lst)] ## should I change this to all_song_ft instead?
    
    # Summarize the playlist into a single vector by taking the mean of all songs in the playlist 
    playlist_vec = song_df.iloc[:, 1:].mean().tolist()
    playlist_songids = lst
    
    return playlist_vec, playlist_songids

In [76]:
playlist_vec = song_vec(song_lst)[0]
playlist_songids = song_vec(song_lst)[1]

In [78]:
playlist_songids

['3m7IYU7ySuFMwdm7OCShQN',
 '6HMtHNpW6YPi1hrw9tgF8P',
 '7mFj0LlWtEJaEigguaWqYh',
 '0hquQWY3xvYqN4qtiquniF',
 '1qEmFfgcLObUfQm0j1W2CK',
 '4Dvkj6JhhA12EX05fT7y2e',
 '5AAaOy11GI6kZV7iBSWfOa',
 '5xwBIieMMFUmLDgvG4DjFe',
 '0Yn2yzQyASscYbjOl8Kflh',
 '1BJIJ69DZNip7Erq6u69mu',
 '71yN0yrHej3jhKXewbmtEh',
 '76Tuo484SLohJakHLnGI3B',
 '63xdwScd1Ai1GigAwQxE8y',
 '2eF8pWbiivYsYRpbntYsnc',
 '1QvWxgZvTU0w8rlPRE5Zrv',
 '7tqhbajSfrz2F7E1Z75ASX',
 '5FLRDjcoOwVb6UocUoGhvj',
 '7GuP2hQO9oprADjY3gRhCT',
 '1GIH7Wcdc4PEkiFJTTOtyn',
 '0loJyuSFr6vVPBQSmLLrrQ',
 '7oOOI85fVQvVnK5ynNMdW7',
 '5lA3pwMkBdd24StM90QrNR',
 '2KukL7UlQ8TdvpaA7bY3ZJ',
 '62GXGpd73vslqIBHq8XqOx',
 '6VLEVV3KmHCiGHrY9qlXqB',
 '1m5demYkTI5BB9sTx5v13h',
 '1RI4YQVFh7onQD07QuL8ND',
 '7oqftogUN82Q7VNy2TmTJW',
 '6tgEc2O1uFHcZDKPoo6PC8',
 '6jG2YzhxptolDzLHTGLt7S',
 '6wwrYruEgWlowPDZMq5116',
 '5hgnY0mVcVetszbb85qeDg',
 '63wKtmUg754umO8SglEL79',
 '63FrXif0Pdu4NAPvTh87mw',
 '78eouBKVRyhbSzJwChr6QM',
 '2RQAG0wQt35UzAPEyVJFWN',
 '0lnIJmgcUpEpe4AZACjayW',
 

In [69]:
# sp.track('3m7IYU7ySuFMwdm7OCShQN')

## 7. Set up Pinecone Database

In this section, we set up Pinecone...
- Explain what Pinecone, how it works
- Include quickstart guide

### Prepare `vector` in `final_song_ft` for upload to Pinecone

In [9]:
# final_song_ft['vector'] = final_song_ft.drop(columns=['track_id']).values.tolist()

In [10]:
# final_song_ft.to_pickle('./data/final_song_ft_vectorized.pkl')

### Initialize pinecone

In [11]:
# Initialize pinecone, make sure API key works
pinecone.init(api_key='3b9bd9d9-88f0-4911-a7e9-d589e7447fa1', 
             environment="us-west1-gcp")

In [12]:
# Create new index
# pinecone.create_index('spotifyv1', dimension=2407, metric='cosine')

In [13]:
pinecone.list_indexes()

['spotifyv1']

In [14]:
index = pinecone.Index("spotifyv1")

In [22]:
# # Test upsert, 45 songs

# test_df = final_song_ft.iloc[:100, :]
# ids = test_df['track_id']
# vecs = test_df.drop(columns=['track_id']).values.tolist()
# metadata = [{"track_id" : track_id} for track_id in test_df['track_id']]
# # test_data = zip(ids, vecs, metadata)
# # vecs = [[random.random() for i in range(2407)] for i in range(5)]
# index.upsert(vectors=zip(ids, vecs, metadata))

# # vecs

{'upserted_count': 100}

In [23]:
batch_size = 100

for start_idx in tqdm_notebook(range(0, len(final_song_ft), batch_size)):
    ids = final_song_ft.iloc[start_idx:start_idx+batch_size-1]["track_id"].tolist()
    # vecs = final_song_ft.iloc[start_idx:start_idx+batch_size-1]["vector"].tolist()
    vecs = final_song_ft.iloc[start_idx:start_idx+batch_size-1].drop(columns=['track_id']).values.tolist()
    metadata = [{"track_id" : track_id} for track_id in final_song_ft.iloc[start_idx:start_idx+batch_size-1]["track_id"]]
                    
    # upload
    index.upsert(vectors=zip(ids, vecs, metadata))


  0%|          | 0/4500 [00:00<?, ?it/s]

In [24]:
index.describe_index_stats()

{'dimension': 2407,
 'index_fullness': 0.5,
 'namespaces': {'': {'vector_count': 445508}},
 'totalVectorCount': 445508.0}

## 6. Get User Recommendations

In [82]:
index.query(
    vector=playlist_vec, #vector to find similar vectors for 
    top_k=15, #number of results to return 
    filter={ 
        "track_id" : {"$nin" : playlist_songids} #filter for songs that are NOT in the playlist already
    },
    include_metadata = True
)

{'matches': [{'id': '1QV6tiMFM6fSOKOGLMHYYg',
              'metadata': {'track_id': '1QV6tiMFM6fSOKOGLMHYYg'},
              'score': 0.940154672,
              'values': []},
             {'id': '1pXrR5Y9OgcIV2JEAl2lCB',
              'metadata': {'track_id': '1pXrR5Y9OgcIV2JEAl2lCB'},
              'score': 0.939974368,
              'values': []},
             {'id': '0cqRj7pUJDkTCEsJkx8snD',
              'metadata': {'track_id': '0cqRj7pUJDkTCEsJkx8snD'},
              'score': 0.939339578,
              'values': []},
             {'id': '1GEBsLDvJGw7kviySRI6GX',
              'metadata': {'track_id': '1GEBsLDvJGw7kviySRI6GX'},
              'score': 0.937787592,
              'values': []},
             {'id': '1u8c2t2Cy7UBoG4ArRcF5g',
              'metadata': {'track_id': '1u8c2t2Cy7UBoG4ArRcF5g'},
              'score': 0.936589837,
              'values': []},
             {'id': '10nqz67NQWWa7XPq7ycihi',
              'metadata': {'track_id': '10nqz67NQWWa7XPq7ycihi'},
   

In [88]:
sp.tracks(['2IprIjGNRlj3TfqUWCAo0C'])

{'tracks': [{'album': {'album_type': 'single',
    'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/06HL4z0CvFAxyc27GXpf02'},
      'href': 'https://api.spotify.com/v1/artists/06HL4z0CvFAxyc27GXpf02',
      'id': '06HL4z0CvFAxyc27GXpf02',
      'name': 'Taylor Swift',
      'type': 'artist',
      'uri': 'spotify:artist:06HL4z0CvFAxyc27GXpf02'}],
    'available_markets': ['AD',
     'AE',
     'AG',
     'AL',
     'AM',
     'AO',
     'AR',
     'AT',
     'AU',
     'AZ',
     'BA',
     'BB',
     'BD',
     'BE',
     'BF',
     'BG',
     'BH',
     'BI',
     'BJ',
     'BN',
     'BO',
     'BR',
     'BS',
     'BT',
     'BW',
     'BY',
     'BZ',
     'CA',
     'CD',
     'CG',
     'CH',
     'CI',
     'CL',
     'CM',
     'CO',
     'CR',
     'CV',
     'CW',
     'CY',
     'CZ',
     'DE',
     'DJ',
     'DK',
     'DM',
     'DO',
     'DZ',
     'EC',
     'EE',
     'EG',
     'ES',
     'FI',
     'FJ',
     'FM',
     'FR',
     'GA',