# Spotify Music Recommender System Part 2

## 1. Import Libraries

In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pinecone
import seaborn as sns
import spotipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from spotipy.oauth2 import SpotifyOAuth, SpotifyClientCredentials
from tqdm._tqdm_notebook import tqdm_notebook

from key import cid, secret

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


## 2. Read Data

Let's get the merged and cleaned data from the previous notebook.

In [2]:
# Load the two feature tables that the previous notebook saved as pickle files.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files you created yourself.
all_song_ft = pd.read_pickle("../data/all_song_ft.pkl")
final_song_ft = pd.read_pickle("../data/final_song_ft.pkl")

In [3]:
# Show the table's (rows, columns) shape, then preview its first rows as the cell output.
display(final_song_ft.shape)
final_song_ft.head()

(450000, 2408)

Unnamed: 0,track_id,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,zikir,zillertal,zim,zither,zolo,zouglou,zouk,zuliana,zurich,zydeco
0,1qFqi7NahjuFxGV3H2FNmb,0.16,0.468214,0.337,0.454545,0.740103,1.0,0.052317,0.696787,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4iUxT3nhVmAsUj7Jm5ZeJy,0.19,0.162462,0.267,0.818182,0.663011,1.0,0.03862,0.928715,0.979,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,18Gd6vSbxKS4mrEvjz14oS,0.15,0.383451,0.297,0.181818,0.667875,0.0,0.033368,0.39257,1.2e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,32QnXosZq7A11knnBAEqk7,0.59,0.717457,0.315,0.0,0.746283,1.0,0.035015,0.74498,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0JBjiB2QZSYPVUEnUmSroj,0.38,0.507568,0.928,0.0,0.835934,1.0,0.03275,0.002912,1e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 5. Create User Music Taste Vector 

In this section, we create a user vector based on a given Spotify playlist in order to find similar songs to recommend. To build this user vector, we take a Spotify playlist URL, look up the songs in that playlist using the Spotify API, and create a vector from the songs in the playlist **that are also in our database**.

In [4]:
# Connect to Spotify API
# The client ID and secret come from the local `key` module imported at the top of the notebook,
# so the credentials themselves are not written into this cell.
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [5]:
# Each 'item' in 'items' is a playlist entry; the song itself is the dictionary under its 'track' key.
# The API hands back playlist tracks one page at a time, so keep following the 'next' link until
# every page has been collected -- otherwise a long playlist is silently cut off after the first page.
playlist_page = sp.playlist_tracks('https://open.spotify.com/playlist/3URf6gpqnl83P7A28iHfTf?si=e78625adf8fa42a9')
song_lst = list(playlist_page['items'])
while playlist_page['next']:
    playlist_page = sp.next(playlist_page)
    song_lst.extend(playlist_page['items'])

In [6]:
# Sanity check: show the Spotify track ID stored on the first playlist entry.
song_lst[0]['track']['id']

'3m7IYU7ySuFMwdm7OCShQN'

In [7]:
def song_vec(song_lst, song_ft=None):
    """Summarize a playlist as a single feature vector.

    Parameters
    ----------
    song_lst : list of dict
        Playlist entries, e.g. ``sp.playlist_tracks(<playlist_url>)['items']``.
        Each entry is expected to hold the song under its ``'track'`` key, and
        the song's Spotify ID under ``'id'``.
    song_ft : pandas.DataFrame, optional
        Feature table to look the songs up in. It must contain a ``track_id``
        column, and every column after the first one is averaged. When omitted,
        the notebook-level ``all_song_ft`` table is used.

    Returns
    -------
    playlist_vec : list of float
        Column-wise mean of the feature rows of the playlist songs that exist
        in the feature table.
    playlist_songids : list
        IDs of all playlist songs that carry an ID, whether or not they exist
        in the feature table.

    Raises
    ------
    ValueError
        If none of the playlist songs exist in the feature table, because the
        mean would then be a vector of NaN values that cannot be queried with.
    """
    if song_ft is None:
        song_ft = all_song_ft

    # Collect the track IDs. Entries whose 'track' is missing/None, or whose track has no ID,
    # are skipped (presumably removed or local tracks can come back this way -- TODO confirm).
    track_ids = []
    for song in song_lst:
        track = song.get('track') or {}
        track_id = track.get('id')
        if track_id is not None:
            track_ids.append(track_id)

    # Keep only the playlist songs that exist in the feature table.
    song_df = song_ft[song_ft['track_id'].isin(track_ids)]
    if song_df.empty:
        raise ValueError("None of the playlist's songs exist in the song feature table.")

    # Average every column after the first one.
    # NOTE(review): this assumes the first column is the non-feature `track_id` column -- confirm
    # against the table that is passed in.
    playlist_vec = song_df.iloc[:, 1:].mean().tolist()

    return playlist_vec, track_ids

In [8]:
# song_vec returns (vector, track IDs) as a pair, so unpack both from a single call
# instead of running the lookup and averaging twice.
playlist_vec, playlist_songids = song_vec(song_lst)

## 7. Set up Pinecone Database

In this section, we set up the Pinecone index and upload our data to Pinecone in batches. For more information, check out the Pinecone [quickstart guide](https://www.pinecone.io/docs/quickstart/).

The code that uploads the data to Pinecone has been commented out as the data only needs to be uploaded once.

### Initialize pinecone

In [9]:
# Initialize pinecone, make sure API key works
# The key is read from the PINECONE_API_KEY environment variable so the secret never lives in the
# notebook. Any key that was previously written into this cell should be treated as leaked and rotated.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError("Set the PINECONE_API_KEY environment variable before running this cell.")
pinecone.init(api_key=pinecone_api_key,
              environment="us-west1-gcp")
index = pinecone.Index("spotifyv1")

In [10]:
# # Create new index
# pinecone.create_index('spotifyv1', dimension=2407, metric='cosine')

In [11]:
# # Upload the data
# # (Kept commented out on purpose: the data only needs to be uploaded once.)
# batch_size = 100

# for start_idx in tqdm_notebook(range(0, len(final_song_ft), batch_size)):
#     # iloc slices exclude the end position, so start_idx:start_idx+batch_size already covers
#     # exactly batch_size rows; subtracting 1 from the end would skip the last row of every batch.
#     batch = final_song_ft.iloc[start_idx:start_idx+batch_size]
#     ids = batch["track_id"].tolist()
#     vecs = batch.drop(columns=['track_id']).values.tolist()
#     metadata = [{"track_id" : track_id} for track_id in ids]

#     # upload
#     index.upsert(vectors=zip(ids, vecs, metadata))

In [12]:
# Ask the index for its summary statistics (shown as the cell output) to confirm the connection works.
index.describe_index_stats()

{'dimension': 2407,
 'index_fullness': 0.5,
 'namespaces': {'': {'vector_count': 445508}},
 'totalVectorCount': 445508.0}

## 6. Get User Recommendations
In this section, we query the Pinecone database to get the top 15 most similar songs based on the playlist vector. After this, we've got all the building blocks for our web app!

In [13]:
# Look up the songs closest to the playlist vector; the result is displayed as the cell output.
index.query(
    top_k=15,  # how many matches to return
    include_metadata=True,  # return each match's stored metadata as well
    vector=playlist_vec,  # the vector to find neighbours for
    # only consider songs whose track_id is NOT already in the playlist
    filter={"track_id": {"$nin": playlist_songids}},
)

{'matches': [{'id': '1QV6tiMFM6fSOKOGLMHYYg',
              'metadata': {'track_id': '1QV6tiMFM6fSOKOGLMHYYg'},
              'score': 0.940154672,
              'values': []},
             {'id': '1pXrR5Y9OgcIV2JEAl2lCB',
              'metadata': {'track_id': '1pXrR5Y9OgcIV2JEAl2lCB'},
              'score': 0.939974368,
              'values': []},
             {'id': '0cqRj7pUJDkTCEsJkx8snD',
              'metadata': {'track_id': '0cqRj7pUJDkTCEsJkx8snD'},
              'score': 0.939339578,
              'values': []},
             {'id': '1GEBsLDvJGw7kviySRI6GX',
              'metadata': {'track_id': '1GEBsLDvJGw7kviySRI6GX'},
              'score': 0.937787592,
              'values': []},
             {'id': '1u8c2t2Cy7UBoG4ArRcF5g',
              'metadata': {'track_id': '1u8c2t2Cy7UBoG4ArRcF5g'},
              'score': 0.936589837,
              'values': []},
             {'id': '10nqz67NQWWa7XPq7ycihi',
              'metadata': {'track_id': '10nqz67NQWWa7XPq7ycihi'},
   