%pip install numpy --upgrade
%pip install pandas --upgrade
%pip install seaborn --upgrade
%pip install plotly --upgrade
%pip install matplotlib --upgrade
%pip install scikit-learn --upgrade
%pip install scipy --upgrade
%pip install collections --upgrade
%pip install spotipy --upgrade
%pip install pickle --upgrade

In [1]:
import os
import numpy as np
import pandas as pd
import spotipy
import pyodbc
import pickle

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cdist
from collections import defaultdict
from spotipy.oauth2 import SpotifyClientCredentials

import warnings
warnings.filterwarnings("ignore")

In [2]:
with open('data/spotipyclientid.txt', 'r') as f:
    SPOTIPY_CLIENT_ID = f.read()
with open('data/spotipyclientsecret.txt', 'r') as f:
    SPOTIPY_CLIENT_SECRET = f.read()
    
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID, client_secret=SPOTIPY_CLIENT_SECRET))

In [3]:
with open('data/db_connection.txt', 'r') as f:
    db_conn = f.read()

with pyodbc.connect(db_conn) as conn:
    with conn.cursor() as cursor:
        cursor.execute("SELECT TOP (1000) [PersonID], [FirstName], [LastName], [Username], [Email], [Address], [Age], [Password] FROM [dbo].[Accounts]")
        row = cursor.fetchone()
        while row:
            print (str(row[0]) + " " + str(row[1]))
            row = cursor.fetchone()

10000001 Alan
10000002 Arturo
10000003 Alfredo
10000004 Elijah
10000005 Michael
10000006 Nicole
10000007 Tristan
10000013 None


In [4]:
data = pd.read_csv('data\data_features.csv')

In [5]:
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=10, 
                                   verbose=False))
                                 ], verbose=False)

In [6]:
X = data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels

pickle.dump(song_cluster_pipeline, open('data/data.sav', 'wb'))

song_cluster_pipeline = pickle.load(open('data/data.sav', 'rb'))

In [7]:
def find_song(name, artists, year):
    song_data = defaultdict()
    results = sp.search(q='track: {} artists: {} year: {}'.format(name,artists,year), limit=1, type='track')
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]
    song_data['track_number'] = [results['track_number']]
    song_data['disc_number'] = [results['disc_number']]


    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)

number_cols = ['track_number', 'disc_number', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
               'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'year']


def get_song_data(song, spotify_data):
    
    try:
        song_data = spotify_data[(spotify_data['name'] == song['name']) & (spotify_data['artists'] == song['artists']) & (spotify_data['year'] == song['year'])].iloc[1]
        return song_data
    
    except IndexError:
        return find_song(song['name'], song['artists'], song['year'])
        

def get_mean_vector(song_list, spotify_data):
    
    song_vectors = []
    
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        song_vector = song_data[number_cols].values
        song_vectors.append(song_vector)  
    
    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)


def flatten_dict_list(dict_list):
    
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict


def recommend_songs( song_list, spotify_data, n_songs=10):
    
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)
    
    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])

    rec_songs = spotify_data.iloc[index]
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
    return rec_songs[metadata_cols].to_dict(orient='records')

In [8]:
[recommend_songs([{'name': 'How to Save A Life', 'artists': 'The Fray', 'year':2005},
                  {'name': 'If I Die Young', 'artists': 'The Band Perry', 'year':2010},
                  {'name': 'Somebody That I Used To Know', 'artists': "'Gotye', 'Kimbra'", 'year':2011}], data)]

[[{'name': 'No Words', 'year': 2007, 'artists': "['Helen Horal']"},
  {'name': 'Monkey',
   'year': 2007,
   'artists': "['Two Loons For Tea', 'Tom Biller', 'Paul Bushnell', 'Matt Chamberlain', 'Mike Dillon', 'Brad Houser', 'Brian MacLeod', 'Pat Mastelotto', 'Don McGreevy', 'Eric Rosse', 'Patrick Warren', 'Sarah Scott', 'Jonathan Kochmer', 'Mell Dettmer']"},
  {'name': 'For All That', 'year': 2006, 'artists': "['The Bordercollies']"},
  {'name': 'In the Beat of A Heart',
   'year': 2010,
   'artists': "['Stu Weaver']"},
  {'name': 'Pleasure and Pain and Pride and Me',
   'year': 2008,
   'artists': "['These United States']"},
  {'name': 'Losing You', 'year': 2009, 'artists': "['Francesca Lee']"},
  {'name': 'What Can I Do', 'year': 2005, 'artists': "['Nan Quan Mama']"},
  {'name': 'Gardens of England', 'year': 2010, 'artists': '["The BDI\'s"]'},
  {'name': 'Shadow Lie Light',
   'year': 2010,
   'artists': "['The Brian James Gang']"},
  {'name': 'My Patch', 'year': 2006, 'artists': "['

In [9]:
[recommend_songs([{'name': 'It Must Be A Pain', 'artists': "['sewerperson']", 'year':2022}],  data)]

[[{'name': 'Miserable Woman',
   'year': 2020,
   'artists': "['The Tony O Blues Band']"},
  {'name': 'Chanda Mama', 'year': 2018, 'artists': "['TULA']"},
  {'name': 'The World is in a Tangle',
   'year': 2019,
   'artists': "['Jontavious Willis']"},
  {'name': 'Raised on Texas Music',
   'year': 2017,
   'artists': "['Hot Pickin 57s']"},
  {'name': 'Blue Monday', 'year': 2019, 'artists': "['George Benson']"},
  {'name': 'Stegosaurus', 'year': 2016, 'artists': "['New Move']"},
  {'name': 'Werewolves of London',
   'year': 2017,
   'artists': "['Warren Zevon']"},
  {'name': 'Same Old Road', 'year': 2014, 'artists': "['Under Two Tables']"},
  {'name': 'Ayom Manifesto', 'year': 2020, 'artists': "['Ayom']"},
  {'name': 'Twenty-Seven Dollars',
   'year': 2018,
   'artists': "['Jenny Van West']"}]]