In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler


from sklearn.metrics import silhouette_samples, silhouette_score
from pathlib import Path  



In [2]:
# read in file
file1 = 'resources/tracks_df.csv'
tracks_df = pd.read_csv(file1)

In [3]:
tracks_df.head(5)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,2G0GextMwZJLkNxcSZ7ZJ3,(What A) Wonderful World - Mono,67,128787,0,['Sam Cooke'],1960-02-01,0.686,0.672,11,-5.523,1,0.0323,0.7,0.0,0.135,0.857,128.55
1,3oAWTk92mZBxKBOKf8mR5v,Summertime Blues,64,119360,0,['Eddie Cochran'],1960-05-01,0.714,0.886,11,-8.629,0,0.0554,0.116,0.184,0.18,0.954,156.351
2,2x6pbpjVGjiWCcH89IK8AX,Breaking Up Is Hard to Do,63,139200,0,['Neil Sedaka'],1960-12-30,0.743,0.799,8,-5.466,0,0.0375,0.699,0.0,0.0635,0.965,116.112
3,47mA6f44zxLtdATOoY7GjN,Georgia on My Mind - Original Master Recording,61,217415,0,['Ray Charles'],1960-09-01,0.138,0.399,7,-8.756,1,0.0311,0.782,4e-06,0.188,0.296,179.93
4,0DICNd5XQ1og9UeYzxoNFV,Baby (You've Got What It Takes),60,165760,0,"['Dinah Washington', 'Brook Benton']",1960-07-05,0.67,0.596,3,-9.347,1,0.0627,0.852,0.00203,0.653,0.813,133.396


In [4]:
tracks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509268 entries, 0 to 509267
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                509268 non-null  object 
 1   name              509268 non-null  object 
 2   popularity        509268 non-null  int64  
 3   duration_ms       509268 non-null  int64  
 4   explicit          509268 non-null  int64  
 5   artists           509268 non-null  object 
 6   release_date      509268 non-null  object 
 7   danceability      509268 non-null  float64
 8   energy            509268 non-null  float64
 9   key               509268 non-null  int64  
 10  loudness          509268 non-null  float64
 11  mode              509268 non-null  int64  
 12  speechiness       509268 non-null  float64
 13  acousticness      509268 non-null  float64
 14  instrumentalness  509268 non-null  float64
 15  liveness          509268 non-null  float64
 16  valence           50

In [5]:
tracks_df.shape[0]

509268

In [6]:
tracks_df=tracks_df.drop_duplicates(subset=['name'])

In [7]:
tracks_df.shape[0]

391661

In [8]:
tracks_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,2G0GextMwZJLkNxcSZ7ZJ3,(What A) Wonderful World - Mono,67,128787,0,['Sam Cooke'],1960-02-01,0.686,0.672,11,-5.523,1,0.0323,0.7,0.0,0.135,0.857,128.55
1,3oAWTk92mZBxKBOKf8mR5v,Summertime Blues,64,119360,0,['Eddie Cochran'],1960-05-01,0.714,0.886,11,-8.629,0,0.0554,0.116,0.184,0.18,0.954,156.351
2,2x6pbpjVGjiWCcH89IK8AX,Breaking Up Is Hard to Do,63,139200,0,['Neil Sedaka'],1960-12-30,0.743,0.799,8,-5.466,0,0.0375,0.699,0.0,0.0635,0.965,116.112
3,47mA6f44zxLtdATOoY7GjN,Georgia on My Mind - Original Master Recording,61,217415,0,['Ray Charles'],1960-09-01,0.138,0.399,7,-8.756,1,0.0311,0.782,4e-06,0.188,0.296,179.93
4,0DICNd5XQ1og9UeYzxoNFV,Baby (You've Got What It Takes),60,165760,0,"['Dinah Washington', 'Brook Benton']",1960-07-05,0.67,0.596,3,-9.347,1,0.0627,0.852,0.00203,0.653,0.813,133.396


In [9]:
top_tracks_df = tracks_df[tracks_df["popularity"]>= 80]

In [10]:
top_tracks_df.head()


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
4382,745H5CctFr12Mo7cqa1BMH,My Girl,80,165000,0,['The Temptations'],1965-03-22,0.572,0.418,0,-10.738,1,0.0349,0.635,0.0,0.0961,0.694,104.566
6382,7tqhbajSfrz2F7E1Z75ASX,Ain't No Mountain High Enough,82,151667,0,"['Marvin Gaye', 'Tammi Terrell']",1967-08-29,0.663,0.6,7,-10.87,1,0.032,0.43,0.0,0.184,0.8,129.991
6383,3yrSvpt2l1xhsV9Em88Pul,Brown Eyed Girl,80,183307,0,['Van Morrison'],1967-09-01,0.491,0.583,7,-10.964,1,0.0376,0.182,0.0,0.407,0.907,150.572
8372,6dGnYIeXmHdcikdzNNDMm2,Here Comes The Sun - Remastered 2009,83,185733,0,['The Beatles'],1969-09-26,0.557,0.54,9,-10.484,1,0.0347,0.0339,0.00248,0.179,0.394,129.171
8373,4BP3uh0hFLFRb5cjsgLqDh,Fortunate Son,83,140773,0,['Creedence Clearwater Revival'],1969-11-02,0.64,0.663,0,-7.516,1,0.0374,0.201,0.00806,0.152,0.663,132.77


In [12]:
filepath = Path('Resources/KNN_top_tracks.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
top_tracks_df.to_csv(filepath)  


In [30]:
file2 = 'resources/KNN_top_tracks.csv'
KNN_top_tracks = pd.read_csv(file2)

In [31]:
KNN_top_tracks.head()

Unnamed: 0.1,Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,4382,745H5CctFr12Mo7cqa1BMH,My Girl,80,165000,0,['The Temptations'],1965-03-22,0.572,0.418,0,-10.738,1,0.0349,0.635,0.0,0.0961,0.694,104.566
1,6382,7tqhbajSfrz2F7E1Z75ASX,Ain't No Mountain High Enough,82,151667,0,"['Marvin Gaye', 'Tammi Terrell']",1967-08-29,0.663,0.6,7,-10.87,1,0.032,0.43,0.0,0.184,0.8,129.991
2,6383,3yrSvpt2l1xhsV9Em88Pul,Brown Eyed Girl,80,183307,0,['Van Morrison'],1967-09-01,0.491,0.583,7,-10.964,1,0.0376,0.182,0.0,0.407,0.907,150.572
3,8372,6dGnYIeXmHdcikdzNNDMm2,Here Comes The Sun - Remastered 2009,83,185733,0,['The Beatles'],1969-09-26,0.557,0.54,9,-10.484,1,0.0347,0.0339,0.00248,0.179,0.394,129.171
4,8373,4BP3uh0hFLFRb5cjsgLqDh,Fortunate Son,83,140773,0,['Creedence Clearwater Revival'],1969-11-02,0.64,0.663,0,-7.516,1,0.0374,0.201,0.00806,0.152,0.663,132.77


In [32]:
KNN_top_tracks = KNN_top_tracks.drop(['Unnamed: 0'], axis=1)

In [33]:
KNN_top_tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                782 non-null    object 
 1   name              782 non-null    object 
 2   popularity        782 non-null    int64  
 3   duration_ms       782 non-null    int64  
 4   explicit          782 non-null    int64  
 5   artists           782 non-null    object 
 6   release_date      782 non-null    object 
 7   danceability      782 non-null    float64
 8   energy            782 non-null    float64
 9   key               782 non-null    int64  
 10  loudness          782 non-null    float64
 11  mode              782 non-null    int64  
 12  speechiness       782 non-null    float64
 13  acousticness      782 non-null    float64
 14  instrumentalness  782 non-null    float64
 15  liveness          782 non-null    float64
 16  valence           782 non-null    float64
 1

In [34]:
KNN_top_tracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,745H5CctFr12Mo7cqa1BMH,My Girl,80,165000,0,['The Temptations'],1965-03-22,0.572,0.418,0,-10.738,1,0.0349,0.635,0.0,0.0961,0.694,104.566
1,7tqhbajSfrz2F7E1Z75ASX,Ain't No Mountain High Enough,82,151667,0,"['Marvin Gaye', 'Tammi Terrell']",1967-08-29,0.663,0.6,7,-10.87,1,0.032,0.43,0.0,0.184,0.8,129.991
2,3yrSvpt2l1xhsV9Em88Pul,Brown Eyed Girl,80,183307,0,['Van Morrison'],1967-09-01,0.491,0.583,7,-10.964,1,0.0376,0.182,0.0,0.407,0.907,150.572
3,6dGnYIeXmHdcikdzNNDMm2,Here Comes The Sun - Remastered 2009,83,185733,0,['The Beatles'],1969-09-26,0.557,0.54,9,-10.484,1,0.0347,0.0339,0.00248,0.179,0.394,129.171
4,4BP3uh0hFLFRb5cjsgLqDh,Fortunate Son,83,140773,0,['Creedence Clearwater Revival'],1969-11-02,0.64,0.663,0,-7.516,1,0.0374,0.201,0.00806,0.152,0.663,132.77


In [None]:
#tracks_index_df = tracks_df

In [None]:
#tracks_index_df.head()

In [None]:
#tracks_df2 = tracks_df
#top_tracks_df2 = tracks_df2

In [None]:
#tracks_df2.head()

In [None]:
#tracks_df.index = tracks_df['id']

In [None]:
#tracks_df = tracks_df.drop(['id'], axis=1)

In [None]:
#tracks_df.head()

In [None]:
#top_tracks_df = tracks_df

In [None]:
#top_tracks_df.head()

In [35]:
KNN_top_tracks_proc=KNN_top_tracks.drop(['energy','key','mode','speechiness','acousticness','instrumentalness','liveness', 'valence', 'release_date'], axis=1)

In [36]:
KNN_top_tracks_proc

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,danceability,loudness,tempo
0,745H5CctFr12Mo7cqa1BMH,My Girl,80,165000,0,['The Temptations'],0.572,-10.738,104.566
1,7tqhbajSfrz2F7E1Z75ASX,Ain't No Mountain High Enough,82,151667,0,"['Marvin Gaye', 'Tammi Terrell']",0.663,-10.870,129.991
2,3yrSvpt2l1xhsV9Em88Pul,Brown Eyed Girl,80,183307,0,['Van Morrison'],0.491,-10.964,150.572
3,6dGnYIeXmHdcikdzNNDMm2,Here Comes The Sun - Remastered 2009,83,185733,0,['The Beatles'],0.557,-10.484,129.171
4,4BP3uh0hFLFRb5cjsgLqDh,Fortunate Son,83,140773,0,['Creedence Clearwater Revival'],0.640,-7.516,132.770
...,...,...,...,...,...,...,...,...,...
777,1yjY7rpaAQvKwpdUliHx0d,Still into You,80,216013,0,['Paramore'],0.602,-3.763,136.010
778,06KyNuuMOX1ROXRhj787tj,We Don't Talk Anymore (feat. Selena Gomez),80,217707,0,"['Charlie Puth', 'Selena Gomez']",0.728,-8.053,100.017
779,26wLOs3ZuHJa2Ihhx6QIE6,Teeth,80,204887,0,['5 Seconds of Summer'],0.749,-2.961,139.020
780,4umIPjkehX1r7uhmGvXiSV,Intentions (feat. Quavo),81,212867,0,"['Justin Bieber', 'Quavo']",0.806,-6.637,147.986


In [37]:
KNN_top_tracks_proc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            782 non-null    object 
 1   name          782 non-null    object 
 2   popularity    782 non-null    int64  
 3   duration_ms   782 non-null    int64  
 4   explicit      782 non-null    int64  
 5   artists       782 non-null    object 
 6   danceability  782 non-null    float64
 7   loudness      782 non-null    float64
 8   tempo         782 non-null    float64
dtypes: float64(3), int64(3), object(3)
memory usage: 55.1+ KB


In [None]:
#proctop_tracks_df = top_tracks_df.drop(['name','artists'], axis=1)

In [None]:
#top_tracks_df.info()

In [None]:
#top_tracks_df.head()

In [38]:
KNN_top_tracks_proc["popularity"] = KNN_top_tracks_proc["popularity"].astype(float)
KNN_top_tracks_proc["duration_ms"] = KNN_top_tracks_proc["duration_ms"].astype(float)
KNN_top_tracks_proc["explicit"] = KNN_top_tracks_proc["explicit"].astype(float)

In [39]:
KNN_top_tracks_proc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            782 non-null    object 
 1   name          782 non-null    object 
 2   popularity    782 non-null    float64
 3   duration_ms   782 non-null    float64
 4   explicit      782 non-null    float64
 5   artists       782 non-null    object 
 6   danceability  782 non-null    float64
 7   loudness      782 non-null    float64
 8   tempo         782 non-null    float64
dtypes: float64(6), object(3)
memory usage: 55.1+ KB


In [40]:
KNN_top_tracks_proc.columns

Index(['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artists',
       'danceability', 'loudness', 'tempo'],
      dtype='object')

In [41]:
KNN_top_tracks_proc.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,danceability,loudness,tempo
0,745H5CctFr12Mo7cqa1BMH,My Girl,80.0,165000.0,0.0,['The Temptations'],0.572,-10.738,104.566
1,7tqhbajSfrz2F7E1Z75ASX,Ain't No Mountain High Enough,82.0,151667.0,0.0,"['Marvin Gaye', 'Tammi Terrell']",0.663,-10.87,129.991
2,3yrSvpt2l1xhsV9Em88Pul,Brown Eyed Girl,80.0,183307.0,0.0,['Van Morrison'],0.491,-10.964,150.572
3,6dGnYIeXmHdcikdzNNDMm2,Here Comes The Sun - Remastered 2009,83.0,185733.0,0.0,['The Beatles'],0.557,-10.484,129.171
4,4BP3uh0hFLFRb5cjsgLqDh,Fortunate Son,83.0,140773.0,0.0,['Creedence Clearwater Revival'],0.64,-7.516,132.77


In [42]:
KNN_top_tracks_proc[['popularity', 'danceability', 'loudness', 'tempo']]= StandardScaler().fit_transform(KNN_top_tracks_proc[['popularity', 'danceability', 'loudness', 'tempo']])


In [43]:
KNN_top_tracks_proc.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,danceability,loudness,tempo
0,745H5CctFr12Mo7cqa1BMH,My Girl,-0.934751,165000.0,0.0,['The Temptations'],-0.737319,-1.699283,-0.569155
1,7tqhbajSfrz2F7E1Z75ASX,Ain't No Mountain High Enough,-0.326112,151667.0,0.0,"['Marvin Gaye', 'Tammi Terrell']",-0.1021,-1.750395,0.29096
2,3yrSvpt2l1xhsV9Em88Pul,Brown Eyed Girl,-0.934751,183307.0,0.0,['Van Morrison'],-1.302734,-1.786792,0.987206
3,6dGnYIeXmHdcikdzNNDMm2,Here Comes The Sun - Remastered 2009,-0.021793,185733.0,0.0,['The Beatles'],-0.842026,-1.600933,0.26322
4,4BP3uh0hFLFRb5cjsgLqDh,Fortunate Son,-0.021793,140773.0,0.0,['Creedence Clearwater Revival'],-0.26265,-0.451699,0.384973


In [44]:
from sklearn.neighbors import NearestNeighbors

In [45]:
model_knn = NearestNeighbors(metric='cosine', algorithm = 'brute', n_neighbors = 11)


In [55]:
idx = KNN_top_tracks_proc['id'].iloc[['15']]

In [56]:
idx

15    6A9mKXlFRPMPem6ygQSt7z
Name: id, dtype: object

In [60]:
def recommender(idx, model_knn):
    model_knn = NearestNeighbors(metric='cosine', algorithm = 'brute', n_neighbors = 1)
    model_knn.fit(KNN_top_tracks_proc)
    
    
    distances, indices = model_knn.kneighbors(KNN_top_tracks_proc, 1)
    
    
    
    
    for i in indices:
        print(KNN_top_tracks[['name','artists']].loc.where(KNN_top_tracks[index]!=idx))

In [61]:
recommender(idx, model_knn)

ValueError: could not convert string to float: '745H5CctFr12Mo7cqa1BMH'