In [1]:
# 1. Audio Features:
# These features capture the musical characteristics of songs and are usually extracted from the audio signal. You can obtain these features from audio analysis libraries like Librosa or from music streaming platforms' APIs like Spotify's.

# Tempo: The beats per minute (BPM) of the song.
# Loudness: The overall volume of the song.
# Danceability: How suitable the song is for dancing based on rhythm and tempo.
# Energy: The intensity and activity level of the song.
# Acousticness: The amount of acoustic elements in the song.
# Instrumentalness: The likelihood of the song being instrumental.
# Valence: The positivity or happiness of the song.
# Speechiness: The presence of spoken words in the song.
# Key: The musical key of the song.
# Mode: Major or minor key.
# Duration: Length of the song in seconds.
    
# 2. Metadata Features:
# These features provide information about the song, such as artist, genre, release year, and more.

# Artist: The artist or band that performed the song.
# Album: The album the song belongs to.
# Genre: The genre(s) associated with the song.
# Release Year: The year the song was released.
# Language: The language of the song's lyrics.
# Popularity: The popularity rating of the song on platforms like Spotify.
# Explicit Content: Whether the song contains explicit content.

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('data.csv')
print(df.shape)
df.tail()

(170653, 19)


Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
170648,0.608,2020,0.0846,"['Anuel AA', 'Daddy Yankee', 'KAROL G', 'Ozuna...",0.786,301714,0.808,0,0KkIkfsLEJbrcIhYsCL7L5,0.000289,7,0.0822,-3.702,1,China,72,2020-05-29,0.0881,105.029
170649,0.734,2020,0.206,['Ashnikko'],0.717,150654,0.753,0,0OStKKAuXlxA0fMH54Qs6E,0.0,7,0.101,-6.02,1,Halloweenie III: Seven Days,68,2020-10-23,0.0605,137.936
170650,0.637,2020,0.101,['MAMAMOO'],0.634,211280,0.858,0,4BZXVFYCb76Q0Klojq4piV,9e-06,4,0.258,-2.226,0,AYA,76,2020-11-03,0.0809,91.688
170651,0.195,2020,0.00998,['Eminem'],0.671,337147,0.623,1,5SiZJoLXp3WOl3J4C8IK0d,8e-06,2,0.643,-7.161,1,Darkness,70,2020-01-17,0.308,75.055
170652,0.642,2020,0.132,"['KEVVO', 'J Balvin']",0.856,189507,0.721,1,7HmnJHfs0BkFzX4x8j0hkl,0.00471,7,0.182,-4.928,1,Billetes Azules (with J Balvin),74,2020-10-16,0.108,94.991


In [5]:
df = df[df['year']>2000]

In [6]:
df['artists']

15806                                   ['Jimmy Eat World']
15807                                             ['Train']
15808                                        ['Papa Roach']
15809                                            ['Weezer']
15810                                  ['System Of A Down']
                                ...                        
170648    ['Anuel AA', 'Daddy Yankee', 'KAROL G', 'Ozuna...
170649                                         ['Ashnikko']
170650                                          ['MAMAMOO']
170651                                           ['Eminem']
170652                                ['KEVVO', 'J Balvin']
Name: artists, Length: 39494, dtype: object

In [7]:
import random
seed_value = 42
pd.np.random.seed(seed_value)
dff = df.sample(n=5000,).reset_index()

In [8]:
dff.drop(['id','release_date','duration_ms'],axis=1,inplace=True)

In [9]:
dfff = dff

In [10]:
dff = dff[['valence','acousticness','artists','danceability','energy','explicit','instrumentalness','key','liveness','loudness','mode','popularity','speechiness','tempo']]

In [11]:
import ast
def fetch_list(strings):
    l=[]
    for i in ast.literal_eval(strings):
        l.append(i)
        return l

In [12]:
dff['artists'] = dff['artists'].apply(fetch_list)

In [13]:
dff['artists'] = dff['artists'].apply(lambda x: " ".join(x))

In [14]:
dff['artists'] = dff['artists'].apply(lambda x: x.lower())

In [15]:
dff['artists'].value_counts()

taylor swift        29
eminem              27
drake               27
jay-z               21
lil wayne           20
                    ..
miami horror         1
andré sobota         1
death grips          1
netta                1
imaginary future     1
Name: artists, Length: 2725, dtype: int64

In [16]:
dff['artists'].unique()

array(['ingrid michaelson', 'linkin park', 'youngboy never broke again',
       ..., 'tye tribbett', 'saves the day', 'imaginary future'],
      dtype=object)

In [17]:
dff['artists']=pd.factorize(dff['artists'])[0]

In [18]:
dff

Unnamed: 0,valence,acousticness,artists,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
0,0.305,0.7310,0,0.886,0.259,0,0.000005,5,0.1080,-9.246,1,54,0.0423,119.894
1,0.412,0.0252,1,0.654,0.660,0,0.037100,5,0.2780,-5.594,1,53,0.0283,108.039
2,0.566,0.1790,2,0.627,0.685,1,0.000000,1,0.1070,-6.296,0,59,0.2300,78.858
3,0.312,0.7930,3,0.425,0.294,0,0.000008,10,0.3670,-10.754,1,56,0.0498,186.027
4,0.542,0.0159,4,0.322,0.934,0,0.000000,8,0.4180,-4.066,1,44,0.1200,113.199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.079,0.1740,482,0.527,0.473,1,0.000008,11,0.1530,-8.709,0,54,0.0940,130.274
4996,0.570,0.3620,871,0.811,0.726,1,0.000002,2,0.1210,-6.726,1,64,0.3230,150.061
4997,0.654,0.2080,950,0.616,0.636,0,0.000000,1,0.5170,-7.055,1,59,0.0325,110.618
4998,0.298,0.8870,2724,0.637,0.129,0,0.000000,11,0.0719,-17.082,1,68,0.0346,125.802


In [19]:
dff['mode'] = dff['mode'].astype(bool)

In [20]:
dff['explicit'] = dff['explicit'].astype(bool)

In [21]:
from sklearn.metrics.pairwise import cosine_similarity


In [22]:
vectors = dff.to_numpy()

In [23]:
vectors

array([[0.305, 0.731, 0, ..., 54, 0.0423, 119.894],
       [0.412, 0.0252, 1, ..., 53, 0.0283, 108.039],
       [0.5660000000000001, 0.179, 2, ..., 59, 0.23, 78.858],
       ...,
       [0.654, 0.208, 950, ..., 59, 0.0325, 110.618],
       [0.298, 0.887, 2724, ..., 68, 0.0346, 125.802],
       [0.332, 0.496, 806, ..., 57, 0.0368, 143.713]], dtype=object)

In [24]:
similarity = cosine_similarity(vectors)

In [25]:
similarity.shape

(5000, 5000)

In [26]:
sorted(similarity[0],reverse=True)

[1.0,
 0.9999114873181338,
 0.9991907275195212,
 0.9991167376654243,
 0.9989530826689297,
 0.9987689784673304,
 0.998719458913734,
 0.9985501609041244,
 0.9985470891721717,
 0.9985079753192424,
 0.9983379210886784,
 0.9981804745071905,
 0.9981482819844755,
 0.997843024598019,
 0.9978195146456403,
 0.997777128182901,
 0.9976358908874972,
 0.9976248173939295,
 0.9973183538799333,
 0.9971803209858584,
 0.997120178957965,
 0.9971112925753907,
 0.9970930528166122,
 0.997010472162411,
 0.9969900427656387,
 0.996830480499979,
 0.9967916671565221,
 0.996572610363356,
 0.9961202076945416,
 0.9960204458197398,
 0.9960163783879308,
 0.9958563024180082,
 0.9957543200892504,
 0.9956570283405123,
 0.9956197590590243,
 0.9954717482572758,
 0.995421191973903,
 0.9952612281359349,
 0.995014979638808,
 0.994999844423125,
 0.9949553151519541,
 0.994876274467932,
 0.994701666044223,
 0.9945915395984426,
 0.9943806318615362,
 0.993996908192705,
 0.9938920256316898,
 0.9935296099116712,
 0.9933256013996891,

In [27]:
list(enumerate(similarity[0]))

[(0, 1.0),
 (1, 0.9991167376654243),
 (2, 0.9754991954434095),
 (3, 0.9911786290993434),
 (4, 0.9969900427656387),
 (5, 0.9985501609041244),
 (6, 0.9971803209858584),
 (7, 0.997777128182901),
 (8, 0.9926449006556133),
 (9, 0.9920048193815081),
 (10, 0.9960204458197398),
 (11, 0.9899324237531786),
 (12, 0.9897758678986847),
 (13, 0.9806643258171412),
 (14, 0.9917102865207947),
 (15, 0.991604773966501),
 (16, 0.9900791738441722),
 (17, 0.9906714391718778),
 (18, 0.9833387145644513),
 (19, 0.9874780251422834),
 (20, 0.9917438404655186),
 (21, 0.9792804162316625),
 (22, 0.9838568821719753),
 (23, 0.9905083296523618),
 (24, 0.9851729574800071),
 (25, 0.9749058775969904),
 (26, 0.9777633836321428),
 (27, 0.9859825831530851),
 (28, 0.9475577657279655),
 (29, 0.9489758819570907),
 (30, 0.9737146433636231),
 (31, 0.9692351599315547),
 (32, 0.9707639699237199),
 (33, 0.9289889843242708),
 (34, 0.9808979519622342),
 (35, 0.9517112045973607),
 (36, 0.9635049689544535),
 (37, 0.9669517684328558),
 

In [28]:
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:11]

[(4225, 0.9999114873181338),
 (1535, 0.9991907275195212),
 (1, 0.9991167376654243),
 (2209, 0.9989530826689297),
 (4286, 0.9987689784673304),
 (2954, 0.998719458913734),
 (5, 0.9985501609041244),
 (4416, 0.9985470891721717),
 (1828, 0.9985079753192424),
 (2006, 0.9983379210886784)]

In [29]:
dfff[dfff["name"]=='Sister'].index

Int64Index([4506], dtype='int64')

In [31]:
def recommend(song):
    song_index = dfff[dfff["name"]==song].index[0]
    distance = similarity[song_index]
    songs_list = sorted(list(enumerate(distance)),reverse=True,key=lambda x:x[1])[1:11]
    for i in songs_list:
        print(dfff.iloc[i[0]]['name'])

In [32]:
dfff.tail()


Unnamed: 0,index,valence,year,acousticness,artists,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,tempo
4995,73780,0.079,2013,0.174,['J. Cole'],0.527,0.473,1,8e-06,11,0.153,-8.709,0,Trouble,54,0.094,130.274
4996,154977,0.57,2018,0.362,"['Lil Jon', 'Offset', '2 Chainz']",0.811,0.726,1,2e-06,2,0.121,-6.726,1,Alive (with Offset & 2 Chainz),64,0.323,150.061
4997,107536,0.654,2014,0.208,['The Beatles'],0.616,0.636,0,0.0,1,0.517,-7.055,1,Yellow Submarine,59,0.0325,110.618
4998,74543,0.298,2017,0.887,"['Imaginary Future', 'Kina Grannis']",0.637,0.129,0,0.0,11,0.0719,-17.082,1,I Will Spend My Whole Life Loving You,68,0.0346,125.802
4999,91192,0.332,2015,0.496,['Chris Stapleton'],0.564,0.435,0,5e-06,0,0.0861,-9.387,1,Was It 26,57,0.0368,143.713


In [33]:
recommend('Bang Bang')

Crank That (Soulja Boy)
Wrapped Up (feat. Travie McCoy)
Me Encantaría
I Can Tell
Todo Porque Te Amo
I'd Rather
Paz en Este Amor
The Funeral
Bang My Head (feat. Sia & Fetty Wap)
Pursuit Of Happiness - Extended Steve Aoki Remix (Explicit)


In [34]:
recommend('Memories Are Made Of')

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
dfff[dfff['name']=='3 Strange Days']

In [None]:
dfff

In [36]:
import pickle

In [38]:
pickle.dump(dfff,open('songs.pkl','wb'))

In [39]:
pickle.dump(dff,open('data.pkl','wb'))

In [37]:
pickle.dump(similarity,open('similarity.pkl','wb'))