Implementing content based recommendation system using cosine similarity for
1. Recommendation for artists.
2. Recommendation for songs.

In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

df = pd.read_csv("artists.csv")
df_song = pd.read_csv('songs.csv')

Recommendation for artists
1. Gathered all the necessary features from the csv file.
2. Fetched and combined the selected features for all the rows in the dataframe.
3. Performed countvectorization for the complete dataframe
4. Fetched a cosine similarity for the matrix.
5. For each artist, a list of tuples are created with values such as (artist_index, similarity value).
6. The list is then sorted based on the user_choice for the artist index.
7. Top n number of artists which are similar, are displayed.

In [26]:
features = ['mode','duration_ms','instrumentalness','acousticness','danceability','energy','liveness','loudness','speechiness', 'tempo','valence']

def combine_all_features(row):
    return str(row['mode']) +" "+ str(row['duration_ms']) +" "+ str(row['instrumentalness']) +" "+ str(row['acousticness']) +" "+ str(row['danceability'])+" "+ str(row["energy"])+" "+str(row["liveness"])+" "+str(row["loudness"])+" "+str(row["speechiness"])+" "+str(row["tempo"])+" "+str(row["valence"])

df['combined_features'] = df.apply(combine_all_features,axis=1)


cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combined_features'])
cosine_sim = cosine_similarity(count_matrix)


def fetch_artist_from_index(index):
    return df[df.index == index]["artists"].values[0]

def fetch_index_from_artist(artists):
    return df[df.artists == artists]["index"].values[0]

user_choice_for_singer = "Justin Timberlake"
artist_index = fetch_index_from_artist(user_choice_for_singer)

similar_artists =  list(enumerate(cosine_sim[artist_index]))

sim_artist_sort = sorted(similar_artists,key=lambda x:x[1],reverse=True)[1:]

i=0
print("Top 10 similar artists/singers to "+ user_choice_for_singer +" are:\n")
for element in sim_artist_sort:
    print(i+1,'->',fetch_artist_from_index(element[0]))
    i=i+1
    if i>=10:
        break


Top 10 similar artists/singers to Justin Timberlake are:

1 -> 9tails
2 -> Devour
3 -> Dyo
4 -> Eric Reprid
5 -> Jakkybo
6 -> Magic System
7 -> Methrone
8 -> NEIKED
9 -> Nick Swardson
10 -> Octavian


Recommendation for songs by applying the same technique

Removed some unnecessary list values for the artists and stored it in a seperate csv file.

In [27]:
import re
for index in range(len(df_song)):
    artists_list = df_song.iloc[index]['artists']
    list1 = re.findall('[\w ]+', artists_list)
    variables = []
    for val in list1:
        if re.search("^\s*$", val):
            pass
        else:
            variables.append(val)
    answer = ' '.join(variables)
    df_song.loc[index,'artists'] = answer
print(df_song)

       index  valence  year  acousticness                                       artists  danceability  duration_ms  energy  explicit  instrumentalness  liveness  loudness  mode                             name  popularity  speechiness    tempo
0          0    0.165  1921       0.96700                                  Frank Parker         0.275       210000   0.309         0          0.000028    0.3810    -9.316     1                        Danny Boy           3       0.0354  100.109
1          1    0.493  1921       0.99000                                       Georgel         0.315       190800   0.363         0          0.000000    0.2920   -12.562     0                        La Vipère           0       0.0546  174.532
2          2    0.493  1921       0.01750                                    Zay Gatsby         0.527       205072   0.691         1          0.384000    0.3580    -7.298     1                   Power Is Power           0       0.0326  159.935
3          3    0.664  1

Since the data is too huge to create a matrix for cosine similarity, I am removing some rows from the dataframe randomly.

In [28]:
import numpy as np
np.random.seed(10)

remove_n = 20000
drop_indices = np.random.choice(df_song.index, remove_n, replace=False)
df_song = df_song.drop(drop_indices)

df_song.index = range(len(df_song.index))

# df_song.to_csv('updated_songs2.csv')
print(df_song)

       index  valence  year  acousticness                                            artists  danceability  duration_ms  energy  explicit  instrumentalness  liveness  loudness  mode                                name  popularity  speechiness    tempo
0          2    0.493  1921       0.01750                                         Zay Gatsby         0.527       205072   0.691         1          0.384000    0.3580    -7.298     1                      Power Is Power           0       0.0326  159.935
1          3    0.664  1921       0.99600                    Hector Berlioz Arturo Toscanini         0.541       250747   0.283         0          0.898000    0.3930   -14.808     1                       Rákóczy March           0       0.0477  108.986
2          4    0.240  1921       0.99400                     John McCormack Edwin Schneider         0.400       187333   0.155         0          0.000043    0.1030   -13.976     1                      Mother Machree           0       0.0873  

In [29]:
df_song = pd.read_csv('updated_songs2.csv')
print(df_song)


       index  valence  year  acousticness                                            artists  danceability  duration_ms  energy  explicit  instrumentalness  liveness  loudness  mode                                name  popularity  speechiness    tempo
0          0    0.493  1921       0.01750                                         Zay Gatsby         0.527       205072   0.691         1          0.384000    0.3580    -7.298     1                      Power Is Power           0       0.0326  159.935
1          1    0.664  1921       0.99600                    Hector Berlioz Arturo Toscanini         0.541       250747   0.283         0          0.898000    0.3930   -14.808     1                       Rákóczy March           0       0.0477  108.986
2          2    0.240  1921       0.99400                     John McCormack Edwin Schneider         0.400       187333   0.155         0          0.000043    0.1030   -13.976     1                      Mother Machree           0       0.0873  

Implementing the same type of method, just working differently with different other features.

In [30]:
features = ['explicit','mode','instrumentalness','acousticness','danceability','energy',
            'liveness','loudness','speechiness', 'tempo','valence']

def combine_features(row):
    return str(row['explicit']) +" "+ str(row['mode']) +" "+ str(row['instrumentalness'])
    +" "+ str(row['acousticness']) +" "+ str(row['danceability'])+" "+ str(row["energy"])+" "+str(row["liveness"])
    +" "+str(row["loudness"])+" "+str(row["speechiness"])+" "+str(row["tempo"])+" "+str(row["valence"])


df_song['combined_features'] = df_song.apply(combine_features,axis=1)

cv = CountVectorizer()
count_matrix = cv.fit_transform(df_song['combined_features'])
cosine_sim = cosine_similarity(count_matrix)

def fetch_song_from_index(index):
    return df_song[df_song.index == index]["name"].values[0]

def fetch_index_from_song(name):
    return df_song[df_song.name == name]["index"].values[0]

user_choice_song = "Zor Laga Le"
song_index = fetch_index_from_song(user_choice_song)

similar_songs =  list(enumerate(cosine_sim[song_index]))

similar_songs_sorted = sorted(similar_songs,key=lambda x:x[1],reverse=True)[1:]

i=0
print("Top 10 similar songs to "+ user_choice_song +" are:\n")
for element in similar_songs_sorted:
    print(i+1,'->',fetch_song_from_index(element[0]))
    i=i+1
    if i>=10:
        break

Top 10 similar songs to Zor Laga Le are:

1 -> Crave You - Adventure Club Remix
2 -> Why Do You Love Me
3 -> Too Young
4 -> Te Diré
5 -> Walkin' The Floor Over You
6 -> Carry On (from the Original Motion Picture "POKÉMON Detective Pikachu")
7 -> Zor Laga Le
8 -> Turn Around
9 -> Paint It Black Medley: Black On Black In Black / Paint It Black / Laurel & Hardy / Pintello Negro / P.C.3 / Blackbird
10 -> Worlds to Run
