In [1]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Columns of the Kaggle 1950-2019 music dataset that this notebook keeps.
columns = ["artist_name", "track_name", "release_date"]

# Read the full CSV, then narrow it to the columns listed above.
df_kaggle = pd.read_csv(
    "../data/music-dataset-1950-to-2019/tcc_ceds_music.csv"
).loc[:, columns]

In [6]:
# Eyeball five randomly chosen Kaggle rows (unseeded, so they differ per run).
df_kaggle.sample(n=5)

Unnamed: 0,artist_name,track_name,release_date
18296,the crusaders,feel it,1977
9479,rickie lee jones,lush life,1983
14857,zz top,2000 blues,1990
15352,eric clapton,pilgrim,1998
4524,days of the new,shelf in the room,1997


The Spotify dataframe has about 500k songs. Initially, include only about 50k of them in training.
Split the data with a stratified k-fold algorithm on the artist column, so that each fold gets an even distribution of artists.

In [8]:
# Load the Spotify track list, drop the stray CSV index column and rename the
# title/artist columns to the names used by the Kaggle frame.
# Both text columns are then forced to str and lower-cased.
# Note: astype(str) turns a missing value into the literal string "nan".
df_spotify = (
    pd.read_csv("../data/spotify_track_ids.csv")
    .drop(columns="Unnamed: 0")
    .rename(columns={"Song_title": "track_name", "Artist": "artist_name"})
    .assign(
        track_name=lambda frame: frame["track_name"].astype(str).str.lower(),
        artist_name=lambda frame: frame["artist_name"].astype(str).str.lower(),
    )
)
df_spotify.head()

Unnamed: 0,track_name,Spotify_track_id,artist_name
0,one dance,1zi7xx7UVEFkmKfv06H8x0,drake
1,god's plan,6DCZcSspjsKoFjzjrWoCdn,drake
2,passionfruit,5mCPDVBb16L4XQwDdbRUpz,drake
3,work,72TFWvU3wUYdUuxejTTIzt,drake
4,in my feelings,2G7V7zsVDxg1yRsu7Ew9RJ,drake


In [9]:
from sklearn.model_selection import StratifiedKFold

# Ten stratified folds; the defaults are spelled out to make it visible that
# no shuffling (and therefore no random seed) is involved in the split.
skf = StratifiedKFold(n_splits=10, shuffle=False, random_state=None)

In [10]:
# Label every row with the number of the fold whose test split contains it
# (stratified on artist_name).
#
# skf.split yields *positional* row indices, so the labels are collected in a
# positional integer array instead of being written through label-based .loc:
# that would only be correct for a default RangeIndex and would first create a
# float column full of NaN that has to be cast afterwards.
fold_labels = np.full(len(df_spotify), -1, dtype=int)
for i, (train_ind, test_ind) in enumerate(skf.split(df_spotify.track_name, df_spotify.artist_name)):
    fold_labels[test_ind] = i

# Each row must land in exactly one test split; -1 would mean a row was missed.
assert (fold_labels >= 0).all(), "some rows were not assigned to a fold"
df_spotify["fold"] = fold_labels



In [11]:
# Spot-check five random rows to confirm the new fold column is populated.
df_spotify.sample(n=5)

Unnamed: 0,track_name,Spotify_track_id,artist_name,fold
300669,broche de oro - acústica,7vSuIZ04l6R4SoIK0Gp7Di,edwin luna y la trakalosa de monterrey,4
318463,sunday - tensnake remix,4rQIn2WK5rH9mnRhoKuI4S,foals,6
58990,lick it - datsik remix,6k5YEsbNtAbYprMHssqbQI,skrillex,8
423091,casting,42l1cUco5Rbju3D7JO0Ox7,kukon,1
357204,"respect (feat. juicy j, k camp)",3JijaBBcegOrE23liWUzMl,k camp,4


Concatenate Kaggle and Spotify Dataframes

In [12]:
# Keep only fold 0 of the Spotify data, restricted to the two text columns,
# and renumber its rows from zero.
df_spot_fold0 = (
    df_spotify
    .loc[df_spotify["fold"] == 0, ["track_name", "artist_name"]]
    .reset_index(drop=True)
)
len(df_spot_fold0)

51333

In [13]:
# Stack the Kaggle rows on top of the Spotify fold, keep just artist and title,
# and drop exact (artist, title) repeats. The original row labels are retained
# here, so the index may contain duplicates at this point.
df = (
    pd.concat([df_kaggle, df_spot_fold0])
    .loc[:, ["artist_name", "track_name"]]
    .drop_duplicates()
)
len(df)

76810

In [14]:
# Renumber the rows 0..len(df)-1 so that row labels coincide with the row
# positions of the matrices later built from df.track_name.
df = df.reset_index(drop=True)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Bag-of-words model over the song titles, using the vectorizer's defaults.
vectorizer = CountVectorizer()

# One row per song in df (same order), one column per vocabulary term.
count_matrix = vectorizer.fit_transform(df["track_name"])

In [16]:
# Store the term counts as 16-bit integers (presumably to shrink the matrix).
# NOTE(review): confirm the downstream similarity computation does not simply
# upcast to float again, which would make this cast a no-op for memory.
count_matrix = count_matrix.astype(np.int16)

In [17]:
# Pairwise cosine similarity between all song-title vectors, kept sparse.
# With the second argument omitted the matrix is compared against itself.
cosine_sim = cosine_similarity(count_matrix, dense_output=False)

In [18]:
def get_recommendation_title(df_index, top_n=10):
    """Return the songs whose titles are most similar to a given song.

    Reads the notebook-level ``cosine_sim`` (sparse title-similarity matrix)
    and ``df`` (song table in the same row order).

    Parameters
    ----------
    df_index : int
        Row position of the query song in ``cosine_sim`` / ``df``.
    top_n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df``, ordered by decreasing cosine
        similarity to the query title. The query song is scored against
        itself as well, so it is normally part of the result.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # One row of the sparse similarity matrix, densified to a 1-D score vector.
    scores = cosine_sim[df_index].toarray()[0]
    # argsort is ascending, so reverse it before keeping the best matches.
    best_positions = np.argsort(scores)[::-1][:top_n]
    # argsort yields row positions, hence iloc rather than label-based loc.
    return df.iloc[best_positions]

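Try some random examples. The first row of the results is normally the query song itself, since a title is most similar to itself; songs by other artists with an identical title tie with it.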

In [20]:
# Pick one song by row label, show who sings it and what it is called,
# then list the titles most similar to it.
ind = 820
song, singer = df.loc[ind, ["track_name", "artist_name"]]
print(singer, "  ", song)
get_recommendation_title(ind)

simon & garfunkel    he was my brother


Unnamed: 0,artist_name,track_name
820,simon & garfunkel,he was my brother
875,paul simon,he was my brother
76281,the hollies,he ain't heavy he's my brother - 1998 remaster
25407,alice in chains,brother
72945,kizz daniel,my g
44070,kodaline,brother
34616,troye sivan,my my my!
61701,needtobreathe,brother
6374,francesca battistelli,he knows my name
36734,mac demarco,brother
