In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df_kaggle = pd.read_csv("../data/music-dataset-1950-to-2019/tcc_ceds_music.csv")
columns = ["artist_name", "track_name", "release_date"]
df_kaggle = df_kaggle[columns]

In [4]:
df_kaggle.sample(2)

Unnamed: 0,artist_name,track_name,release_date
18148,fatback band,mister bass man,1974
4893,thursday,standing on the edge of summer,2001


Spotify dataframe has 500k songs. Initially, try to include only 50k songs on training.
Split the data with k-fold stratified algorithm, according to the artist column for even distribution 

In [6]:
df_spotify = pd.read_csv("../data/spotify_track_ids.csv")
df_spotify = df_spotify.drop("Unnamed: 0", axis=1)
df_spotify = df_spotify.rename(columns={"Song_title": "track_name", "Artist": "artist_name"})
df_spotify.track_name = df_spotify.track_name.astype(str)
df_spotify.track_name = df_spotify.track_name.apply(lambda x: x.lower())
df_spotify.artist_name = df_spotify.artist_name.astype(str)
df_spotify.artist_name = df_spotify.artist_name.apply(lambda x: x.lower())

In [19]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, random_state=421, shuffle=True)

In [20]:
for i, (train_ind, test_ind) in enumerate(skf.split(df_spotify.track_name, df_spotify.artist_name)):
    #print(train_ind, test_ind)
    df_spotify.loc[test_ind,"fold"] = i

df_spotify.fold = df_spotify.fold.astype(int)



In [21]:
df_spotify.sample(4)

Unnamed: 0,track_name,Spotify_track_id,artist_name,fold
396501,virgen de la candelaria,1OEr836z8yjUr53GIhZbKf,la sonora dinamita,3
48816,she’s gone dub,7vctZxSB4Sbw73j0l0rjRc,bob marley & the wailers,3
350447,covered in diamonds,5PhCT5CmWPEivNgoJiyqAy,famous dex,2
222867,días extraños - reprise,64GjpPIcWyR3ExCWKUv2Rw,bunbury,0


Concatenate Kaggle and Spotify Dataframes

In [22]:
df_spot_fold0 = df_spotify[df_spotify.fold == 0]
df_spot_fold0 = df_spot_fold0[["track_name", "artist_name"]].reset_index(drop=True)
len(df_spot_fold0)

102665

In [23]:
df = pd.concat([df_kaggle, df_spot_fold0],axis=0)
df = df[["artist_name", "track_name"]]
df = df.drop_duplicates()
len(df)

129306

In [24]:
df = df.reset_index(drop=True)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = CountVectorizer(max_features=15000)

count_matrix = vectorizer.fit_transform(df.track_name)

In [26]:
count_matrix = count_matrix.astype(np.int16)

In [27]:
cosine_sim = cosine_similarity(count_matrix, count_matrix, dense_output=False)

In [29]:
def get_recommendation_title(df_index) :
    ind1 = np.argsort(cosine_sim[df_index].toarray()[0])[::-1][:10]
    return df.loc[ind1]

Try some random examples (the first row of the results is the song itself)

In [30]:
ind = 820
song=df.loc[ind]["track_name"]
singer=df.loc[ind]["artist_name"]
print(singer, "  ", song)
get_recommendation_title(ind)

simon & garfunkel    he was my brother


Unnamed: 0,artist_name,track_name
820,simon & garfunkel,he was my brother
875,paul simon,he was my brother
122947,randy travis,"he's my rock, my sword, my shield"
8024,bob wills,my adobe hacienda
26617,arctic monkeys,my propeller
90606,brett eldredge,brother
107974,stereophonics,brother
1616,bread,she was my lady
54812,joey bada$$,my yout
44826,fetty wap,my environment


Save the model file to pickle

In [32]:
model_dict=dict()
model_dict["vectorizer"] = vectorizer
model_dict["dataframe"] = df
model_dict["count_matrix"] = count_matrix

In [34]:
folder = "../data/ML_models/"
filename = 'RecSys_track_name.pkl'

In [36]:
with open(folder+filename, 'wb') as file:
    # Serialize the dictionary and save it to the file
    pickle.dump(model_dict, file)