In [None]:
import pandas as pd

In [None]:
# Load the raw song table from the Excel workbook.
# NOTE: `sep` is a read_csv option; read_excel does not accept it (current
# pandas raises TypeError), so it is not passed here.
df = pd.read_excel('song_data.xlsx')
df.head()

In [None]:
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 

import warnings;
warnings.filterwarnings('ignore')

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# count the missing entries in each column
df.isna().sum()

In [None]:
# drop, in place, every row that has at least one missing value
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# make sure every song title is a string
df = df.astype({'song_name': str})

In [None]:
# distinct song durations, in order of first appearance
duration = df["song_duration_ms"].drop_duplicates().tolist()
len(duration)

In [None]:
# Shuffle the unique durations with a dedicated, seeded generator so that
# Restart & Run All produces the same train/validation split every time
# (an unseeded random.shuffle gives a different split on each kernel start).
random.Random(14).shuffle(duration)

# keep the first 90% of the shuffled durations for training
n_train = round(0.9 * len(duration))
duration_train = duration[:n_train]

# split rows into train and validation sets by whether their duration was
# selected for training; the membership mask is computed once and reused
is_train_row = df['song_duration_ms'].isin(duration_train)
train_df = df[is_train_row]
validation_df = df[~is_train_row]

In [None]:
# Build the training corpus: one list of song titles per training duration.
# Group the frame once instead of re-filtering train_df for every duration
# (the per-duration boolean filter scans the whole frame on each iteration).
titles_by_duration = {
    dur: names.tolist()
    for dur, names in train_df.groupby("song_duration_ms")["song_name"]
}

# Follow the order of duration_train so the resulting list matches what the
# per-duration scan produced; a duration with no rows yields an empty list.
songs_train = [titles_by_duration.get(dur, []) for dur in tqdm(duration_train)]

In [None]:
# Build the validation corpus: one list of song titles per validation duration.
# A single groupby replaces re-filtering validation_df for every duration;
# sort=False keeps the groups in order of first appearance, which is the
# order that .unique() returns.
songs_val = [
    names.tolist()
    for _, names in tqdm(
        validation_df.groupby("song_duration_ms", sort=False)["song_name"]
    )
]

In [None]:
# Word2Vec hyper-parameters, gathered in one place for readability
w2v_params = dict(
    window=10,
    sg=1,
    hs=0,
    negative=10,       # for negative sampling
    alpha=0.03,
    min_alpha=0.0007,
    seed=14,
)

# create the (untrained) model, then build its vocabulary from the training lists
model = Word2Vec(**w2v_params)
model.build_vocab(songs_train, progress_per=100)

# train word2vec model for 10 passes over the training lists
model.train(
    songs_train,
    total_examples=model.corpus_count,
    epochs=10,
    report_delay=1,
)

In [None]:
# NOTE(review): in gensim 3.x init_sims(replace=True) presumably L2-normalises
# the vectors in place and discards the raw ones (so no further training is
# possible); the method is reported as deprecated in gensim 4 — confirm
# against the installed gensim version.
model.init_sims(replace=True)

In [None]:
# show the model's string representation
print(model)

In [None]:
# Extract all learned vectors as one 2-D array, one row per vocabulary entry.
# The vectors are read from the model's KeyedVectors (model.wv): indexing the
# model object directly and the wv.vocab dict are no longer available in
# gensim 4. Rows follow the model's internal vocabulary index order.
X = model.wv.vectors

X.shape

In [None]:
# Keep one row per distinct song_name. drop_duplicates is chained so that it
# returns a new frame, rather than being applied in place to a column slice
# of train_df (the pattern behind pandas' SettingWithCopyWarning).
songs = train_df[["song_name", "track_name"]].drop_duplicates(subset='song_name')

# map each song_name to the list of its track_name values; after the
# de-duplication above every list holds exactly one entry
songs_dict = songs.groupby('song_name')['track_name'].apply(list).to_dict()

In [None]:
# sanity-check the lookup with one title; raises KeyError if 'By The Way'
# is not a key of songs_dict
songs_dict['By The Way']

In [None]:
def similar_products(v, n = 6):
    """Return the songs whose embeddings are closest to the vector ``v``.

    Parameters
    ----------
    v : array-like
        Query vector, passed to ``model.wv.similar_by_vector``.
    n : int, default 6
        Number of neighbours to return.

    Returns
    -------
    list of (track_name, similarity) tuples
        ``track_name`` is the first entry stored in ``songs_dict`` for the
        neighbouring song; ``similarity`` is the score reported by the model.

    Raises
    ------
    KeyError
        If a neighbour returned by the model has no entry in ``songs_dict``.
    """
    # Ask for one extra hit and skip the closest one: when ``v`` is the vector
    # of a vocabulary song, that first hit is presumably the song itself.
    # The query goes through model.wv because the similarity helpers on the
    # model object itself are deprecated/removed in recent gensim releases.
    neighbours = model.wv.similar_by_vector(v, topn=n + 1)[1:]

    # translate each neighbour's key into its stored name, keeping the score
    return [(songs_dict[key][0], score) for key, score in neighbours]

In [None]:
# Recommend songs similar to 'Ordinary People'. The query vector is looked up
# through model.wv because indexing the model object directly is not
# supported in gensim 4. Raises KeyError if the title is not in the model's
# vocabulary.
similar_products(model.wv['Ordinary People'])