In [1]:
import numpy as np
import pandas as pd

In [2]:
user_listen = pd.read_csv('Copy of explicit_data - Data preprocessing - songs.csv')
songs = pd.read_csv('Copy of explicit_data - Songs - All.csv')
songs_with_artist_id = pd.read_csv('Copy of explicit_data - Songs - All-with artist_id.csv')

In [3]:
# songs.head()

In [4]:
df_merge = pd.merge(user_listen, songs.drop_duplicates(['song_id']), on="song_id", how="left")

In [5]:
# df_merge.head()

In [6]:
# merge title and artist to a one column
df_merge['song'] = df_merge[['Title', 'Artist']].apply(lambda x: ' - '.join(x), axis=1)
df_merge.head()

Unnamed: 0,user_id,age_group,gender,profession,hours_spending,musical_aspect,song_id,Title,Artist,song
0,10001,18 - 24,Female,Student,0 - 1,Tempo/speed,174,Sandaganawa,Dhanith Sri,Sandaganawa - Dhanith Sri
1,10001,18 - 24,Female,Student,0 - 1,Tempo/speed,220,Api hagum walata ida dee mohothak,Victor Rathnayaka,Api hagum walata ida dee mohothak - Victor Rat...
2,10002,25 - 34,Male,Working,2 - 3,The singer's voice,221,Mandaram Wahi Watena,6th Lane,Mandaram Wahi Watena - 6th Lane
3,10003,18 - 24,Female,Software Engineer,more than 5,The singer's voice,164,Ru Sara,Bathiya & Santhush,Ru Sara - Bathiya & Santhush
4,10004,18 - 24,Male,software engineer,more than 5,The singer's voice,6,Adanne Ay Sudu Manike,H. R. Jothipala,Adanne Ay Sudu Manike - H. R. Jothipala


In [27]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from scipy.sparse import csr_matrix

In [7]:
df_merge['listened_song'] = np.ones((441,), dtype=int)
df_merge.head()

Unnamed: 0,user_id,age_group,gender,profession,hours_spending,musical_aspect,song_id,Title,Artist,song,listened_song
0,10001,18 - 24,Female,Student,0 - 1,Tempo/speed,174,Sandaganawa,Dhanith Sri,Sandaganawa - Dhanith Sri,1
1,10001,18 - 24,Female,Student,0 - 1,Tempo/speed,220,Api hagum walata ida dee mohothak,Victor Rathnayaka,Api hagum walata ida dee mohothak - Victor Rat...,1
2,10002,25 - 34,Male,Working,2 - 3,The singer's voice,221,Mandaram Wahi Watena,6th Lane,Mandaram Wahi Watena - 6th Lane,1
3,10003,18 - 24,Female,Software Engineer,more than 5,The singer's voice,164,Ru Sara,Bathiya & Santhush,Ru Sara - Bathiya & Santhush,1
4,10004,18 - 24,Male,software engineer,more than 5,The singer's voice,6,Adanne Ay Sudu Manike,H. R. Jothipala,Adanne Ay Sudu Manike - H. R. Jothipala,1


# Playlist based on user favourites

In [23]:
def user_item_matrix(user_listen, songs):
    ratings = pd.merge(user_listen, songs.drop_duplicates(['song_id']), on="song_id", how="left")
    ratings['listened_song'] = np.ones((441,), dtype=int)
    
    # pivot ratings into song features
    df_song_features = ratings.pivot(
        index='song_id',
        columns='user_id',
        values='listened_song'
    ).fillna(0)

    return df_song_features

def sparse_matrix(user_listen, songs):
    df_song_features = user_item_matrix(user_listen, songs)
    user_item_mat = csr_matrix(df_song_features.values)
    return user_item_mat
    

In [24]:
def song_idx_mapping(user_listen, songs):
    df_song_features = user_item_matrix(user_listen, songs)
    song_to_idx = {
    song: i for i, song in 
    enumerate(list(songs.set_index('song_id').loc[df_song_features.index].Title))
    }
    return song_to_idx

In [19]:
from sklearn.neighbors import NearestNeighbors

In [25]:
def KNN_model(user_listen, songs):
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
    song_user_mat_sparse = sparse_matrix(user_listen, songs)
    model_knn.fit(song_user_mat_sparse)
    return model_knn

In [14]:
from fuzzywuzzy import fuzz



In [15]:
def fuzzy_matching(mapper, fav_song, verbose=True):
    """
    return the closest match via fuzzy ratio. 
    
    Parameters
    ----------    
    mapper: dict, map movie title name to index of the movie in data
    fav_movie: str, name of user input movie
    
    verbose: bool, print log if True
    Return
    ------
    index of the closest match
    """
    match_tuple = []
    # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), fav_song.lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    # sort
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]


In [13]:
def DPG_recommendation(model_knn, data, mapper, fav_song, n_recommendations):
    """
    return top n similar movie recommendations based on user's input movie
    Parameters
    ----------
    model_knn: sklearn model, knn model
    data: movie-user matrix
    mapper: dict, map movie title name to index of the movie in data
    fav_movie: str, name of user input movie
    n_recommendations: int, top n recommendations
    Return
    ------
    list of top n similar movie recommendations
    """
    # fit
    model_knn.fit(data)
    # get input movie index
    print('You have input movie:', fav_song)
    idx = fuzzy_matching(mapper, fav_song, verbose=True)
    
    print('Recommendation system start to make inference')
    print('......\n')
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
    
    raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_song))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[idx], dist))

In [28]:
my_favorite = 'Ru Sara'

DPG_recommendation(
    model_knn=KNN_model(user_listen, songs),
    data=sparse_matrix(user_listen, songs),
    fav_song=my_favorite,
    mapper=song_idx_mapping(user_listen, songs),
    n_recommendations=10)

You have input movie: Ru Sara
Found possible matches in our database: ['Ru Sara']

Recommendation system start to make inference
......

Recommendations for Ru Sara:
1: Ran wan mal dam, with distance of 1.0
2: Adaren (Lanwenna Hithuwata), with distance of 1.0
3: Sinha Lokaye Sinhaya, with distance of 1.0
4: Rahath himiwaru, with distance of 1.0
5: Galana ganga, with distance of 1.0
6: Akeekaru pem kathawak, with distance of 1.0
7: Husmath unui, with distance of 1.0
8: Hama deyak pene, with distance of 1.0
9: Ulkapathayak, with distance of 1.0
10: Sansara Sihine , with distance of 1.0


# top-N recommendation playlist

In [16]:
def create(df_merge, category):
    # get a count of user_ids for each unique song as recommendation score
    data_grouped = df_merge.groupby(['song']).agg({category: 'count'}).reset_index()
    data_grouped.rename(columns = {'user_id': 'score'},inplace=True)

    # Sort the songs based upon recommendation score
    data_sort = data_grouped.sort_values(['score', 'song'], ascending = [0,1])

    # Generate a recommendation rank based upon score
    data_sort['Rank'] = data_sort['score'].rank(ascending=0, method='first')

    # Get the top 10 recommendations
    popularity_recommendations = data_sort.head(10) 
    return popularity_recommendations

def top_N_recommendations(df_merge, category):
    user_recommendations = create(df_merge, category)
    #Add column for which the recommendations are being generated
    #user_recommendations['user_id'] = category
    
    cols = user_recommendations.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    user_recommendations = user_recommendations[cols]
    user_recommendations.reset_index(drop=True, inplace = True)
    return user_recommendations

In [17]:
top_N_recommendations(df_merge = df_merge,
                     category = 'user_id')

Unnamed: 0,Rank,song,score
0,1.0,Sandaganawa - Dhanith Sri,18
1,2.0,Kuweni - Ridma Weerawardena ft Dinupa Kodagoda,16
2,3.0,Pandama - Danith Dri,14
3,4.0,Sandanari - Harsha Withanage,9
4,5.0,Prathihari - Supun Perera,8
5,6.0,Nura wasanthe - Nadeemal Perera,7
6,7.0,Perawadanak - Sanuka Wickramasinghe,7
7,8.0,Chandrayan Pidu Kiranak Sagawala Horen - Daddy,5
8,9.0,Kaulu Piyan Path Wahanna - Kasun Kalhara,5
9,10.0,Radhawani - Supun Perera,5


# Playist based on user

In [17]:
from gensim.models import FastText
import re

In [9]:
# df_merge.head()

In [10]:
# Get unique items (songs) corresponding to a given user
def get_user_items(user_id):
    user_data = df_merge[df_merge['user_id'] == user_id]
    user_items = list(user_data['song'].unique())
    return user_items
    
# Get unique users for a given item (song)
def get_item_users(song):
    item_data = df_merge[df_merge['song'] == song]
    item_users = set(item_data['user_id'].unique())
    return item_users
    
# Get unique items (songs) in the training data
def get_all_items_train_data():
    all_items = list(df_merge['song'].unique())
    return all_items

def get_item_users_by_title(Title):
    item_data = df_merge[df_merge['Title'] == Title]
    item_users_ = set(item_data['user_id'].unique())
    return item_users_

In [31]:
from numpy import savetxt

In [34]:
music = songs 
song_name = music.Title.values
song_name_clean = [re.sub(r'[^\w]', ' ', str(item))for item in song_name]
song_name_clean = [re.sub(r" \d+", '', str(item.strip())) for item in song_name_clean]

sentences = list()
for item in song_name_clean:
    sentences.append(item.split())
unique_sentence = np.unique(sentences)


In [21]:

# generate similar songs for new items
def generate_similars(song_name):
    
    # load the trained model
    model = FastText.load('word2vec.model')
    
    # split the song title
    tokens = song_name.split() 
    
    suggestions = []
    
    # check for most similar items form the model
    suggestions.append(model.wv.most_similar(positive=tokens, topn=10))
    
    predictions = []
    for l in range(len(suggestions[0])):
        for i in range(len(unique_sentence)):
            for j in range(len(unique_sentence[i])):
                if unique_sentence[i][j] == suggestions[0][l][0]:
    #                 print(unique_sentence[i])
                    s = ' '
                    word = s.join(unique_sentence[i])
    #                 print(word)
                    predictions.append(word)

    return predictions

def recommend_new_items(df_merge, user_id, new_song):
    
    predictions = generate_similars(new_song)
    for item in predictions:
        for value in get_item_users_by_title(item):
            if value == user_id:
                return new_song
            else:
                continue

In [25]:
new_item = recommend_new_items(df_merge, 10129, 'Sarage Asille')
new_item

'Sarage Asille'

In [27]:
# Construct cooccurence matrix
def construct_cooccurence_matrix(user_songs, all_songs, df_merge):
    user_songs_users = []
    for i in range(0, len(user_songs)):
        user_songs_users.append(get_item_users(user_songs[i]))
            
        cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_songs), len(all_songs))), float)
        
    for i in range(0, len(all_songs)):
        # Calculate unique listeners (users) of song (item) i
        songs_i_data = df_merge[df_merge['song'] == all_songs[i]]
        users_i = set(songs_i_data['user_id'].unique())
#         print(songs_i_data)
#         print(users_i)
            
        for j in range(0, len(user_songs)):
            # Get unique listeners (users) of song (item) j
            users_j = user_songs_users[j]
                
            # Calculate intersection of listeners of songs i and j
            users_intersection = users_i.intersection(users_j)
                
            # Calculate cooccurence_matrix[i,j] as Jaccard Index
            if len(users_intersection) != 0:
                # Calculate union of listeners of songs i and j
                users_union = users_i.union(users_j)
                    
                cooccurence_matrix[j,i] = float(len(users_intersection))/float(len(users_union))
                
            else:
                cooccurence_matrix[j,i] = 0
                    
    return cooccurence_matrix
    
# Use the cooccurence matrix to make top recommendations
def generate_top_recommendations(user_id, cooccurence_matrix, all_songs, user_songs, new_song = None):
    print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))
        
    # Calculate a weighted average of the scores in cooccurence matrix for all user songs.
    user_sim_scores = cooccurence_matrix.sum(axis=0)/float(cooccurence_matrix.shape[0])
    user_sim_scores = np.array(user_sim_scores)[0].tolist()
        
    # Sort the indices of user_sim_scores based upon their value Also maintain the corresponding score
    sort_index = sorted(((e,i) for i,e in enumerate(list(user_sim_scores))), reverse=True)
    
    # Create a dataframe from the following
    columns = ['user_id', 'song', 'score', 'rank']
    # index = np.arange(1) # array of numbers for the number of samples
    df = pd.DataFrame(columns=columns)
        
    # Fill the dataframe with top 10 item based recommendations
    rank = 1 
    for i in range(0,len(sort_index)):
        if ~np.isnan(sort_index[i][0]) and all_songs[sort_index[i][1]] not in user_songs and rank <= 10:
            df.loc[len(df)]=[user_id,all_songs[sort_index[i][1]],sort_index[i][0],rank]
            rank = rank+1
        
        # Handle the case where there are no recommendations
    if df.shape[0] == 0:
        print("The current user has no songs for training the item similarity based recommendation model.")
        return -1
    else:
        if (new_song != None):
            df = df.append({'user_id' : user_id, 'song' : new_song} , ignore_index=True)
            return df
        else:
            return df
        

    # Use the item similarity based recommender system model to make recommendations
def recommend_songs(user_id, df_merge, new_song = None):
    user_songs = get_user_items(user_id)    
    print("No. of unique songs for the user: %d" % len(user_songs))
 
    all_songs = get_all_items_train_data()
        
    print("no. of unique songs in the training set: %d" % len(all_songs))
    
    cooccurence_matrix = construct_cooccurence_matrix(user_songs, all_songs, df_merge)

    if (new_song != None):
        new_item = recommend_new_items(df_merge, user_id, new_song)
        df_recommendations = generate_top_recommendations(user_id, cooccurence_matrix, all_songs, user_songs, new_item)
    else:
        df_recommendations = generate_top_recommendations(user_id, cooccurence_matrix, all_songs, user_songs)
                
                
    return df_recommendations

        

In [28]:
recommend_songs(user_id = 10001, df_merge = df_merge, new_song = 'Saragee Asille')

No. of unique songs for the user: 2
no. of unique songs in the training set: 227
Non zero values in cooccurence_matrix :24


Unnamed: 0,user_id,song,score,rank
0,10001,Pandama - Danith Dri,0.051724,1
1,10001,Saragaye (Niya Rata Mawanawa) - Sanuka Wickram...,0.05,2
2,10001,Kuweni - Ridma Weerawardena ft Dinupa Kodagoda,0.048387,3
3,10001,Prathihari - Supun Perera,0.041667,4
4,10001,Oba apple malak wage - Amarasiri Peiris,0.027778,5
5,10001,Mage Unmade - Sangeeth Iddamalgoda,0.027778,6
6,10001,Runawiye - DKM ft. YAKA,0.027778,7
7,10001,Samawak na - Cairo Rich,0.027778,8
8,10001,Hitha Mithuru Sulaga - Victor Rathnayaka,0.027778,9
9,10001,Surath Suwaya(Nil Warala Pura) - Supun Perera ...,0.027778,10


In [None]:
# # Construct cooccurence matrix
# def construct_cooccurence_matrix(user_songs, all_songs, df_merge):
#     user_songs_users = []
#     for i in range(0, len(user_songs)):
#         user_songs_users.append(get_item_users(user_songs[i]))
            
#         cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_songs), len(all_songs))), float)
        
#     for i in range(0, len(all_songs)):
#         # Calculate unique listeners (users) of song (item) i
#         songs_i_data = df_merge[df_merge['song'] == all_songs[i]]
#         users_i = set(songs_i_data['user_id'].unique())
#         print(songs_i_data)
#         print(users_i)
            
#         for j in range(0, len(user_songs)):
#             # Get unique listeners (users) of song (item) j
#             users_j = user_songs_users[j]
                
#             # Calculate intersection of listeners of songs i and j
#             users_intersection = users_i.intersection(users_j)
                
#             # Calculate cooccurence_matrix[i,j] as Jaccard Index
#             if len(users_intersection) != 0:
#                 # Calculate union of listeners of songs i and j
#                 users_union = users_i.union(users_j)
                    
#                 cooccurence_matrix[j,i] = float(len(users_intersection))/float(len(users_union))
                
#             else:
#                 cooccurence_matrix[j,i] = 0
                    
#     return cooccurence_matrix
    

# content-based feeding

For the music data, we have "song_id", "title", "release", "artist_name", "year"

We can first use Word2Vec to convert the title and artist name into vectors, then with some normalization, combining it with year, we can use knn to generate the song embedding (V) for unseen item and feed it back to the collobrative model in order to recommend unseen songs to users

In [3]:
import re
from nltk.tokenize import sent_tokenize, word_tokenize 
from gensim.models import word2vec

In [4]:
import gensim 

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [6]:
import os

In [90]:
from keras.models import Sequential
from keras.layers import Embedding

In [7]:
songs.head()

Unnamed: 0,song_id,Title,Artist
0,2,Aa Ra Sulan,Nirosha Virajini
1,283,Aale katha,"Kalpana Nayanamadu, Shermaine Willis ft Iraj"
2,3,Ada Nam Ma Hada Iwasum Na,Raveen Kanishka & Kalpana Kavindi
3,4,Ada Thaniyen Ma Hadanne Na Ma,Shihan Mihiranga
4,5,Adambarai Baluwama Nam,Surani De Mel


In [69]:
# # Create a list of strings, one for each title
# titles_list = [title for title in songs['Title']]
# big_title_string = ' '.join(titles_list)

# # Tokenize the string into words
# tokens = word_tokenize(big_title_string)

# # Remove non-alphabetic tokens, such as punctuation
# words = [word.lower() for word in tokens if word.isalpha()]

# # Print first 10 words
# words[:10]


In [9]:
# model = gensim.models.Word2Vec.load("word2vec.model")

In [70]:
# vector_list = [model[word] for word in words if word in model.wv.vocab]

# # Create a list of the words corresponding to these vectors
# words_filtered = [word for word in words if word in model.wv.vocab]

# # Zip the words together with their vector representations
# word_vec_zip = zip(words_filtered, vector_list)

# # Cast to a dict so we can turn it into a DataFrame
# word_vec_dict = dict(word_vec_zip)
# df = pd.DataFrame.from_dict(word_vec_dict, orient='index')
# df.head(3)

In [23]:
# artist = set(song_df.artist_name.values)
music = songs 
song_name = music.Title.values
song_name_clean = [re.sub(r'[^\w]', ' ', str(item))for item in song_name]
song_name_clean = [re.sub(r" \d+", '', str(item.strip())) for item in song_name_clean]

In [25]:
sentences = list()
for item in song_name_clean:
    sentences.append(item.split())
unique_sentence = np.unique(sentences) # build the model on all unique sentence but not all sentence to save time

In [105]:
unique_sentence

array([list(['Aa', 'Ra', 'Sulan']), list(['Aale', 'katha']),
       list(['Ada', 'Nam', 'Ma', 'Hada', 'Iwasum', 'Na']),
       list(['Ada', 'Thaniyen', 'Ma', 'Hadanne', 'Na', 'Ma']),
       list(['Adambarai', 'Baluwama', 'Nam']),
       list(['Adanne', 'Ay', 'Sudu', 'Manike']),
       list(['Adaraneeya', 'Neranjana']), list(['Adaraya', 'Ayai']),
       list(['Adare', 'sithum']), list(['Adarema', 'Geethayak']),
       list(['Adaren', 'Lanwenna', 'Hithuwata']),
       list(['Aduru', 'kutiya', 'thula']), list(['Ae']),
       list(['Ahasin', 'eha']), list(['Ahasin', 'polowata']),
       list(['Ai', 'Kale', 'Adare']),
       list(['Ai', 'kale', 'mulu', 'hadinma']),
       list(['Akeekaru', 'pem', 'kathawak']), list(['Alawanthakam']),
       list(['Alen', 'Ma']), list(['Alen', 'Wela', 'Ganna']),
       list(['Amma', 'Sandaki']), list(['Anagathaye']),
       list(['Ananthayata', 'Yana', 'Para', 'Dige']), list(['Ananthaye']),
       list(['Anatha', 'maruthe']),
       list(['Api', 'hagum', 'wa

In [28]:
# ## create word2vec model
# # Set values for NN parameters
# num_features = 50    # Word vector dimensionality                      
# min_word_count = 1                      
# num_workers = 1      # Number of CPUs
# context = 3          # Context window size; 
                                                                                                        
# downsampling = 1e-3   # threshold for configuring which 
#                       # higher-frequency words are randomly downsampled

# # Initialize and train the model 
# model_wv = gensim.models.Word2Vec(unique_sentence, workers=num_workers, \
#             size=num_features, min_count = min_word_count, \
#             window = context, sample = downsampling, sg = 1)

# # model_wv.build_vocab(unique_sentence, progress_per=200)

# # model_wv.train(unique_sentence, total_examples = model.corpus_count, 
# #             epochs=10, report_delay=1)

# # If you don't plan to train the model any further, calling 
# # init_sims will make the model much more memory-efficient.
# model_wv.init_sims(replace=True)

In [122]:
# model_wv.save("word2vec.model")

In [29]:
# X = model[model.wv.vocab]

# X.shape

  """Entry point for launching an IPython kernel.


(653, 50)

In [15]:
# import umap
# import umap.umap_ as umap

In [16]:
# cluster_embedding = umap.UMAP(n_neighbors=30, min_dist=0.0,
#                               n_components=2, random_state=42).fit_transform(X)

In [18]:
# def similar_products(v, n = 6):
#     products_dict =  {}
#     # extract most similar products for the input vector
#     ms = model.similar_by_vector(v, topn= n+1)[1:]
    
#     # extract name and similarity score of the similar products
#     new_ms = []
#     for j in ms:
#         pair = (products_dict[j[0]][0], j[1])
#         new_ms.append(pair)
        
#     return new_ms     

In [68]:
# model_wv.most_similar(u'Wawannema') # similar word to love

In [66]:
# model_wv.most_similar('Ru')

In [67]:
# print(model_wv.predict_output_word(['Saragi'], topn = 5))

In [49]:
from gensim.models import FastText

In [50]:
## create word2vec model
# Set values for NN parameters
num_features = 50    # Word vector dimensionality                      
min_word_count = 1                      
num_workers = 1      # Number of CPUs
context = 3          # Context window size; 
                                                                                                        
downsampling = 1e-3   # threshold for configuring which 
                      # higher-frequency words are randomly downsampled

# Initialize and train the model 
model_wv = FastText(workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, sg = 1)

model_wv.build_vocab(sentences = unique_sentence)
model_wv.train(sentences = unique_sentence,  total_examples=len(unique_sentence), epochs=10)

model_wv.init_sims(replace=True)

In [51]:
model_wv.save('word2vec.model')
model = FastText.load('word2vec.model')

In [52]:
model.most_similar('Adare')

  """Entry point for launching an IPython kernel.


[('Adarema', 0.5569604635238647),
 ('Adaren', 0.48059654235839844),
 ('mal', 0.459030419588089),
 ('Pokuru', 0.3751647472381592),
 ('Yaal', 0.3603614270687103),
 ('nube', 0.35141992568969727),
 ('Adaraneeya', 0.33996152877807617),
 ('Nathi', 0.3362344205379486),
 ('Sulagak', 0.32758551836013794),
 ('Pinna', 0.30353084206581116)]

In [53]:
model.similarity('Adare', 'Adaren')

  """Entry point for launching an IPython kernel.


0.48059654

In [56]:
print(model.wv['Saragi'])

[-0.00484305 -0.03475127 -0.00661441  0.05109205  0.02670166  0.02411269
  0.01072268 -0.00809084  0.01354577 -0.04733701 -0.03353004  0.02331018
  0.00455881  0.03103582  0.0403338  -0.00735925  0.03735436  0.01877754
  0.06927647  0.01262611 -0.05566176  0.04455525  0.01797638  0.08560292
  0.02440351 -0.02470134 -0.04367522 -0.01514639 -0.00401336 -0.04286401
 -0.04277678  0.03670257  0.06413483 -0.01559428  0.0487486   0.02490747
  0.03766687  0.03939747  0.03528021  0.02880044  0.0327778  -0.00210523
  0.0300089   0.03726153 -0.01770418  0.01076948  0.0094786  -0.03558464
  0.0769246   0.00633751]


In [58]:
model.most_similar('Asille')

  """Entry point for launching an IPython kernel.


[('Arabumama', 0.374399870634079),
 ('Samawak', 0.36850494146347046),
 ('Rangume', 0.34819692373275757),
 ('Lanka', 0.345318466424942),
 ('mohothak', 0.3197134733200073),
 ('atha', 0.3117147386074066),
 ('Dun', 0.30848634243011475),
 ('Assata', 0.29812291264533997),
 ('Hadinma', 0.29117846488952637),
 ('Pitasakwala', 0.29103660583496094)]

In [119]:
model.most_similar(positive=['Saragi', 'Asille'], topn=5)

  """Entry point for launching an IPython kernel.


[('Samawak', 0.4077220857143402),
 ('Rangume', 0.3338375687599182),
 ('Assata', 0.33264869451522827),
 ('Arabumama', 0.32684507966041565),
 ('Dun', 0.32256919145584106)]

In [71]:
suggestions = []
suggestions.append(model.most_similar('Asille'))
suggestions

  


[[('Arabumama', 0.374399870634079),
  ('Samawak', 0.36850494146347046),
  ('Rangume', 0.34819692373275757),
  ('Lanka', 0.345318466424942),
  ('mohothak', 0.3197134733200073),
  ('atha', 0.3117147386074066),
  ('Dun', 0.30848634243011475),
  ('Assata', 0.29812291264533997),
  ('Hadinma', 0.29117846488952637),
  ('Pitasakwala', 0.29103660583496094)]]

In [92]:
suggestions[0]

[('Arabumama', 0.374399870634079),
 ('Samawak', 0.36850494146347046),
 ('Rangume', 0.34819692373275757),
 ('Lanka', 0.345318466424942),
 ('mohothak', 0.3197134733200073),
 ('atha', 0.3117147386074066),
 ('Dun', 0.30848634243011475),
 ('Assata', 0.29812291264533997),
 ('Hadinma', 0.29117846488952637),
 ('Pitasakwala', 0.29103660583496094)]

In [80]:
unique_sentence[1]

['Aale', 'katha']

In [93]:
for i in range (len(suggestions[0])):
    print(suggestions[0][i][0])

Arabumama
Samawak
Rangume
Lanka
mohothak
atha
Dun
Assata
Hadinma
Pitasakwala


In [97]:
predictions = []
for l in range(len(suggestions[0])):
    for i in range(len(unique_sentence)):
        for j in range(len(unique_sentence[i])):
            if unique_sentence[i][j] == suggestions[0][l][0]:
#                 print(unique_sentence[i])
                s = ' '
                word = s.join(unique_sentence[i])
#                 print(word)
                predictions.append(word)
                
predictions

['Arabumama Kadulak Wela Ma Bala Iddi',
 'Samawak na',
 'Rangume',
 'Lanka Matha',
 'Api hagum walata ida dee mohothak',
 'Athinwath atha',
 'Sadha Tharu Mal Mata Dan Dun',
 'Hitha Assata',
 'Ay Kale Mulu Hadinma Oba Mata Adare',
 'Duka Danna Nisai Mulu Hadinma',
 'Pitasakwala Yaane']

In [107]:
df_merge.head()

Unnamed: 0,user_id,age_group,gender,profession,hours_spending,musical_aspect,song_id,Title,Artist,song,listened_song
0,10001,18 - 24,Female,Student,0 - 1,Tempo/speed,174,Sandaganawa,Dhanith Sri,Sandaganawa - Dhanith Sri,1
1,10001,18 - 24,Female,Student,0 - 1,Tempo/speed,220,Api hagum walata ida dee mohothak,Victor Rathnayaka,Api hagum walata ida dee mohothak - Victor Rat...,1
2,10002,25 - 34,Male,Working,2 - 3,The singer's voice,221,Mandaram Wahi Watena,6th Lane,Mandaram Wahi Watena - 6th Lane,1
3,10003,18 - 24,Female,Software Engineer,more than 5,The singer's voice,164,Ru Sara,Bathiya & Santhush,Ru Sara - Bathiya & Santhush,1
4,10004,18 - 24,Male,software engineer,more than 5,The singer's voice,6,Adanne Ay Sudu Manike,H. R. Jothipala,Adanne Ay Sudu Manike - H. R. Jothipala,1


In [120]:
string = 'Saragi Asille'
l = string.split()
l

['Saragi', 'Asille']

In [138]:
def get_item_users_by_title(Title):
    item_data = df_merge[df_merge['Title'] == Title]
    item_users = set(item_data['user_id'].unique())
    return item_users

In [123]:
print(get_item_users('Arabumama Kadulak Wela Ma Bala Iddi'))

{10129, 10221}


In [125]:
# generate similar songs for new items
def generate_similars(song_name):
    
    # load the trained model
    model = FastText.load('word2vec.model')
    
    # split the song title
    tokens = song_name.split() 
    
    suggestions = []
    
    # check for most similar items form the model
    suggestions.append(model.wv.most_similar(positive=tokens, topn=10))
    
    predictions = []
    for l in range(len(suggestions[0])):
        for i in range(len(unique_sentence)):
            for j in range(len(unique_sentence[i])):
                if unique_sentence[i][j] == suggestions[0][l][0]:
    #                 print(unique_sentence[i])
                    s = ' '
                    word = s.join(unique_sentence[i])
    #                 print(word)
                    predictions.append(word)

    return predictions
    

In [139]:
def recommend_new_items(song_name, user_id, df_merge):
    
    predictions = generate_similars(song_name)
    for item in predictions:
        for value in get_item_users_by_title(item):
            if value == user_id:
                return song_name
            else:
                continue
    return 0
            

In [129]:
print(recommend_new_items('Saragi Asille', 10129, df_merge))

Saragi Asille


In [102]:
# import numpy as np
# from numpy import linalg as LA

In [65]:
# def cosine_distance (model, word,target_list , num) :
#     cosine_dict ={}
#     word_list = []
#     a = model[word]
#     for item in target_list :
#         for i in item:
#             if i != word :
#                 b = model [i]
#                 cos_sim = np.dot(a, b)/(LA.norm(a)*LA.norm(b))
#                 cosine_dict[i] = cos_sim
#     dist_sort=sorted(cosine_dict.items(), key=lambda dist: dist[1],reverse = True) ## in Descedning order 
#     for item in dist_sort:
#         word_list.append((item[0], item[1]))
#     return word_list[0:num]

# cosine_distance (model_wv,'Adare',unique_sentence,10)


In [98]:
# https://medium.com/cisco-emerge/creating-semantic-representations-of-out-of-vocabulary-words-for-common-nlp-tasks-842dbdafba18
# https://towardsdatascience.com/fasttext-under-the-hood-11efc57b2b3
# https://pathmind.com/wiki/word2vec
# https://github.com/manasRK/word2vec-recommender/blob/master/loadReviewModel.py
# https://medium.com/building-creative-market/word2vec-inspired-recommendations-in-production-f2c6a6b5b0bf
# https://arxiv.org/pdf/1601.01356.pdf
# https://towardsdatascience.com/word2vec-for-phrases-learning-embeddings-for-more-than-one-word-727b6cf723cf
# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
# https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92?#702d
# https://towardsdatascience.com/using-word2vec-for-music-recommendations-bb9649ac2484
# https://github.com/YIZHE12/music_recom/blob/master/music_recommendation_binary.ipynb
# https://towardsdatascience.com/using-word2vec-to-analyze-news-headlines-and-predict-article-success-cdeda5f14751
# https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
# https://www.analyticsvidhya.com/blog/2019/07/how-to-build-recommendation-system-word2vec-python/

Converting the song title one by and one and average the word vector is too time-consuming, we can take advantage of the GPU by building a neural network model and fix the weights as the Word2Vec weights to convert our data to vector

In [85]:
# X = sentences
# EMBEDDING_DIM = num_features
# max_length = max([len(s) for s in X])
# # maximum length of a number of ingredients

# tokenizer_obj = Tokenizer()
# tokenizer_obj.fit_on_texts(X)

# X_token = tokenizer_obj.texts_to_sequences(X)
# X_pad = pad_sequences(X_token, maxlen = max_length, padding = 'post')

In [88]:
# embeddings_index = {}
# f = open(os.path.join('','song_tile_embedding.txt'), encoding = 'utf-8')
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:])
#     embeddings_index[word] = coefs
    
# f.close()

# word_index = tokenizer_obj.word_index
# num_words = len(word_index) + 1

# embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector[-EMBEDDING_DIM:]

In [93]:
# model_wv_seq.save_weights('model_wv_seq.hdf5')

In [105]:
# model_wv_seq

<keras.engine.sequential.Sequential at 0x1d66dca1508>

In [64]:
# new_title = 'Saragi Asille'
# new_sentences = []
# new_sentences.append(new_title.split())
# new_sentences
# # print(new_title.split())
# # new_name_clean = [re.sub(r'[^\w]', ' ', str(item))for item in new_title]
# # new_name_clean = [re.sub(r" \d+", '', str(item.strip())) for item in new_name_clean]
# # new_sentences = list()
# # for item in new_name_clean:
# #     new_sentences.append(item.split())
# # new_name_clean

In [63]:
# X_token = tokenizer_obj.texts_to_sequences(new_sentences)
# X_pad_new = pad_sequences(X_token, maxlen = max_length, padding = 'post')
# Song_vector_new = model_wv_seq.predict(X_pad_new)

# Song_vector_copy = Song_vector_new.copy()
# Song_vector_copy[Song_vector_copy == 0] = np.nan
# means_new_song = np.nanmean(Song_vector_copy, axis=1) # the first axis of mean is example

In [113]:
# np.nonzero(Song_vector_new)

(array([], dtype=int64), array([], dtype=int64), array([], dtype=int64))

In [114]:
# Song_vector = model_wv_seq.predict(X_pad[0:16,:],  batch_size = 4)

In [62]:
# Song_vector_copy = Song_vector.copy()
# Song_vector_copy[Song_vector_copy == 0] = np.nan
# means = np.nanmean(Song_vector_copy, axis=1) # the first axis of mean is example

In [61]:
# np.shape(means)

In [60]:
# means[0]

In [117]:
# from sklearn.metrics.pairwise import cosine_similarity

In [59]:
# scores = cosine_similarity(means[0].reshape(1, -1), means[2].reshape(1, -1))
# scores