In [88]:
import re
from nltk.tokenize import sent_tokenize, word_tokenize 
import gensim 
from gensim.models import FastText
from sklearn.neighbors import NearestNeighbors
from gensim.models import Word2Vec 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")



In [241]:
from Word2vec import make_suggestion
from playlists_artists import get_similar_songs

In [242]:
# Load the three CSV exports used throughout the notebook. The paths are
# relative, so they resolve against the kernel's working directory.
# playlists          -> one row per playlist (playlist_id, playlist_name, user_id)
# user_playlist_data -> one row per (playlist, song) membership
# songs              -> song catalogue keyed by song_id
playlists = pd.read_csv('../Datasets/Copy of explicit_data - Playlist_data.csv')
user_playlist_data = pd.read_csv('../Datasets/Copy of explicit_data - User_playlists.csv')
songs = pd.read_csv('../Datasets/Copy of explicit_data - Songs - All.csv')

In [3]:
# Preview the first rows of the playlist metadata table.
playlists.head()

Unnamed: 0,playlist_id,playlist_name,user_id
0,101,Top Sinhala Hits,10001
1,102,New Sinhala songs,10004
2,103,Sinhala Rap Collection,10019
3,104,Christmas Songs Collection,10022
4,105,Mother's Songs,10065


In [17]:
# Preview the first rows of the playlist-to-song membership table.
user_playlist_data.head()

Unnamed: 0,playlist_id,timestamp,order,song_id
0,101,1396767000.0,25,312
1,101,1396767000.0,26,313
2,101,1396767000.0,24,314
3,101,1396767000.0,23,315
4,101,1396767000.0,22,316


In [5]:
# Preview the first rows of the song catalogue.
songs.head()

Unnamed: 0,song_id,Title,Artist,Artist_id,Album,Release Year
0,2,Aa Ra Sulan,Nirosha Virajini,21.0,Aa Ra Sulan,2011.0
1,283,Aale katha,"Kalpana Nayanamadu, Shermaine Willis ft Iraj",11.0,Aale Katha,2018.0
2,3,Ada Nam Ma Hada Iwasum Na,Raveen Kanishka & Kalpana Kavindi,101.0,,
3,4,Ada Thaniyen Ma Hadanne Na Ma,Shihan Mihiranga,62.0,,
4,5,Adambarai Baluwama Nam,Surani De Mel,80.0,,


In [243]:
# Assign the four column names positionally; the class below relies on the
# 'playlist_id' and 'song_id' names when merging and pivoting.
user_playlist_data.columns = ['playlist_id','timestamp','order','song_id']

In [298]:
class Automatic_Playlist_Continuation:
    """Suggest songs that could continue a playlist.

    Two strategies are combined:

    * Playlists that already contain songs ("seed tracks") get candidates
      from song co-occurrence correlations plus the module-level helpers
      ``get_similar_songs`` and ``make_suggestion`` imported by the notebook.
    * Playlists without any song fall back to the songs of other playlists
      whose *name* is similar (shared word + bag-of-words cosine similarity).
    """

    def __init__(self):
        # Last result of merge_data(); refreshed on every merge_data() call.
        self.merged_data = None

    def merge_data(self, user_playlist_data, songs):
        """Left-join playlist memberships with the song catalogue.

        Parameters
        ----------
        user_playlist_data : pandas.DataFrame
            Must contain a ``song_id`` column.
        songs : pandas.DataFrame
            Must contain a ``song_id`` column; duplicate ids are dropped so
            the join cannot multiply membership rows.

        Returns
        -------
        pandas.DataFrame
            The joined rows with an extra integer ``ratings`` column set to 1
            (implicit feedback). Also stored on ``self.merged_data``.
        """
        unique_songs = songs.drop_duplicates(['song_id'])
        merged = pd.merge(user_playlist_data, unique_songs, on="song_id", how="left")
        merged['ratings'] = np.ones((merged.shape[0],), dtype=int)
        self.merged_data = merged
        return merged

    def get_playlist_name(self, playlist_id, user_playlist_data, songs):
        """Return the merged rows that belong to ``playlist_id``.

        Despite its name this returns a DataFrame of the playlist's rows
        (one per song), not the playlist title.
        """
        df_merge = self.merge_data(user_playlist_data, songs)
        return df_merge.loc[df_merge['playlist_id'] == playlist_id]

    def playlist_songs_matrix(self, user_playlist_data, songs):
        """Build the playlist x song membership matrix.

        Returns
        -------
        pandas.DataFrame
            Index ``playlist_id``, columns ``song_id``; 1 where the playlist
            contains the song, 0 elsewhere.
        """
        df_merge = self.merge_data(user_playlist_data, songs)
        # DataFrame.pivot raises on repeated (index, column) pairs, so a song
        # listed twice in one playlist is collapsed to a single membership.
        unique_pairs = df_merge.drop_duplicates(['playlist_id', 'song_id'])
        return unique_pairs.pivot(
            index='playlist_id',
            columns='song_id',
            values='ratings'
        ).fillna(0)

    def SVD(self, user_playlist_data, songs, n_components=6):
        """Project every song onto a low-dimensional latent space.

        Parameters
        ----------
        n_components : int, default 6
            Requested number of latent dimensions. It is capped at the number
            of playlists, the feature count of the transposed matrix.

        Returns
        -------
        numpy.ndarray
            One row per song, in the column order of
            ``playlist_songs_matrix``.
        """
        ps_matrix = self.playlist_songs_matrix(user_playlist_data, songs)
        song_by_playlist = ps_matrix.values.T
        n_components = max(1, min(n_components, song_by_playlist.shape[1]))
        reducer = TruncatedSVD(n_components=n_components, random_state=0)
        return reducer.fit_transform(song_by_playlist)

    def corelations_between_songs(self, playlist_id, user_playlist_data, songs, min_corr=0.5):
        """Return songs whose latent vectors correlate with the playlist's songs.

        Parameters
        ----------
        min_corr : float, default 0.5
            Minimum Pearson correlation (inclusive) between a seed song and a
            candidate song.

        Returns
        -------
        list
            Unique song ids in first-seen order. Seed songs correlate
            perfectly with themselves, so they are part of the result.
        """
        seed_rows = self.get_playlist_name(playlist_id, user_playlist_data, songs)
        seed_tracks = seed_rows.song_id.values
        if len(seed_tracks) == 0:
            return []

        # Use the method (not a global function) so the cell works on a
        # fresh kernel.
        embeddings = self.SVD(user_playlist_data, songs)
        # corrcoef collapses to a scalar for a single song; keep it 2-D.
        corr = np.atleast_2d(np.corrcoef(embeddings))

        song_ids = self.playlist_songs_matrix(user_playlist_data, songs).columns
        row_of = {song_id: row for row, song_id in enumerate(song_ids)}

        suggestions = []
        for seed in seed_tracks:
            row = row_of.get(seed)
            if row is None:
                continue
            # NaN correlations compare as False and are therefore skipped.
            suggestions.extend(song_ids[corr[row] >= min_corr].tolist())
        return list(dict.fromkeys(suggestions))

    def Naive_based_search(self, pat, txt):
        """Return True when ``pat`` occurs anywhere inside ``txt``.

        An empty pattern matches every text; a pattern longer than the text
        never matches. The result is always a bool.
        """
        return pat in txt

    def word_extraction(self, sentence):
        """Split ``sentence`` into words, treating every non-word character as a separator."""
        return re.sub(r"[^\w]", " ", sentence).split()

    def tokenize(self, sentences):
        """Return the sorted vocabulary (unique words) across all ``sentences``."""
        vocabulary = set()
        for sentence in sentences:
            vocabulary.update(self.word_extraction(sentence))
        return sorted(vocabulary)

    def generate_bow(self, allsentences):
        """Return one bag-of-words count vector per sentence.

        All vectors share the vocabulary returned by ``tokenize`` and have
        its length; ``allsentences`` is iterated twice, so pass a list.
        """
        vocab = self.tokenize(allsentences)
        index_of = {word: position for position, word in enumerate(vocab)}
        vector_array = []
        for sentence in allsentences:
            bag_vector = np.zeros(len(vocab))
            for word in self.word_extraction(sentence):
                bag_vector[index_of[word]] += 1
            vector_array.append(bag_vector)
        return vector_array

    def cosine_similarity_calculator(self, sentence1, sentence2):
        """Cosine similarity of the bag-of-words vectors of two sentences.

        Returns
        -------
        float
            Value in [0, 1]; 0.0 when either sentence contains no word.
        """
        feature_vec1, feature_vec2 = self.generate_bow([sentence1, sentence2])
        denominator = np.linalg.norm(feature_vec1) * np.linalg.norm(feature_vec2)
        if denominator == 0:
            return 0.0
        return float(np.dot(feature_vec1, feature_vec2) / denominator)

    def string_matching(self, playlist_id, playlists, user_playlist_data, songs, min_similarity=0.5):
        """Collect songs from playlists whose name resembles this playlist's name.

        A candidate playlist qualifies when it contains at least one word of
        the target name as a substring and the cosine similarity of the two
        names is strictly greater than ``min_similarity``.

        Returns
        -------
        list
            Unique song ids in first-seen order; empty when ``playlist_id``
            is unknown or has no usable name.
        """
        df_merge = self.merge_data(user_playlist_data, songs)
        target_names = playlists.loc[
            playlists['playlist_id'] == playlist_id, 'playlist_name'
        ].tolist()
        if not target_names or not isinstance(target_names[0], str):
            return []
        target_name = target_names[0]
        words = target_name.split()

        suggestions = []
        for candidate in playlists['playlist_name'].unique():
            # Missing names are loaded as NaN floats; they cannot match.
            if not isinstance(candidate, str):
                continue
            if not any(self.Naive_based_search(word, candidate) for word in words):
                continue
            if self.cosine_similarity_calculator(target_name, candidate) <= min_similarity:
                continue
            matching_ids = playlists.loc[playlists['playlist_name'] == candidate, 'playlist_id']
            for matching_id in matching_ids:
                suggestions.extend(
                    df_merge.loc[df_merge['playlist_id'] == matching_id, 'song_id'].tolist()
                )
        return list(dict.fromkeys(suggestions))

    def check_for_seed_tracks(self, playlist_id, playlists, user_playlist_data, songs):
        """Entry point: return suggested song ids for ``playlist_id``.

        Playlists with at least one song combine the correlation-based,
        artist-based (``get_similar_songs``) and word2vec
        (``make_suggestion``) candidates. Playlists without songs use
        ``string_matching`` on the playlist name instead.

        Returns
        -------
        list
            Unique song ids in first-seen order.
        """
        df_merge = self.merge_data(user_playlist_data, songs)
        # Count the songs of *this* playlist, not the rows of the whole table.
        no_seed_tracks = int((df_merge['playlist_id'] == playlist_id).sum())
        if no_seed_tracks == 0:
            return self.string_matching(playlist_id, playlists, user_playlist_data, songs)

        predictions = []
        predictions.extend(self.corelations_between_songs(playlist_id, user_playlist_data, songs))
        predictions.extend(get_similar_songs(playlist_id, songs))
        predictions.extend(make_suggestion(songs, playlist_id, df_merge))
        return list(dict.fromkeys(predictions))

In [299]:
# Instantiate the recommender and request continuation candidates for
# playlist 103; the returned list of song ids is displayed as the cell output.
apc = Automatic_Playlist_Continuation()
apc.check_for_seed_tracks(103, playlists, user_playlist_data, songs)

[23,
 27,
 67,
 68,
 97,
 129,
 141,
 148,
 235,
 60,
 65,
 307,
 170,
 181,
 195,
 150,
 203,
 19,
 53,
 54,
 55,
 56,
 57,
 58,
 76,
 86,
 104,
 306,
 130,
 278,
 131,
 132,
 303,
 133,
 134,
 135,
 248,
 168,
 200,
 328]

In [None]:
#     def tokenize_sentences(self, list_):
#         tokens = []
#         for i in list_:
#             w = self.word_extraction(i)
#             tokens.extend(w)
#             tokens = sorted(list(tokens))
#         return tokens
    
#     def frequnt_artists_in_playlist(self, playlist_id, playlists, user_playlist_data, songs):
#         df_merge = self.merge_data(user_playlist_data, songs)
#         platlist_details = self.get_playlist_name(playlist_id, user_playlist_data, songs)
#         playlist_artist_list = platlist_details.Artist.values
#         print(playlist_artist_list)
#         tokens = self.tokenize(playlist_artist_list)
#         print(tokens)
#         #wordfreq = [tokens.count(w) for w in tokens]
#         #return wordfreq 