In [88]:
import re
from nltk.tokenize import sent_tokenize, word_tokenize 
import gensim 
from gensim.models import FastText
from sklearn.neighbors import NearestNeighbors
from gensim.models import Word2Vec 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")



In [241]:
from Word2vec import make_suggestion
from playlists_artists import get_similar_songs

In [242]:
playlists = pd.read_csv('../Datasets/Copy of explicit_data - Playlist_data.csv')
user_playlist_data = pd.read_csv('../Datasets/Copy of explicit_data - User_playlists.csv')
songs = pd.read_csv('../Datasets/Copy of explicit_data - Songs - All.csv')

In [3]:
playlists.head()

Unnamed: 0,playlist_id,playlist_name,user_id
0,101,Top Sinhala Hits,10001
1,102,New Sinhala songs,10004
2,103,Sinhala Rap Collection,10019
3,104,Christmas Songs Collection,10022
4,105,Mother's Songs,10065


In [17]:
user_playlist_data.head()

Unnamed: 0,playlist_id,timestamp,order,song_id
0,101,1396767000.0,25,312
1,101,1396767000.0,26,313
2,101,1396767000.0,24,314
3,101,1396767000.0,23,315
4,101,1396767000.0,22,316


In [5]:
songs.head()

Unnamed: 0,song_id,Title,Artist,Artist_id,Album,Release Year
0,2,Aa Ra Sulan,Nirosha Virajini,21.0,Aa Ra Sulan,2011.0
1,283,Aale katha,"Kalpana Nayanamadu, Shermaine Willis ft Iraj",11.0,Aale Katha,2018.0
2,3,Ada Nam Ma Hada Iwasum Na,Raveen Kanishka & Kalpana Kavindi,101.0,,
3,4,Ada Thaniyen Ma Hadanne Na Ma,Shihan Mihiranga,62.0,,
4,5,Adambarai Baluwama Nam,Surani De Mel,80.0,,


In [243]:
user_playlist_data.columns = ['playlist_id','timestamp','order','song_id']

In [276]:
class Automatic_Playlist_Continuation:
    def __init__(self):
        self.merged_data = None
        
    def merge_data(self, user_playlist_data, songs):
        self.merged_data = pd.merge(user_playlist_data, songs.drop_duplicates(['song_id']), on="song_id", how="left")
        self.merged_data['ratings'] = np.ones((self.merged_data.shape[0],), dtype=int)
        return self.merged_data
    
    def get_playlist_name(self, playlist_id, user_playlist_data, songs):
        df_merge = self.merge_data(user_playlist_data, songs)
        playlist_name = df_merge.loc[df_merge['playlist_id']==playlist_id]
        return playlist_name

    def playlist_songs_matrix(self, user_playlist_data, songs):
        df_merge = self.merge_data(user_playlist_data, songs)
        playlist_songs_matrix = df_merge.pivot(
                index='playlist_id',
                columns='song_id',
                values='ratings'
            ).fillna(0)
        return playlist_songs_matrix
    
    def SVD(self, user_playlist_data, songs):
        df_merge = self.merge_data(user_playlist_data, songs)
        ps_matrix = self.playlist_songs_matrix(user_playlist_data, songs)
        X = ps_matrix.values.T
        SVD = TruncatedSVD(n_components=6, random_state=0)
        matrix = SVD.fit_transform(X)
        return matrix
    
    def corelations_between_songs(self, playlist_id, user_playlist_data, songs):
        df_merge = self.merge_data(user_playlist_data, songs)
        playlist_details = self.get_playlist_name(playlist_id, user_playlist_data, songs)
        seed_tracks = playlist_details.song_id.values
        matrix = SVD(df_merge)
        corr = np.corrcoef(matrix)

        ps_matrix = self.playlist_songs_matrix(user_playlist_data, songs)
        song_ids= ps_matrix.columns
        Song_id_list = list(song_ids)
        suggestions = []
        uniques = []
        for i in range(len(seed_tracks)):
            track_rating = title_list.index(seed_tracks[i])
            corr_samia  = corr[track_rating]
            suggestions.append(song_ids[(corr_samia >= 0.5)])
        for i in range(len(suggestions)):
            for j in suggestions[i]:
                uniques.append(j)
        uniques = list(dict.fromkeys(uniques))
        return uniques  
    
    def Naive_based_search(self, pat, txt): 
        M = len(pat) 
        N = len(txt) 

        # A loop to slide pat[] one by one */ 
        for i in range(N - M + 1): 
            j = 0        
            # For current index i, check  
            # for pattern match */ 
            while(j < M): 
                if (txt[i + j] != pat[j]): 
                    break
                j += 1

            if (j == M):  
                return True
            else:
                return False
 
    def word_extraction(self, sentence):       
        words = re.sub("[^\w]", " ",  sentence).split()    
        cleaned_text = [w for w in words]    
        return cleaned_text

    def tokenize(self, sentences):   
        words = []    
        for sentence in sentences:        
            w = self.word_extraction(sentence)        
            words.extend(w)            
            words = sorted(list(set(words)))    
            return words

    def generate_bow(self, allsentences):        
        vocab = self.tokenize(allsentences)    
        # print("Word List for Document \n{0} \n".format(vocab));
        vector_array = []
        for sentence in allsentences:        
            words = self.word_extraction(sentence)        
            bag_vector = np.zeros(len(vocab))        
            for w in words:            
                for i,word in enumerate(vocab):                
                    if word == w:                     
                        bag_vector[i] += 1                            
    #         print("{0}\n{1}\n".format(sentence,np.array(bag_vector)))
            vector_array.append(np.array(bag_vector))
        return vector_array

    def cosine_similarity_calculator(self, sentence1, sentence2):
        allsentences = [sentence1, sentence2]
        vocab = self.tokenize(allsentences)
        array = self.generate_bow(allsentences)

        feature_vec1 = array[0]
        feature_vec2 = array[1]

        c = 0

        for i in range(len(vocab)): 
            c+= feature_vec1[i]*feature_vec2[i] 
        cosine = c / float((sum(feature_vec1)*sum(feature_vec2))**0.5) 
        return cosine 
    
    def string_matching(self, playlist_id, playlists, user_playlist_data, songs):
        df_merge = self.merge_data(user_playlist_data, songs)
        playlist_name = playlists['playlist_name'].loc[playlists['playlist_id'] == playlist_id]
        playlist_name = [name for name in playlist_name]
    #     print(playlist_name)
        words = playlist_name[0].split()
        # wighted_word = words[0]
        # print(playlists.playlist_name.values)
        suggestions = []
        for playlist in playlists.playlist_name.values:
            for i in words:
                if ( self.Naive_based_search(i, playlist) and self.cosine_similarity_calculator(playlist_name[0], playlist) > 0.5):
                    playlist_id = playlists['playlist_id'].loc[playlists['playlist_name'] == playlist]
                    for i in playlist_id:
                        songs = df_merge['song_id'].loc[df_merge['playlist_id'] == i]
                        for i in songs:
                            suggestions.append(i)
                continue
        suggestions = list(dict.fromkeys(suggestions))
        return suggestions 
    
    def tokenize_sentences(self, list_):
        tokens = []
        for i in list_:
            w = self.word_extraction(i)
            tokens.extend(w)
            tokens = sorted(list(tokens))
        return tokens
    
    def frequnt_artists_in_playlist(self, playlist_id, playlists, user_playlist_data, songs):
        df_merge = self.merge_data(user_playlist_data, songs)
        platlist_details = self.get_playlist_name(playlist_id, user_playlist_data, songs)
        playlist_artist_list = platlist_details.Artist.values
        tokens = tokenize_sentences(playlist_artist_list)
        
        wordfreq = [tokens.count(w) for w in tokens]
        return wordfreq    
        
    def check_for_seed_tracks(self, playlist_id, playlists, user_playlist_data, songs):
        df_merge = self.merge_data(user_playlist_data, songs)
        no_seed_tracks = len(df_merge.playlist_id.values)
        if (no_seed_tracks >= 1):
            cbs = self.corelations_between_songs(playlist_id, user_playlist_data, songs)
            return cbs
        elif no_seed_tracks == 0:
            strmatching = self.string_matching(playlist_id, playlists, user_playlist_data, songs)
            return strmatching 

In [275]:
apc = Automatic_Playlist_Continuation()
# artists = songs.Artist.values
# apc.tokenize_sentences(artists)
apc.frequnt_artists_in_playlist(103, playlists, user_playlist_data, songs)
# apc.check_for_seed_tracks(103, playlists, user_playlist_data, songs)

NameError: name 'tokenize_sentences' is not defined

In [16]:
def playlist_songs_matrix(df_merge):
    playlist_songs_matrix = df_merge.pivot(
            index='playlist_id',
            columns='song_id',
            values='ratings'
        ).fillna(0)
    return playlist_songs_matrix

In [117]:
# playlists with seed tracks, use them to get the corelations using Matrix factorization
def corelations_between_songs(playlist_id, df_merge):
    playlist_details = get_playlist_name(playlist_id, df_merge)
    seed_tracks = playlist_details.song_id.values
    matrix = SVD(df_merge)
    corr = np.corrcoef(matrix)
    
    ps_matrix = playlist_songs_matrix(df_merge)
    song_ids= ps_matrix.columns
    Song_id_list = list(song_ids)
    suggestions = []
    uniques = []
    for i in range(len(seed_tracks)):
        track_rating = title_list.index(seed_tracks[i])
        corr_samia  = corr[track_rating]
        suggestions.append(song_ids[(corr_samia >= 0.5)])
    for i in range(len(suggestions)):
        for j in suggestions[i]:
            uniques.append(j)
    uniques = list(dict.fromkeys(uniques))
    return uniques   

def SVD(df_merge):
    ps_matrix = playlist_songs_matrix(df_merge)
    X = ps_matrix.values.T
    SVD = TruncatedSVD(n_components=6, random_state=0)
    matrix = SVD.fit_transform(X)
    return matrix


In [237]:
corelations_between_songs(103, df_merge)

[23, 27, 67, 68, 97, 129, 141, 148, 235]

In [110]:
# playlist_details = get_playlist_name(104, df_merge)
# seed_tracks = playlist_details.song_id.values
# seed_tracks

array([322, 323, 324, 325, 326, 327], dtype=int64)

In [95]:
# matrix = SVD(df_merge)
# corr = np.corrcoef(matrix)

In [112]:
# ps_matrix = playlist_songs_matrix(df_merge)
# song_ids= ps_matrix.columns
# title_list = list(song_ids)
# samia = title_list.index(323)
# corr_samia  = corr[samia]
# list(song_ids[(corr_samia >= 0.5)])

[322, 323, 324, 325, 326, 327, 328, 329, 330]

In [85]:
corelations_between_songs(104, df_merge)

[324, 322, 327, 326, 325, 323]

In [229]:
# token-based string matching palylists with no songs
def string_matching(playlist_id, playlists, df_merge):
    playlist_name = playlists['playlist_name'].loc[playlists['playlist_id'] == playlist_id]
    playlist_name = [name for name in playlist_name]
#     print(playlist_name)
    words = playlist_name[0].split()
    # wighted_word = words[0]
    # print(playlists.playlist_name.values)
    suggestions = []
    for playlist in playlists.playlist_name.values:
        for i in words:
            if ( Naive_based_search(i, playlist) and cosine_similarity_(playlist_name[0], playlist) > 0.5):
                playlist_id = playlists['playlist_id'].loc[playlists['playlist_name'] == playlist]
                for i in playlist_id:
                    songs = df_merge['song_id'].loc[df_merge['playlist_id'] == i]
                    for i in songs:
                        suggestions.append(i)
            continue
    suggestions = list(dict.fromkeys(suggestions))
    return suggestions   
        
def Naive_based_search(pat, txt): 
    M = len(pat) 
    N = len(txt) 
  
    # A loop to slide pat[] one by one */ 
    for i in range(N - M + 1): 
        j = 0        
        # For current index i, check  
        # for pattern match */ 
        while(j < M): 
            if (txt[i + j] != pat[j]): 
                break
            j += 1
  
        if (j == M):  
            return True
        else:
            return False
 
def word_extraction(sentence):       
    words = re.sub("[^\w]", " ",  sentence).split()    
    cleaned_text = [w.lower() for w in words]    
    return cleaned_text

def tokenize(sentences):   
    words = []    
    for sentence in sentences:        
        w = word_extraction(sentence)        
        words.extend(w)            
        words = sorted(list(set(words)))    
        return words

def generate_bow(allsentences):        
    vocab = tokenize(allsentences)    
    # print("Word List for Document \n{0} \n".format(vocab));
    vector_array = []
    for sentence in allsentences:        
        words = word_extraction(sentence)        
        bag_vector = np.zeros(len(vocab))        
        for w in words:            
            for i,word in enumerate(vocab):                
                if word == w:                     
                    bag_vector[i] += 1                            
#         print("{0}\n{1}\n".format(sentence,np.array(bag_vector)))
        vector_array.append(np.array(bag_vector))
    return vector_array

def cosine_similarity_(sentence1, sentence2):
    allsentences = [sentence1, sentence2]
    vocab = tokenize(allsentences)
    array = generate_bow(allsentences)
    
    feature_vec1 = array[0]
    feature_vec2 = array[1]
    
    c = 0
    
    for i in range(len(vocab)): 
        c+= feature_vec1[i]*feature_vec2[i] 
    cosine = c / float((sum(feature_vec1)*sum(feature_vec2))**0.5) 
    return cosine     

In [223]:
string_matching(104, playlists, df_merge)

[322, 323, 324, 325, 326, 327, 328, 329, 330]

In [None]:
def check_for_seed_tracks(playlist_id, playlists, df_merge):
    no_seed_tracks = len(df_merge.playlist_id.values)
    if no_seed_tracks >= 1:
        cbs = corelations_between_songs(playlist_id, df_merge)
        return cbs
    elif no_seed_tracks == 0:
        strmatching = string_matching(playlist_id, playlists, df_merge)
        return strmatching        

In [226]:
# from difflib import SequenceMatcher 

# def similar(str1, str2): 
#     return SequenceMatcher(None, str1, str2).ratio() 
  
# # Initializing strings 
# test_string1 = 'Christmas Songs Collection'
# test_string2 = 'Iraj Songs Collection '
  
# # using SequenceMatcher.ratio() 
# # similarity between strings 
# res = similar(test_string1, test_string2) 
  
# # printing the result 
# print ("The similarity between 2 strings is : " + str(res)) 

The similarity between 2 strings is : 0.7916666666666666


In [79]:
christmashits = df_merge.loc[df_merge['playlist_id']==104]
christmashits.head()

Unnamed: 0,playlist_id,timestamp,order,song_id,Title,Artist,Artist_id,Album,Release Year
52,104,1396801000.0,1,322,Adara Mage Jesuni,Rookantha Gunathilaka,,,
53,104,1396801000.0,2,323,Ahas Gabe Sura Duwak,Roshan Ranawana,,,
54,104,1396801000.0,3,324,Ahas Thale Nagei Ruwan,Seetha Nanayakkara,,,
55,104,1396801000.0,4,325,Bethleheme Ada Ra Upanna,Ivo Dennis,,,
56,104,1396801000.0,5,326,Hari Asai Man Jesu Amme,Chandani Hettiarachchi,,,


In [80]:
song_name = christmashits.Title.values
song_name_clean = [re.sub(r'[^\w]', ' ', str(item))for item in song_name]
song_name_clean = [re.sub(r" \d+", '', str(item.strip())) for item in song_name_clean]
sentences = list()
for item in song_name_clean:
    sentences.append(item.split())
unique_sentence = np.unique(sentences)
print(unique_sentence)

[list(['Adara', 'Mage', 'Jesuni']) list(['Ahas', 'Gabe', 'Sura', 'Duwak'])
 list(['Ahas', 'Thale', 'Nagei', 'Ruwan'])
 list(['Bethleheme', 'Ada', 'Ra', 'Upanna'])
 list(['Hari', 'Asai', 'Man', 'Jesu', 'Amme'])
 list(['Jesu', 'Bilinda', 'Pabalu', 'Mitaka', 'Pabalu', 'Potaka'])]


In [81]:
bof = []
for i in range (len(unique_sentence)):
    for j in range (len(unique_sentence[i])):
        bof.append(unique_sentence[i][j])
bof
#     print(unique_sentence[i])


['Adara',
 'Mage',
 'Jesuni',
 'Ahas',
 'Gabe',
 'Sura',
 'Duwak',
 'Ahas',
 'Thale',
 'Nagei',
 'Ruwan',
 'Bethleheme',
 'Ada',
 'Ra',
 'Upanna',
 'Hari',
 'Asai',
 'Man',
 'Jesu',
 'Amme',
 'Jesu',
 'Bilinda',
 'Pabalu',
 'Mitaka',
 'Pabalu',
 'Potaka']

In [82]:
model = Word2Vec(workers=1, \
            size=50, min_count = 1, \
            window = 3, sample = 1e-3, sg = 1)
model.build_vocab(sentences = unique_sentence)
model.train(sentences = unique_sentence,  total_examples=len(sentences), epochs=10)
model.init_sims(replace=True)

In [83]:
def avg_sentence_vector(song_id, model, num_features, bof):
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    song = songs['Title'].loc[songs['song_id'] == song_id]
    for i in song:
        words = i.split()
    for word in words:
        if word in bof:
            nwords = nwords+1
            featureVec = np.add(featureVec, model[word])

    if nwords>0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [88]:
sentence_1_avg_vector = avg_sentence_vector(326, model, 50, bof)
sentence_2_avg_vector = avg_sentence_vector(322, model, 50, bof)

sen1_sen2_similarity =  cosine_similarity(sentence_1_avg_vector.reshape(1, -1),sentence_2_avg_vector.reshape(1, -1))

  # This is added back by InteractiveShellApp.init_path()


In [89]:
sen1_sen2_similarity

array([[0.25944144]], dtype=float32)

In [91]:
suggestion = []
for i in sen1_sen2_similarity:
    if i >0:
        suggestion.append(326)

In [92]:
suggestion

[326]

In [48]:
featureVec = np.zeros((50,), dtype="float32")
nwords = 0
words = 'Jesu Jesu Oba Innawa'
for word in words.split():
    if word in bof:
            nwords = nwords+1
            featureVec = np.add(featureVec, model[word])
if nwords>0:
    featureVec = np.divide(featureVec, nwords)
featureVec

  import sys


array([-0.17404573, -0.08310194, -0.15794115,  0.04011351, -0.18104142,
       -0.2023017 ,  0.15160263,  0.16216323, -0.25352368,  0.13920687,
       -0.06056456, -0.13932937,  0.19024621,  0.02643881, -0.03822826,
        0.03730683, -0.02529872,  0.10398544, -0.25690755,  0.14317258,
        0.21698433, -0.11592035,  0.17221488,  0.2573282 ,  0.07583775,
       -0.02905918, -0.22023591,  0.09582233, -0.13842624,  0.15990545,
        0.07964834,  0.05270344,  0.17850854,  0.21490698, -0.10688012,
        0.09963941, -0.19339395, -0.03871592,  0.21963869, -0.0247773 ,
        0.01091725, -0.04160064, -0.05900051,  0.11428913, -0.03796794,
       -0.17467375,  0.10172103, -0.15207782, -0.11435612,  0.10123568],
      dtype=float32)

In [49]:
featureVec2 = np.zeros((50,), dtype="float32")
nwords = 0
words = 'Adara Mage Jesuni'
for word in words.split():
    if word in bof:
            nwords = nwords+1
            featureVec2 = np.add(featureVec2, model[word])
if nwords>0:
    featureVec2 = np.divide(featureVec, nwords)
featureVec2

  import sys


array([-0.05801524, -0.02770065, -0.05264705,  0.01337117, -0.06034714,
       -0.0674339 ,  0.05053421,  0.05405441, -0.08450789,  0.04640229,
       -0.02018819, -0.04644312,  0.0634154 ,  0.00881294, -0.01274275,
        0.01243561, -0.00843291,  0.03466181, -0.08563585,  0.04772419,
        0.07232811, -0.03864012,  0.05740496,  0.08577607,  0.02527925,
       -0.00968639, -0.07341197,  0.03194078, -0.04614208,  0.05330181,
        0.02654945,  0.01756782,  0.05950284,  0.07163566, -0.03562671,
        0.03321313, -0.06446465, -0.01290531,  0.0732129 , -0.0082591 ,
        0.00363908, -0.01386688, -0.01966684,  0.03809638, -0.01265598,
       -0.05822458,  0.03390701, -0.05069261, -0.03811871,  0.03374523],
      dtype=float32)

In [71]:
featureVec3 = np.zeros((50,), dtype="float32")
nwords = 0
words = 'Kithu Saminde'
for word in words.split():
    if word in bof:
            nwords = nwords+1
            featureVec2 = np.add(featureVec2, model[word])
if nwords>0:
    featureVec3 = np.divide(featureVec, nwords)
featureVec3

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [72]:
# featureVec = np.array(featureVec)
# featureVec.shape
# featureVec2 = np.array(featureVec2)
# print(cosine_similarity(featureVec.reshape(-1, 1), featureVec2.reshape(-1, 1)))
sen1_sen2_similarity =  cosine_similarity(featureVec3.reshape(1, -1),featureVec2.reshape(1, -1))

In [73]:
sen1_sen2_similarity

array([[0.]], dtype=float32)

In [None]:
def avg_sentence_vector(words, model, num_features, index2word_set):
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in bof:
            nwords = nwords+1
            featureVec = np.add(featureVec, model[word])

    if nwords>0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [30]:
num_features = 50    # Word vector dimensionality                      
min_word_count = 1                      
num_workers = 1      # Number of CPUs
context = 3          # Context window size; 

downsampling = 1e-3   # threshold for configuring which 
                              # higher-frequency words are randomly downsampled

# Initialize and train the model 
model = FastText(unique_sentence, min_count=1,size= 50,workers=3, window =3, sg = 1)
# model = Word2Vec(workers=num_workers, \
#             size=num_features, min_count = min_word_count, \
#             window = context, sample = downsampling, sg = 1)


# model.build_vocab(sentences = unique_sentence)
# model.train(sentences = unique_sentence,  total_examples=len(sentences), epochs=10)
# model.init_sims(replace=True)
# model.save('Fasttext_playlist.model')

# model = FastText.load('Fasttext_playlist.model')

# # split the song title
# music = songs
# song_titles = music.Titles.values

# tokens = song_name.split() 
        
# suggestions = []

#         # check for most similar items form the model
#         suggestions.append(model.wv.most_similar(positive=tokens, topn=10))

#         predictions = []
#         for l in range(len(suggestions[0])):
#             for i in range(len(unique_sentence)):
#                 for j in range(len(unique_sentence[i])):
#                     if unique_sentence[i][j] == suggestions[0][l][0]:
# #                         print(unique_sentence[i])
#                         s = ' '
#                         word = s.join(unique_sentence[i])
# #                         print(word)
#                         predictions.append(word)

#         return predictions

In [31]:
model.most_similar('Jesu')

  """Entry point for launching an IPython kernel.


[('Jesuni', 0.3984375),
 ('Mage', 0.22209984064102173),
 ('Upanna', 0.1727343201637268),
 ('Adara', 0.15519069135189056),
 ('Nagei', 0.15251873433589935),
 ('Asai', 0.12244722992181778),
 ('Ra', 0.12014706432819366),
 ('Ada', 0.11284378916025162),
 ('Sura', 0.11033094674348831),
 ('Pabalu', 0.05470521003007889)]

In [33]:
songs_titles = songs.Title.values
songs_titles

array(['Aa Ra Sulan', 'Aale katha', 'Ada Nam Ma Hada Iwasum Na',
       'Ada Thaniyen Ma Hadanne Na Ma', 'Adambarai Baluwama Nam',
       'Adanne Ay Sudu Manike', 'Adaraneeya Neranjana ', 'Adaraya Ayai',
       'Adare sithum', 'Adarema Geethayak', 'Adaren (Lanwenna Hithuwata)',
       'Aduru kutiya thula ', 'Ae', 'Ahasin eha', 'Ahasin polowata',
       'Ai Kale Adare', 'Ai kale mulu hadinma', 'Akeekaru pem kathawak',
       'Alawanthakam', 'Alen Ma', 'Alen Wela Ganna', 'Amma Sandaki',
       'Anagathaye', 'Ananthayata Yana Para Dige', 'Ananthaye ',
       'Anatha maruthe ', 'Api hagum walata ida dee mohothak',
       'Api kauruda', 'Arabumama Kadulak Wela Ma Bala Iddi',
       'Atha Kadukara Himau Arane',
       'Atha Ran Wiman Thulin Pata Selayen Sadi', 'Athinwath atha',
       'Athithaya Sihinayak Pamanai', 'Athsana',
       'Awado Sansare Ma Ha Badee', 'Awathan hade',
       'Ay Kale Mulu Hadinma Oba Mata Adare', 'Ayage Sinaha',
       'Baila Gamuda', 'Billa', ' Sina Podak Wee', 'Ch

In [None]:
model = gensim.models.Doc2Vec(alpha=0.025, min_alpha=0.025)


In [9]:
artists = pd.read_csv('Copy of explicit_data - Artists - All.csv')

In [10]:
artists.head()

Unnamed: 0,artist_id,artist
0,123,6th Lane
1,63,ajith muthukumarana
2,1,Amarasiri Peris
3,73,Amasha Tissera
4,48,Amila Nidahasa


In [12]:
df_merge.dtypes

playlist_id       int64
timestamp       float64
order             int64
song_id           int64
Title            object
Artist           object
Artist_id       float64
Album            object
Release Year    float64
dtype: object