In [1]:
!nvidia-smi

Sun Jul 12 16:07:53 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 108...  On   | 00000000:65:00.0  On |                  N/A |
| 25%   46C    P0    59W / 250W |    782MiB / 11170MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|    0  

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
from utils import load_data
from embedding import get_token_data, get_s2v_model, get_p2v_model

In [93]:
train, val, song_meta = load_data()

In [6]:
import pandas as pd
total = pd.concat([train, val], axis=0)

In [8]:
from konlpy.tag import Komoran
# Instantiate the tagger: binding the bare class (no parentheses) would make
# instance-method calls such as `komoran.pos(text)` fail.
komoran = Komoran()

In [9]:
def get_morpher():
    """Build and return a new KoNLPy ``Komoran`` tagger instance."""
    # Function-scope import: konlpy is imported when this function is called.
    from konlpy.tag import Komoran
    return Komoran()

def get_title_morph(title, morpher):
    """Tag ``title`` with ``morpher`` and return the tagger's result unchanged.

    Args:
        title: Text handed to ``morpher.pos``.
        morpher: Any object exposing a ``pos(text)`` method (a Komoran
            instance elsewhere in this notebook).

    Returns:
        Whatever ``morpher.pos(title)`` returns; callers in this notebook
        read element ``[0]`` of each item as the token text.
    """
    tagged = morpher.pos(title)
    return tagged

In [10]:
komoran = get_morpher()

In [18]:
title_morph = [[y[0] for y in get_title_morph(x, komoran)] for x in total['plylst_title']]

In [23]:
from embedding import get_s2v_model
t2v_model = get_s2v_model(title_morph, 2, 100, 3, 5)

In [96]:
t2v_model.most_similar("휴가")

  """Entry point for launching an IPython kernel.


[('여름휴가', 0.8824412226676941),
 ('휴가철', 0.8466458916664124),
 ('바캉스', 0.8345291018486023),
 ('방학', 0.8271308541297913),
 ('막바지', 0.8023832440376282),
 ('기차', 0.7931613326072693),
 ('여름철', 0.7871999144554138),
 ('계획', 0.7841325998306274),
 ('고속도로', 0.7836302518844604),
 ('나기', 0.7806281447410583)]

In [92]:
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors
from datetime import datetime
from collections import Counter
import numpy as np

def get_time():
    """Return the current local time as ``YYYY-MM-DDTHH:MM:SS``.

    The result is used as a suffix for saved model file names.

    ``datetime.isoformat()`` omits the fractional part when ``microsecond``
    is exactly 0, so slicing off a fixed 7 characters would then cut into the
    seconds. Zeroing the microseconds first gives the same string in the
    common case and a correct one in that corner case.
    """
    return datetime.now().replace(microsecond=0).isoformat()


def get_t2p2v_model(train, val, morpher, w2v_model):
    """Build, save and return playlist vectors derived from playlist titles.

    Each playlist title in ``train`` + ``val`` is tokenised with
    ``get_title_morph``; the vectors of the tokens known to ``w2v_model`` are
    summed into one vector per playlist. Playlists whose title has no known
    token are left out.

    Args:
        train, val: DataFrames with ``id`` and ``plylst_title`` columns.
        morpher: Tagger passed through to ``get_title_morph``.
        w2v_model: Model exposing ``wv.get_vector(token)`` that raises
            ``KeyError`` for unknown tokens.

    Returns:
        The ``WordEmbeddingsKeyedVectors`` holding one entry per kept
        playlist, keyed by ``str(id)``. It is also saved under
        ``./manual_emb/`` with a timestamped file name.
    """
    t2p2v_model = WordEmbeddingsKeyedVectors(100)
    playlists = pd.concat([train, val], axis=0)

    plylst_ids = []
    plylst_vecs = []
    for plylst_id, plylst_title in zip(playlists['id'], playlists['plylst_title']):
        tokens = [morph[0] for morph in get_title_morph(plylst_title, morpher)]

        # Starts as the int 0 so "no known token" can be told apart afterwards.
        title_vec = 0
        for token in tokens:
            try:
                title_vec += w2v_model.wv.get_vector(token)
            except KeyError:
                continue

        if type(title_vec) == int:
            # Nothing was added: no token of this title is in the vocabulary.
            continue
        plylst_ids.append(str(plylst_id))
        plylst_vecs.append(title_vec)

    t2p2v_model.add(plylst_ids, plylst_vecs)

    save_path = "./manual_emb/t2p2v_mdl_" + get_time() + ".model"
    t2p2v_model.save(save_path)

    return t2p2v_model


### token to vector
def get_title_vec(token, t2v_mdl):
    """Return the embedding of ``token``, or a zero vector if it is unknown.

    Args:
        token: Token to look up.
        t2v_mdl: Model exposing ``wv.word_vec(token)``.

    Returns:
        The vector from ``t2v_mdl.wv.word_vec``; for an out-of-vocabulary
        token (``KeyError``) a float64 zero vector whose length is
        ``t2v_mdl.wv.vector_size`` (100 when that attribute is absent).

    Only ``KeyError`` is treated as "unknown token"; any other exception
    propagates instead of being silently turned into zeros.
    """
    try:
        return t2v_mdl.wv.word_vec(token)
    except KeyError:
        # Match the model's width so the zero vector can be summed with real ones.
        dim = getattr(t2v_mdl.wv, "vector_size", 100)
        return np.zeros(dim)
    
### sum of token vector(=playlist vector)
def get_title_2_plylst_vec(token_list, t2v_mdl):
    """Sum the vectors of all tokens into a single playlist vector.

    Args:
        token_list: Iterable of tokens.
        t2v_mdl: Model forwarded to ``get_title_vec``.

    Returns:
        The sum of ``get_title_vec(token, t2v_mdl)`` over the tokens, or
        ``None`` when ``token_list`` is empty.
    """
    token_vecs = (get_title_vec(token, t2v_mdl) for token in token_list)

    # The first vector seeds the sum; an empty input leaves the result as None.
    plylst_vec = next(token_vecs, None)
    for token_vec in token_vecs:
        plylst_vec = plylst_vec + token_vec
    return plylst_vec



### word2vec model loading
def load_t2v_mdl(file_path):
    """Load and return the gensim ``Word2Vec`` model stored at ``file_path``."""
    from gensim.models import Word2Vec
    t2v_mdl = Word2Vec.load(file_path)
    return t2v_mdl

def load_p2v_mdl(file_path):
    """Load and return the gensim ``KeyedVectors`` stored at ``file_path``.

    The file is opened with ``mmap='r'``, as in the original call.
    """
    # Only KeyedVectors is needed here; the unused Word2Vec import was dropped.
    from gensim.models import KeyedVectors
    return KeyedVectors.load(file_path, mmap='r')


### songs / tags pooled from the top-N most similar (MS) playlists (N defaults to 20)
def get_MS_song_freq(df, song_meta, plylst_vec, p2v_mdl, topn=20, korean=False):
    """Collect the songs of the ``topn`` playlists most similar to ``plylst_vec``.

    Args:
        df: Playlist table forwarded to ``get_songs_from_plylst``.
        song_meta: Song metadata forwarded to ``get_songs_from_plylst``.
        plylst_vec: Query vector.
        p2v_mdl: Playlist-vector model forwarded to ``get_most_similar_plylst``.
        topn: Number of similar playlists to pool.
        korean: Forwarded to ``get_songs_from_plylst``.

    Returns:
        One flat list with the songs of every similar playlist, duplicates
        kept (so callers can count occurrences).
    """
    pooled_songs = []
    for neighbour in get_most_similar_plylst(plylst_vec, p2v_mdl, topn):
        # The first element of each neighbour entry is the playlist id.
        pooled_songs.extend(
            get_songs_from_plylst(df, neighbour[0], korean=korean, song_meta=song_meta)
        )
    return pooled_songs

def get_MS_tag_freq(df, plylst_vec, p2v_mdl, topn=20):
    """Collect the tags of the ``topn`` playlists most similar to ``plylst_vec``.

    Args:
        df: Playlist table forwarded to ``get_tags_from_plylst``.
        plylst_vec: Query vector.
        p2v_mdl: Playlist-vector model forwarded to ``get_most_similar_plylst``.
        topn: Number of similar playlists to pool.

    Returns:
        One flat list with the tags of every similar playlist, duplicates kept.
    """
    pooled_tags = []
    for neighbour in get_most_similar_plylst(plylst_vec, p2v_mdl, topn):
        # The first element of each neighbour entry is the playlist id.
        pooled_tags.extend(get_tags_from_plylst(df, neighbour[0]))
    return pooled_tags


### playlist vector -> ids of the most similar playlists
def get_most_similar_plylst(plylst_vec, p2v_mdl, topn=20):
    """Query ``p2v_mdl`` for the ``topn`` entries closest to ``plylst_vec``.

    Returns:
        The result of ``p2v_mdl.similar_by_vector`` unchanged; callers in this
        notebook read element ``[0]`` of each entry as the playlist id.
    """
    neighbours = p2v_mdl.similar_by_vector(
        plylst_vec,
        topn=topn,
        restrict_vocab=None,
    )
    return neighbours

### playlist id -> songs of that playlist
def get_songs_from_plylst(df, id_, korean=False, song_meta=None):
    """Return the songs of the playlist whose ``id`` equals ``int(id_)``.

    Args:
        df: DataFrame with ``id`` and ``songs`` columns.
        id_: Playlist id; converted with ``int`` before matching.
        korean: When ``False`` the stored song ids are returned; otherwise
            each id is translated to its ``song_name``.
        song_meta: DataFrame with ``id`` and ``song_name`` columns; only read
            when names are requested.

    Returns:
        The ``songs`` value of the first matching row, or the list of song
        names for those ids.

    Raises:
        IndexError: If no playlist row matches ``id_``.
    """
    matching_rows = df[df["id"] == int(id_)]
    song_ids = list(matching_rows["songs"])[0]

    # Kept as an explicit comparison so non-bool flags behave as before.
    if korean == False:
        return song_ids

    song_names = []
    for song_id in song_ids:
        meta_row = song_meta[song_meta['id'] == song_id]
        song_names.append(meta_row["song_name"].item())
    return song_names
    
def get_tags_from_plylst(df, id_):
    """Return the ``tags`` value of the playlist whose ``id`` equals ``int(id_)``.

    Raises:
        IndexError: If no playlist row matches ``id_``.
    """
    is_target = df["id"] == int(id_)
    tag_lists = list(df[is_target]["tags"])
    return tag_lists[0]
    
    
def get_count_list(token_list):
    """Count tokens and keep those seen more than once, most frequent first.

    Returns:
        A list of ``(token, count)`` pairs with ``count > 1``, ordered by
        descending count; tokens with equal counts keep first-seen order.
    """
    ranked = Counter(token_list).most_common()
    return [(token, count) for token, count in ranked if count > 1]

In [81]:
# Kernel-level inputs read by t2p2v_rec_model at call time (they are globals,
# not parameters, so the cell order matters).

# Train and validation playlists stacked into one lookup table.
total_df = pd.concat([train, val], axis=0)
# reset_index() without drop=True keeps the previous index as an extra column.
total_df = total_df.reset_index()
# Saved title-token Word2Vec model.
t2v_file_path = "./manual_emb/t2v_mdl.model"
# Saved title-based playlist vectors.
# NOTE(review): `p2v_file_path` is assigned again in a later cell to a
# different model file; whichever assignment ran last is what the
# recommendation functions load.
p2v_file_path = "./manual_emb/t2p2v_mdl_2020-07-12T16:34:48.model"
# Tagger used to tokenise the query title.
morpher = Komoran()

# Models already loaded by t2p2v_rec_model, keyed by (loader name, file path).
_T2P2V_LOADED_MODELS = {}


def _t2p2v_load_once(loader, file_path):
    """Return ``loader(file_path)``, loading only on the first request for that pair."""
    cache_key = (loader.__name__, file_path)
    if cache_key not in _T2P2V_LOADED_MODELS:
        _T2P2V_LOADED_MODELS[cache_key] = loader(file_path)
    return _T2P2V_LOADED_MODELS[cache_key]


def t2p2v_rec_model(title, tags_input, songs_input):
    """Recommend songs and tags for a playlist from its title.

    The title is tokenised, turned into a playlist vector, and matched
    against the title-based playlist vectors; songs and tags that occur more
    than once among the 200 most similar playlists become candidates.

    Reads the notebook globals ``morpher``, ``t2v_file_path``,
    ``p2v_file_path``, ``total_df`` and ``song_meta``.

    Args:
        title: Playlist title.
        tags_input: Tags already attached to the playlist.
        songs_input: Songs already in the playlist.

    Returns:
        ``[songs, tags]``: candidates (at most ``100 - len(songs_input)``
        songs and ``10 - len(tags_input)`` tags, most frequent first)
        followed by the given inputs. ``[[], []]`` when the title is empty or
        yields no tokens.
    """
    if len(title) == 0:
        # The check is on the title, so the message now says so.
        print("empty title")
        return [[], []]

    title_tokens = [morph[0] for morph in get_title_morph(title, morpher)]

    # Loading from disk on every call dominated the cost of applying this
    # function row by row; each model file is now read once and reused.
    t2v_mdl = _t2p2v_load_once(load_t2v_mdl, t2v_file_path)
    plylst_vec = get_title_2_plylst_vec(title_tokens, t2v_mdl)
    if plylst_vec is None:
        # The title produced no tokens, so there is nothing to query with.
        return [[], []]

    p2v_mdl = _t2p2v_load_once(load_p2v_mdl, p2v_file_path)

    song_list = get_MS_song_freq(total_df, song_meta, plylst_vec, p2v_mdl, korean=False, topn=200)
    song_counts = get_count_list(song_list)

    tag_list = get_MS_tag_freq(total_df, plylst_vec, p2v_mdl, topn=200)
    tag_counts = get_count_list(tag_list)

    # Room left after the inputs; slicing past the end is harmless, so no
    # separate length check is needed.
    add_songs = max(0, 100 - len(songs_input))
    add_tags = max(0, 10 - len(tags_input))

    song_count_list = [pair[0] for pair in song_counts[:add_songs]]
    song_count_list += songs_input

    tag_count_list = [pair[0] for pair in tag_counts[:add_tags]]
    tag_count_list += tags_input

    return [song_count_list, tag_count_list]

In [73]:
tmp = val[8:10]

In [74]:
tmp

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
8,[스트레스],80810,리듬타면서 빡시게 운동하자!!!(스트레스 날리자):},[],127,2017-02-09 17:33:45.000
9,[],142007,기분 좋은 재즈와 함께 만드는 달달한 하루,[],0,2015-06-22 09:11:02.000


In [94]:
#tmp["songs_base_result"],  tmp["tags_base_result"]= tmp.apply(lambda x : base_model(x["tags"], x["songs"]), axis=1)
val_result = val.apply(lambda x : t2p2v_rec_model(x["plylst_title"], x["tags"], x['songs']), axis=1)

empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags an

In [95]:
val["tags_result"] = [x[1] for x in val_result]
val["songs_result"] = [x[0] for x in val_result]

val[["id", "tags_result", "songs_result"]].to_csv("t2p2v_rec_model_result.csv", encoding="utf-8")

In [34]:
t2p2v_model = get_t2p2v_model(train, val, komoran, t2v_model)

In [36]:
t2p2v_model.most_similar("98472")

[('137778', 0.9838548302650452),
 ('135488', 0.9838548302650452),
 ('21497', 0.9406653046607971),
 ('43785', 0.938018262386322),
 ('625', 0.938018262386322),
 ('41927', 0.9356130361557007),
 ('11064', 0.9256645441055298),
 ('77110', 0.915657639503479),
 ('57556', 0.9152759313583374),
 ('132034', 0.9139619469642639)]

In [49]:
get_plylst_vec(["오늘", "이별"], t2v_model)

array([ 1.4262930e-02, -7.1537817e-01,  1.9424170e-01, -6.0442638e-01,
        3.9980197e-01, -5.5784130e-01, -3.9083162e-01, -1.7139297e+00,
       -8.7656033e-01, -5.5867672e-01,  1.8468983e+00,  1.4170601e+00,
       -2.8478551e-01,  1.4009356e-01, -6.0289896e-01,  3.2642484e-04,
        3.5222366e-01,  7.4016380e-01,  3.6827549e-02,  4.1576576e-01,
       -2.6199540e-01,  1.1177068e+00,  3.0440351e-01, -7.0728588e-01,
        1.4234791e+00,  2.7775729e-01,  5.8506507e-01,  8.4863591e-01,
       -1.0275800e+00, -3.8436413e-02, -2.0280278e-01, -2.5570810e-02,
        8.8938937e-02, -4.8535126e-01,  9.4772744e-01, -1.7840729e+00,
       -2.3381586e-01,  1.0031682e+00, -3.0049196e-01,  4.7818995e-01,
        3.3736941e-01, -6.2925804e-01,  3.7114125e-01,  8.7305284e-01,
        1.4406080e+00, -4.1139746e-01,  3.0430496e-02, -5.1083761e-01,
       -7.8349817e-01, -8.3371866e-01,  1.2252310e-01, -7.1105018e-02,
       -5.8915079e-01, -5.7077557e-01, -6.7398041e-02, -9.4792354e-01,
      

In [37]:
total[total["id"]==98472]

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
94081,[발라드],98472,▶오늘 헤어졌어요,"[591602, 402387, 682279, 323916, 432810, 21902...",17,2010-09-18 09:44:38.000


In [43]:
total[total["id"]==41927]

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
55149,"[슬픔, 추억, 이별, 회상]",41927,오늘헤어졌어요. 가슴아픈 이별노래,"[501713, 193232, 519403, 281936, 116573, 18573...",35,2019-07-09 08:55:53.000


In [9]:
s2v_model = get_s2v_model(total, 2, 100, 3, 5)

In [10]:
p2v_model = get_p2v_model(train, val, s2v_model)

In [14]:
from gensim.models import Word2Vec 
s2v_model = Word2Vec.load("./manual_emb/s2v_mdl_2020-07-08T21:17:03.model")

In [17]:
from gensim.models import Word2Vec, KeyedVectors
p2v_model = KeyedVectors.load("./manual_emb/p2v_mdl_2020-07-08T21:17:26.model", mmap='r')

In [18]:
p2v_model.most_similar("6362")

[('17095', 0.9961541891098022),
 ('98472', 0.9928523302078247),
 ('86701', 0.9927220344543457),
 ('63269', 0.9918633699417114),
 ('84156', 0.9918058514595032),
 ('79909', 0.9914500713348389),
 ('102725', 0.9913939833641052),
 ('5960', 0.9911588430404663),
 ('96197', 0.9911543130874634),
 ('68932', 0.9907352924346924)]

In [236]:
tags_input = ["추억", "회상"]
songs_input = [394031, 394031] 

In [57]:
tmp_list = get_input_df(tags_input, songs_input)

In [249]:
# get_input_df
import numpy as np
from collections import Counter

### input setting
def get_input_df(tags, songs):
    """Merge tags and songs into one token list.

    Returns:
        A new list: ``tags`` followed by every element of ``songs`` converted
        with ``str``.
    """
    song_tokens = list(map(str, songs))
    return tags + song_tokens

### word2vec model loading
def get_s2v_mdl(file_path):
    """Load and return the gensim ``Word2Vec`` model stored at ``file_path``."""
    from gensim.models import Word2Vec
    s2v_mdl = Word2Vec.load(file_path)
    return s2v_mdl

def get_p2v_mdl(file_path):
    """Load and return the gensim ``KeyedVectors`` stored at ``file_path``.

    The file is opened with ``mmap='r'``, as in the original call.
    """
    # Only KeyedVectors is needed here; the unused Word2Vec import was dropped.
    from gensim.models import KeyedVectors
    return KeyedVectors.load(file_path, mmap='r')

### token to vector
def get_song_tag_vec(token, s2v_mdl):
    """Return the embedding of a song/tag token, or a zero vector if unknown.

    Args:
        token: Token to look up.
        s2v_mdl: Model exposing ``wv.word_vec(token)``.

    Returns:
        The vector from ``s2v_mdl.wv.word_vec``; for an out-of-vocabulary
        token (``KeyError``) a float64 zero vector whose length is
        ``s2v_mdl.wv.vector_size`` (100 when that attribute is absent).

    Only ``KeyError`` is treated as "unknown token"; any other exception
    propagates instead of being silently turned into zeros.
    """
    try:
        return s2v_mdl.wv.word_vec(token)
    except KeyError:
        # Match the model's width so the zero vector can be summed with real ones.
        dim = getattr(s2v_mdl.wv, "vector_size", 100)
        return np.zeros(dim)
    
### sum of token vector(=playlist vector)
def get_plylst_vec(token_list, s2v_mdl):
    """Sum the vectors of all song/tag tokens into a single playlist vector.

    Args:
        token_list: Iterable of tokens.
        s2v_mdl: Model forwarded to ``get_song_tag_vec``.

    Returns:
        The sum of ``get_song_tag_vec(token, s2v_mdl)`` over the tokens, or
        ``None`` when ``token_list`` is empty.
    """
    token_vecs = (get_song_tag_vec(token, s2v_mdl) for token in token_list)

    # The first vector seeds the sum; an empty input leaves the result as None.
    plylst_vec = next(token_vecs, None)
    for token_vec in token_vecs:
        plylst_vec = plylst_vec + token_vec
    return plylst_vec

### playlist vector -> ids of the most similar playlists
def get_most_similar_plylst(plylst_vec, p2v_mdl, topn=20):
    """Query ``p2v_mdl`` for the ``topn`` entries closest to ``plylst_vec``.

    Returns:
        The result of ``p2v_mdl.similar_by_vector`` unchanged; callers in this
        notebook read element ``[0]`` of each entry as the playlist id.
    """
    query_options = {"topn": topn, "restrict_vocab": None}
    return p2v_mdl.similar_by_vector(plylst_vec, **query_options)

### playlist id -> songs of that playlist
def get_songs_from_plylst(df, id_, korean=False, song_meta=None):
    """Return the songs of the playlist whose ``id`` equals ``int(id_)``.

    Args:
        df: DataFrame with ``id`` and ``songs`` columns.
        id_: Playlist id; converted with ``int`` before matching.
        korean: When ``False`` the stored song ids are returned; otherwise
            each id is translated to its ``song_name``.
        song_meta: DataFrame with ``id`` and ``song_name`` columns; only read
            when names are requested.

    Returns:
        The ``songs`` value of the first matching row, or the list of song
        names for those ids.

    Raises:
        IndexError: If no playlist row matches ``id_``.
    """
    playlist_rows = df[df["id"] == int(id_)]
    first_songs = list(playlist_rows["songs"])[0]

    # Kept as an explicit comparison so non-bool flags behave as before.
    if korean == False:
        return first_songs

    names = []
    for song_id in first_songs:
        name_column = song_meta[song_meta['id'] == song_id]["song_name"]
        names.append(name_column.item())
    return names
    
def get_tags_from_plylst(df, id_):
    """Return the ``tags`` value of the playlist whose ``id`` equals ``int(id_)``.

    Raises:
        IndexError: If no playlist row matches ``id_``.
    """
    playlist_rows = df[df["id"] == int(id_)]
    tags_per_row = list(playlist_rows["tags"])
    return tags_per_row[0]

### songs / tags pooled from the top-N most similar (MS) playlists (N defaults to 20)
def get_MS_song_freq(df, song_meta, plylst_vec, p2v_mdl, topn=20, korean=False):
    """Collect the songs of the ``topn`` playlists most similar to ``plylst_vec``.

    Args:
        df: Playlist table forwarded to ``get_songs_from_plylst``.
        song_meta: Song metadata forwarded to ``get_songs_from_plylst``.
        plylst_vec: Query vector.
        p2v_mdl: Playlist-vector model forwarded to ``get_most_similar_plylst``.
        topn: Number of similar playlists to pool.
        korean: Forwarded to ``get_songs_from_plylst``.

    Returns:
        One flat list with the songs of every similar playlist, duplicates
        kept (so callers can count occurrences).
    """
    similar_playlists = get_most_similar_plylst(plylst_vec, p2v_mdl, topn)

    all_songs = []
    for entry in similar_playlists:
        # The first element of each entry is the playlist id.
        playlist_songs = get_songs_from_plylst(
            df, entry[0], korean=korean, song_meta=song_meta
        )
        all_songs.extend(playlist_songs)
    return all_songs

def get_MS_tag_freq(df, plylst_vec, p2v_mdl, topn=20):
    """Collect the tags of the ``topn`` playlists most similar to ``plylst_vec``.

    Args:
        df: Playlist table forwarded to ``get_tags_from_plylst``.
        plylst_vec: Query vector.
        p2v_mdl: Playlist-vector model forwarded to ``get_most_similar_plylst``.
        topn: Number of similar playlists to pool.

    Returns:
        One flat list with the tags of every similar playlist, duplicates kept.
    """
    similar_playlists = get_most_similar_plylst(plylst_vec, p2v_mdl, topn)

    all_tags = []
    for entry in similar_playlists:
        # The first element of each entry is the playlist id.
        all_tags.extend(get_tags_from_plylst(df, entry[0]))
    return all_tags

def get_count_list(token_list):
    """Count tokens and keep those seen more than once, most frequent first.

    Returns:
        A list of ``(token, count)`` pairs with ``count > 1``, ordered by
        descending count; tokens with equal counts keep first-seen order.
    """
    repeated = []
    for token, count in Counter(token_list).most_common():
        if count > 1:
            repeated.append((token, count))
    return repeated

In [None]:
# Saved model files read by base_model at call time (globals, not parameters).
# Saved song/tag Word2Vec model.
s2v_file_path = "./manual_emb/s2v_mdl_2020-07-08T21:17:03.model"
# Saved playlist vectors.
# NOTE(review): `p2v_file_path` is also assigned in the t2p2v_rec_model cell
# to a different file; whichever assignment ran last is what gets loaded.
p2v_file_path = "./manual_emb/p2v_mdl_2020-07-08T21:17:26.model"

In [283]:
total_df = pd.concat([train, val], axis=0)
total_df = total_df.reset_index()

# Models already loaded by base_model, keyed by (loader name, file path).
_BASE_LOADED_MODELS = {}


def _base_load_once(loader, file_path):
    """Return ``loader(file_path)``, loading only on the first request for that pair."""
    cache_key = (loader.__name__, file_path)
    if cache_key not in _BASE_LOADED_MODELS:
        _BASE_LOADED_MODELS[cache_key] = loader(file_path)
    return _BASE_LOADED_MODELS[cache_key]


def base_model(tags_input, songs_input):
    """Recommend songs and tags for a playlist from its existing tags and songs.

    The inputs are turned into tokens, summed into a playlist vector, and
    matched against the stored playlist vectors; songs and tags that occur
    more than once among the 200 most similar playlists become candidates.

    Reads the notebook globals ``s2v_file_path``, ``p2v_file_path``,
    ``total_df`` and ``song_meta``.

    Args:
        tags_input: Tags already attached to the playlist.
        songs_input: Songs already in the playlist.

    Returns:
        ``[songs, tags]``: candidates (at most ``100 - len(songs_input)``
        songs and ``10 - len(tags_input)`` tags, most frequent first)
        followed by the given inputs. ``[[], []]`` when both inputs are empty.
    """
    # Check emptiness without building a throw-away concatenated list.
    if len(tags_input) + len(songs_input) == 0:
        print("empty tags and songs")
        return [[], []]

    input_tokens = get_input_df(tags_input, songs_input)

    # Loading from disk on every call dominated the cost of applying this
    # function row by row; each model file is now read once and reused.
    s2v_mdl = _base_load_once(get_s2v_mdl, s2v_file_path)
    plylst_vec = get_plylst_vec(input_tokens, s2v_mdl)

    p2v_mdl = _base_load_once(get_p2v_mdl, p2v_file_path)

    song_list = get_MS_song_freq(total_df, song_meta, plylst_vec, p2v_mdl, korean=False, topn=200)
    song_counts = get_count_list(song_list)

    tag_list = get_MS_tag_freq(total_df, plylst_vec, p2v_mdl, topn=200)
    tag_counts = get_count_list(tag_list)

    # Room left after the inputs; slicing past the end is harmless, so no
    # separate length check is needed.
    add_songs = max(0, 100 - len(songs_input))
    add_tags = max(0, 10 - len(tags_input))

    song_count_list = [pair[0] for pair in song_counts[:add_songs]]
    song_count_list += songs_input

    tag_count_list = [pair[0] for pair in tag_counts[:add_tags]]
    tag_count_list += tags_input

    return [song_count_list, tag_count_list]

In [289]:
#tmp["songs_base_result"],  tmp["tags_base_result"]= tmp.apply(lambda x : base_model(x["tags"], x["songs"]), axis=1)
val_result = val.apply(lambda x : base_model(x["tags"], x["songs"]), axis=1)

empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags and songs
empty tags an

In [290]:
val["tags_result"] = [x[1] for x in val_result] 
val["songs_result"] = [x[0] for x in val_result] 

In [292]:
val[["id", "tags_result", "songs_result"]].to_csv("base_list.csv", encoding="utf-8")

In [299]:
song_meta.tail()

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
707984,[GN2001],19991219,The Best Best Of The Black President,65254,[166499],Coffin For Head Of State,[GN2000],[Fela Kuti],707984
707985,[GN0901],19860000,True Colors,44141,[11837],Change Of Heart,[GN0900],[Cyndi Lauper],707985
707986,"[GN0105, GN0101]",20160120,행보 2015 윤종신 / 작사가 윤종신 Live Part.1,2662866,[437],스치듯 안녕,[GN0100],[윤종신],707986
707987,"[GN1807, GN1801]",20131217,명상의 시간을 위한 뉴에이지 음악,2221722,[729868],숲의 빛,[GN1800],[Nature Piano],707987
707988,"[GN0601, GN0604]",19980000,김경호 Live,34663,[895],Queen 명곡 멜로디,[GN0600],[김경호],707988


In [294]:
len(song_meta)

707989

In [295]:
from konlpy.tag import Komoran

In [296]:
komoran =  Komoran()

In [298]:
komoran.pos("Bach : Partita No. 4 In D Major, BWV 828")

[('Bach', 'SL'),
 (':', 'SP'),
 ('Partita', 'SL'),
 ('No', 'SL'),
 ('.', 'SF'),
 ('4', 'SN'),
 ('In', 'SL'),
 ('D', 'SL'),
 ('Major', 'SL'),
 (',', 'SP'),
 ('BWV', 'SL'),
 ('828', 'SN')]

In [104]:
from collections import Counter
result_dict = dict(Counter(songs_total))
sorted(result_dict.items(), key=(lambda x :x[1]), reverse=True)



[(116847, 14),
 (284814, 9),
 (424869, 8),
 (177284, 8),
 (201145, 8),
 (582905, 8),
 (669617, 8),
 (143652, 7),
 (356571, 7),
 (397924, 7),
 (485957, 7),
 (149779, 7),
 (4412, 7),
 (555134, 6),
 (591047, 6),
 (367459, 6),
 (347303, 6),
 (433447, 6),
 (21865, 6),
 (438086, 6),
 (237282, 6),
 (380859, 6),
 (87752, 5),
 (633203, 5),
 (70662, 5),
 (158554, 5),
 (298646, 5),
 (262145, 5),
 (592386, 5),
 (16847, 5),
 (68266, 4),
 (199390, 4),
 (411756, 4),
 (486220, 4),
 (599290, 4),
 (569548, 4),
 (477964, 4),
 (133330, 4),
 (4083, 4),
 (426013, 4),
 (268811, 4),
 (256465, 4),
 (629177, 4),
 (466048, 4),
 (186358, 4),
 (96825, 4),
 (651339, 4),
 (665144, 4),
 (518239, 3),
 (548735, 3),
 (11914, 3),
 (170578, 3),
 (477689, 3),
 (587193, 3),
 (554558, 3),
 (122384, 3),
 (32042, 3),
 (415774, 3),
 (570100, 3),
 (173709, 3),
 (705462, 3),
 (68488, 3),
 (402661, 3),
 (635160, 3),
 (583525, 3),
 (69933, 3),
 (488423, 3),
 (531732, 3),
 (38311, 3),
 (15958, 3),
 (661213, 3),
 (221986, 3),
 (45421

In [137]:
# Collect songs from `result_dict` (song id -> count) that are not in `songs`
# and were counted at least twice, then print them ranked by count.
# NOTE(review): `songs` and `result_dict` come from other cells; `songs` is
# compared against string ids here, so it presumably holds strings (confirm).
# NOTE(review): `unused_song` is initialised but never appended to in this cell.
unused_song = []
# Maps str(song id) -> count for the songs that pass the filter below.
new_dic = {}
for i in result_dict.items():
    # i is a (song id, count) pair.
    if str(i[0]) not in songs and i[1]>=2:
        print(str(i[0]))
        new_dic[str(i[0])] = i[1]
        
# Highest counts first.
print(sorted(new_dic.items(), key=(lambda x :x[1]), reverse=True))

11914
170578
411756
486220
367459
70662
16083
38581
477689
347303
590379
284814
599290
23570
177284
569548
480590
433447
230729
356571
158554
201145
21865
397924
153734
587193
285445
675126
554558
298646
477964
122384
547977
582905
262145
143877
188590
133330
319297
592386
122998
32042
142498
438086
579395
4083
415774
395858
669617
237282
485957
1099
8187
699365
570100
638346
173709
622190
149779
705462
380859
68488
4412
546064
402661
426013
635160
583525
274820
69933
485142
444479
488423
184540
268811
531732
256465
292655
317710
192195
94820
21408
320382
429256
127614
695494
629177
466048
16847
128242
38311
65958
15958
661213
186358
221986
543556
454218
414755
134728
134583
46284
589242
96825
28024
132689
653071
651339
38836
349886
675797
677063
85800
688951
665144
602629
539892
524168
374617
63874
503911
355713
235609
678294
565
220781
77781
679405
184015
169935
635759
135756
527887
65062
106438
466004
566376
50055
325001
93307
322776
436255
344327
439190
413422
18768
25892
157566
54

In [130]:
new_dic

[]

In [121]:
len(unused_song)

482

In [123]:
len(result_dict.items())

498

In [122]:
len(songs)

16

In [22]:

def get_p2v_array(input_data):
    """Sum the vectors of the tokens in ``input_data`` known to ``s2v_model``.

    Reads the notebook global ``s2v_model``; tokens missing from
    ``s2v_model.wv.vocab`` are skipped.

    Args:
        input_data: Iterable of tokens.

    Returns:
        The sum of the known tokens' vectors, or ``None`` when no token is in
        the vocabulary (including an empty input).
    """
    total_array = None
    for token in input_data:
        if token not in s2v_model.wv.vocab:
            continue
        token_vec = s2v_model.wv.word_vec(token)
        # Seed on the first *known* token, not on position 0: seeding by
        # position left the sum as None whenever the first token was unknown,
        # and the next addition then failed.
        total_array = token_vec if total_array is None else total_array + token_vec
    return total_array