In [1]:
import os
from tqdm import tqdm
import collections
import json
import io
import distutils.dir_util
import pandas as pd
import numpy as np
import fasttext
import hgtk
from gensim.models.keyedvectors import KeyedVectors
import gensim

In [2]:
# arena_util.py
# -*- coding: utf-8 -*-

def write_json(data, fname):
    """Serialize ``data`` as UTF-8 JSON to ``./arena_data/<fname>``.

    Missing parent directories below ``./arena_data/`` are created first.

    Args:
        data: JSON-serializable object. NumPy integer and floating scalars
            are converted to plain ``int`` / ``float``.
        fname: Output path, relative to ``./arena_data/``.

    Raises:
        TypeError: If ``data`` contains an object that ``json`` cannot encode
            and that is not a NumPy integer/floating scalar.
    """
    def _conv(o):
        # json.dumps only calls this for objects it cannot encode by itself.
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        raise TypeError("Object of type %s is not JSON serializable" % type(o).__name__)

    parent = os.path.dirname(fname)
    # os.makedirs keeps this function independent of distutils, which was
    # removed from the standard library in Python 3.12.
    os.makedirs("./arena_data/" + parent, exist_ok=True)
    with io.open("./arena_data/" + fname, "w", encoding="utf-8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    """Read the UTF-8 encoded JSON file at ``fname`` and return the parsed object."""
    with open(fname, encoding="utf-8") as handle:
        return json.load(handle)


def debug_json(r):
    """Print ``r`` as JSON indented by four spaces, keeping non-ASCII text readable."""
    pretty = json.dumps(r, indent=4, ensure_ascii=False)
    print(pretty)


def remove_seen(seen, l):
    """Return the items of ``l``, in their original order, that do not occur in ``seen``."""
    excluded = set(seen)
    kept = []
    for item in l:
        if item not in excluded:
            kept.append(item)
    return kept

In [3]:
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py

class evaluate():
    """Ranking metrics (MAP, NDCG, entropy diversity) for recommendation lists.

    Adapted from the Kakao Arena brunch-article-recommendation ``evaluate.py``.

    Args:
        recs: dict mapping an id to its ordered list of recommended items.
        gt: dict mapping an id to the list of ground-truth items.
        topn: Nominal list length. It is used in the printed labels and as the
            normaliser of the entropy diversity; recommendation lists are not
            truncated to this length.
    """

    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt
        self.topn = topn

    def _ndcg(self):
        """Return the mean NDCG over the ids that can be evaluated.

        An id is skipped when it has no recommendations or no ground truth.
        Returns 0.0 when every id is skipped.
        """
        total, count = 0.0, 0
        for u, seen in self.gt.items():
            seen = set(seen)  # set: constant-time membership tests below
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            # Ideal DCG: as many hits as possible, all ranked at the top.
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            dcg = 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    # rank = i + 1, discount = log2(rank + 1)
                    dcg += 1.0 / math.log(i + 2, 2)
            total += dcg / idcg
            count += 1
        return total / count if count else 0.0

    def _map(self):
        """Return the mean average precision over the ids that can be evaluated.

        An id is skipped when it has no recommendations or no ground truth.
        Returns 0.0 when every id is skipped.
        """
        total, count = 0.0, 0
        for u, seen in self.gt.items():
            seen = set(seen)
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            total += _ap
            count += 1
        return total / count if count else 0.0

    def _entropy_diversity(self):
        """Return the entropy of the item frequencies across all recommendation lists.

        Frequencies are normalised by ``len(recs) * topn`` (as in the reference
        script), not by the number of items actually recommended.
        """
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for rec in self.recs.values():
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in freq.values()])
        return ent

    def _evaluate(self):
        """Print MAP, NDCG and entropy diversity, each labelled with ``topn``."""
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))

In [4]:
def word_to_jamo(token):
    """Decompose every Hangul syllable of ``token`` into its jamo.

    Each character accepted by ``hgtk.letter.decompose`` is replaced by three
    characters (initial, medial, final); an empty component is written as
    ``'-'``. Characters that hgtk rejects as non-Hangul are copied unchanged.

    Args:
        token: The word to decompose.

    Returns:
        The jamo-level string.
    """
    pieces = []
    for char in token:
        try:
            # Split the syllable into initial, medial and final jamo.
            cho, jung, jong = hgtk.letter.decompose(char)
        except hgtk.exception.NotHangulException:
            # Not Hangul: keep the character as it is.
            pieces.append(char)
            continue
        # Replace an empty jamo with '-' so every syllable spans exactly three
        # characters, which is the grouping jamo_to_word relies on.
        pieces.extend(part or '-' for part in (cho, jung, jong))
    return ''.join(pieces)

In [5]:
# Inverse of word_to_jamo: rebuild a word from its initial/medial/final jamo sequence.
def jamo_to_word(jamo_sequence):
    """Recompose a jamo-level string into a word.

    Args:
        jamo_sequence: String in which each Hangul syllable is written as three
            characters (initial, medial, final), with ``'-'`` standing for a
            missing final, e.g. ``'ㄴㅏㅁㄷㅗㅇㅅㅐㅇ'``.

    Returns:
        The recomposed word (``'남동생'`` for the example above), or
        ``jamo_sequence`` unchanged when hgtk cannot compose one of the groups.
    """
    # 1. Tokenize: a character hgtk recognises as Hangul starts a group of
    #    three characters; any other character stands alone.
    tokenized_jamo = []
    index = 0
    while index < len(jamo_sequence):
        char = jamo_sequence[index]
        if hgtk.checker.is_hangul(char):
            tokenized_jamo.append(jamo_sequence[index:index + 3])
            index += 3
        else:
            tokenized_jamo.append(char)
            index += 1

    # 2. Compose every complete three-character group back into a syllable.
    syllables = []
    try:
        for jamo in tokenized_jamo:
            if len(jamo) != 3:
                # Single non-Hangul character, or a truncated trailing group.
                syllables.append(jamo)
            elif jamo[2] == "-":
                # No final consonant.
                syllables.append(hgtk.letter.compose(jamo[0], jamo[1]))
            else:
                syllables.append(hgtk.letter.compose(jamo[0], jamo[1], jamo[2]))
    except hgtk.exception.NotHangulException:
        # A group such as 'ㄴ!ㅁ' cannot be composed: hand back the input as is.
        return jamo_sequence

    # 3. Word restored.
    return ''.join(syllables)

In [6]:
# Load the original training split and the validation question/answer splits as DataFrames.
# NOTE(review): these are absolute local paths; adjust them to your environment.
train = pd.read_json('D:/melon_playlist_continuation/arena_data/orig/train.json')
val_q_df = pd.read_json('D:/melon_playlist_continuation/arena_data/questions/val.json')
val_a_df = pd.read_json('D:/melon_playlist_continuation/arena_data/answers/val.json')

In [7]:
# Stack the validation question playlists under the training playlists.
# No ignore_index is passed, so the row labels of both frames are kept as they are.
train_df = pd.concat([train, val_q_df])
train_df

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[힐링, 휴식, 밤, 새벽]",147668,To. 힘들고 지친 분들에게,"[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27.000
1,[팝],50422,130807-7,"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11.000
2,[뉴에이지],116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50.000
3,"[하드락, 록스피릿, 댄스]",55076,당신을 하얗게 불태울 곡들,"[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25.000
4,"[힐링, 휴식, 기분전환]",125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50.000
...,...,...,...,...,...,...
23010,[],132039,이것만 알아도 브릿팝 안다는 소리듣는다.,"[8778, 5725, 234307, 64368, 691112, 581349, 40...",232,2014-03-10 19:08:16.000
23011,[],21475,make015,"[470489, 72225, 387567, 328452, 517417, 690761...",0,2016-05-16 15:34:18.000
23012,"[힘들때, 지칠때, 힘내]",81196,"힘든 걸 알아, 말해주고 싶었어요","[448250, 131741, 531820, 616260, 639943, 13129...",13,2017-12-20 16:32:07.000
23013,[],151600,♥Christmas Music♥,[],8,2015-12-23 09:38:19.000


In [8]:
val_q = load_json("D:/melon_playlist_continuation/arena_data/questions/val.json") # validation questions: each playlist of the val data split at 50%
val_a = load_json("D:/melon_playlist_continuation/arena_data/answers/val.json") # answer (ground-truth) data

In [9]:
# Song metadata
meta = pd.read_csv('D:/melon_playlist_continuation/res/meta.csv')
song_meta = pd.read_json('D:/melon_playlist_continuation/res/song_meta.json', typ = 'frame')
song_meta

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2
3,"[GN1102, GN1101]",20151016,Feeling Right (Everything Is Nice) (Feat. Popc...,2644882,[838543],Feeling Right (Everything Is Nice) (Feat. Popc...,[GN1100],[Matoma],3
4,"[GN1802, GN1801]",20110824,그남자 그여자,2008470,[560160],그남자 그여자,[GN1800],[Jude Law],4
...,...,...,...,...,...,...,...,...,...
707984,[GN2001],19991219,The Best Best Of The Black President,65254,[166499],Coffin For Head Of State,[GN2000],[Fela Kuti],707984
707985,[GN0901],19860000,True Colors,44141,[11837],Change Of Heart,[GN0900],[Cyndi Lauper],707985
707986,"[GN0105, GN0101]",20160120,행보 2015 윤종신 / 작사가 윤종신 Live Part.1,2662866,[437],스치듯 안녕,[GN0100],[윤종신],707986
707987,"[GN1807, GN1801]",20131217,명상의 시간을 위한 뉴에이지 음악,2221722,[729868],숲의 빛,[GN1800],[Nature Piano],707987


## fastText model

In [10]:
# Load the trained fastText model from the working directory.
# NOTE(review): the file name suggests it was trained on jamo-decomposed titles; confirm.
ft_model = fasttext.load_model("title2vec(fasttext_jamo).bin")
# ft_model = ft_model.wv
print(ft_model)

<fasttext.FastText._FastText object at 0x000001E8C9CEF1C0>




In [11]:
def transform(word_sequence):
    """Turn ``(similarity, jamo_word)`` pairs into ``(word, similarity)`` pairs.

    Each jamo-level word is recomposed with ``jamo_to_word`` so that nearest
    neighbour results are readable.
    """
    readable = []
    for similarity, word in word_sequence:
        readable.append((jamo_to_word(word), similarity))
    return readable

In [12]:
# 10 nearest neighbours of a symbol-only query; the query is jamo-decomposed and the results recomposed.
print(transform(ft_model.get_nearest_neighbors(word_to_jamo('♥♥'), k=10)))

[('㈜', 0.7310441136360168), ('♥♥♥', 0.7139729857444763), ('···', 0.690385103225708), ('^^=', 0.6812024712562561), ('●○○', 0.6719703674316406), (']..', 0.6678409576416016), ('○●', 0.6642767190933228), ('▼▶', 0.6634218692779541), ('^^~', 0.6620190739631653), ('sdf', 0.6606478095054626)]


In [13]:
# 10 nearest neighbours of the tag '기분전환'.
print(transform(ft_model.get_nearest_neighbors(word_to_jamo('기분전환'), k=10)))

[('드라이브', 0.8980928063392639), ('기분전환곡', 0.8825535774230957), ('</s>', 0.8811877965927124), ('기분전횐', 0.8726389408111572), ('기분전환과', 0.8550788760185242), ('기분전환분', 0.8546332120895386), ('시분전환', 0.8540767431259155), ('스트레스', 0.8481658697128296), ('신나는', 0.8401566743850708), ('기분전환매장음악', 0.8381237983703613)]


In [14]:
# 10 nearest neighbours of the tag '감성'.
print(transform(ft_model.get_nearest_neighbors(word_to_jamo('감성'), k=10)))

[('잔잔한', 0.8690367937088013), ('감섣', 0.8493160009384155), ('새벽감성', 0.8479971885681152), ('새격감성', 0.847298264503479), ('LA감성', 0.8460310697555542), ('감성저격', 0.8450834155082703), ('새벽', 0.8438853621482849), ('밤', 0.8435905575752258), ('감성RNB', 0.8407155871391296), ('감성멜로디', 0.837283730506897)]


In [15]:
# 10 nearest neighbours of the tag '휴식'.
print(transform(ft_model.get_nearest_neighbors(word_to_jamo('휴식'), k=10)))

[('힐링', 0.9410766959190369), ('잔잔한', 0.8882978558540344), ('휴일잔잔한', 0.8510009050369263), ('</s>', 0.8373538255691528), ('기분전환', 0.8323042392730713), (':)*', 0.8316049575805664), ('§♥', 0.8300024271011353), ('✨✨', 0.8267654180526733), ('편안한', 0.825541079044342), (':)(:', 0.8169330358505249)]


In [16]:
# 10 nearest neighbours of the tag '발라드'.
print(transform(ft_model.get_nearest_neighbors(word_to_jamo('발라드'), k=10)))

[('7080발라드', 0.8921290040016174), ('RnB발라드', 0.8786559104919434), ('감상발라드', 0.8697788119316101), ('봄발라드', 0.8656306266784668), ('발라드돌', 0.8628354668617249), ('발라드_1', 0.8610336780548096), ('8090발라드', 0.8579784035682678), ('이별발라드', 0.8568528890609741), ('가요발라드', 0.8560466170310974), ('롹발라드', 0.852842390537262)]


In [17]:
# 10 nearest neighbours of the tag '잔잔한'.
print(transform(ft_model.get_nearest_neighbors(word_to_jamo('잔잔한'), k=10)))

[('잔잔', 0.8920840620994568), ('휴식', 0.8882982134819031), ('힐링', 0.8843399286270142), ('감성', 0.8690365552902222), ('새벽', 0.8612737655639648), ('휴일잔잔한', 0.8402321934700012), ('잔잔한새벽', 0.8368052840232849), ('✨✨', 0.8326659798622131), ('새벽감성잔잔한', 0.8305585980415344), (':)*', 0.8297936320304871)]


In [18]:
# 10 nearest neighbours of '드라아브'.
# NOTE(review): this looks like a deliberate misspelling of '드라이브' used to probe typo robustness; confirm.
print(transform(ft_model.get_nearest_neighbors(word_to_jamo('드라아브'), k=10)))

[('신나는드라이브', 0.6927951574325562), ('드라브하', 0.6903743147850037), ('셧업앤댄스', 0.6790946125984192), ('신나는', 0.6741485595703125), ('신나던가말던가', 0.6724916100502014), ('신나는POP', 0.6701136827468872), ('분위기띄울때', 0.6682744026184082), ('하우스_댄스', 0.6682590246200562), ('신나는리듬', 0.6662421822547913), ('기분전환댄스', 0.6641401648521423)]


In [19]:
# Load the preprocessed playlists; as the displayed records show, each one carries
# 'tokenized_title' and 'title_tags' next to the raw playlist fields.
train_json = load_json('../train_df(filtered_title(특수기호 안 없앰)).json')
train_json

[{'tags': ['힐링', '휴식', '밤', '새벽'],
  'id': 147668,
  'plylst_title': 'to. 힘들고 지친 분들에게',
  'songs': [663185,
   649626,
   6855,
   188486,
   348451,
   169945,
   512599,
   532114,
   454528,
   418935,
   124485,
   517372,
   549950,
   540588,
   500931,
   233641,
   331055,
   490266,
   268515,
   531820,
   413762,
   422713,
   215080,
   413189,
   577903,
   352228,
   630395,
   539109,
   152475,
   111865,
   7460,
   72432,
   572480,
   348092,
   324208,
   186039,
   376140,
   270269,
   622615,
   35001,
   444706,
   491303,
   408698,
   325979,
   25538,
   549392,
   473514,
   666814,
   118223,
   697100,
   333034,
   359279,
   421124,
   403253,
   27784,
   118049,
   339124,
   175073,
   522895,
   6925,
   615815,
   672550,
   379112,
   80972,
   227036,
   112153],
  'like_cnt': 12,
  'updt_date': '2016-06-23 10:06:27.000',
  'tokenized_title': ['힘들다', '지치다', '분들'],
  'title_tags': ['힐링', '휴식', '밤', '새벽', '힘들다', '지치다', '분들']},
 {'tags': ['팝'],
  'id

In [20]:
# Build the lookup tables used for recommendation
def get_dic(data):
    """Index playlists by id.

    Args:
        data: Iterable of playlist records with 'id', 'songs' and 'title_tags' keys.

    Returns:
        A ``(song_dic, title_tags_dic)`` pair of dicts that map a playlist id to
        its song list and to its title/tag token list respectively.
    """
    song_dic, title_tags_dic = {}, {}
    for playlist in tqdm(data):
        pid = playlist['id']
        song_dic[pid] = playlist['songs']
        title_tags_dic[pid] = playlist['title_tags']
    return song_dic, title_tags_dic

In [21]:
# Playlist id -> songs, and playlist id -> title/tag tokens, for every preprocessed playlist.
song_dic, title_tags_dic = get_dic(train_json)

100%|█████████████████████████████████████████████████████████████████████| 115071/115071 [00:00<00:00, 1402982.32it/s]


In [22]:
# Inspect the title/tag tokens per playlist id.
title_tags_dic

{147668: ['힐링', '휴식', '밤', '새벽', '힘들다', '지치다', '분들'],
 50422: ['팝', '130807-7'],
 116432: ['뉴에이지', '숙면', '슬프다', '마음', '달래', '피아노'],
 55076: ['하드락', '록스피릿', '댄스', '당신', '하얗다', '불', '태우다', '곡'],
 125064: ['힐링',
  '휴식',
  '기분전환',
  '스피커',
  '필수',
  'hiphop',
  '듣다',
  '꿀렁꿀렁',
  '싶다',
  '힙합',
  '음악',
  '!'],
 5747: ['요즘듣는노래', '요즘', '듣다', '노래', '2'],
 95441: ['카페', '인디', 'tthing', '하루'],
 36913: ['로우파이', '감성', '자기전에듣기좋은', '알앤비힙합', '분위기', 'lofi', 'rnb', '모음'],
 60923: ['댄스',
  'EDM',
  '가을',
  '하늘',
  'edm',
  '함께',
  '!',
  '일',
  '렉',
  '트릭',
  '스카이',
  '뮤직',
  '페스티벌',
  '프리뷰'],
 41197: ['발라드', '발라드', '감성', '재'],
 43789: ['휴식',
  '힐링',
  '여행',
  '산책',
  '♤',
  '겨울',
  '이면',
  '생각',
  '나',
  '늘다',
  '들려오다',
  '노래',
  '들다',
  '♤'],
 23330: ['밤', '까페', '새벽', '한중일', '꿀', '보이스', '삼대', '장'],
 28845: ['발라드', '늦다', '봄', '이별', '후'],
 3019: ['일렉', 'game'],
 34706: ['내한',
  '감성',
  '취향저격',
  '팝',
  '인디팝',
  '밴드',
  '레이니',
  '음색',
  '록',
  'LANY',
  'lany',
  'live',
  'in',
  'seoul',
  '2019',
  '내한

In [23]:
# Build one vector per playlist from its title_tags tokens
def update_p2v(data, model, vector_size=None):
    """Embed every playlist as the sum of the vectors of its title/tag tokens.

    Args:
        data: Iterable of playlist records with 'id' and 'title_tags' keys.
        model: Word-embedding model indexable by word (``model[word]``). Tokens
            are jamo-decomposed with ``word_to_jamo`` before the lookup.
        vector_size: Dimensionality of the playlist vectors. When omitted it is
            taken from the first playlist vector, falling back to 100 if no
            playlist could be embedded.

    Returns:
        A ``KeyedVectors`` keyed by playlist id. Playlists with no embeddable
        token are left out.
    """
    ids, vectors = [], []
    for playlist in tqdm(data):
        summed = None
        for token in playlist['title_tags']:
            try:
                word_vec = model[word_to_jamo(token)]
            except KeyError:
                # Best effort: a token the model has no vector for is skipped.
                continue
            # Start from a copy so the model's own storage is never modified.
            summed = np.array(word_vec) if summed is None else summed + word_vec
        # Only playlists with at least one embedded token get a vector.
        if summed is not None:
            ids.append(playlist['id'])
            vectors.append(summed)

    if vector_size is None:
        vector_size = len(vectors[0]) if vectors else 100
    p2v_model = KeyedVectors(vector_size)
    if ids:
        p2v_model.add_vectors(ids, vectors)
    return p2v_model

In [24]:
# Embed every preprocessed playlist with the fastText model.
p2v_model = update_p2v(train_json, ft_model)

100%|████████████████████████████████████████████████████████████████████████| 115071/115071 [00:16<00:00, 6982.80it/s]


In [25]:
# Persist the playlist vectors to the working directory.
p2v_model.save('p2v_vectors(fasttext_jamo epochs(10)).kv')

In [26]:
# Inspect the vector stored for playlist 147668.
p2v_model[147668]

array([ 1.6844838 , -0.8578702 , -0.6373483 ,  2.6938748 ,  1.2084887 ,
       -4.7462134 ,  2.0679982 , -1.4028711 ,  3.2639327 ,  2.7580867 ,
        0.6646374 , -1.3810099 ,  1.9379464 , -0.20586306,  0.13386358,
        1.7520128 , -3.062738  , -3.658289  , -1.6150994 , -2.0375445 ,
       -2.887646  , -0.49997634,  0.70263726, -0.4002414 , -4.363903  ,
       -2.8765025 ,  0.6550365 ,  0.554855  ,  9.614385  , -2.4246464 ,
       -5.77355   , -5.3090973 , -0.4202993 ,  3.4318104 , -0.8918614 ,
       -0.3005081 , -1.4787362 ,  1.181761  , -2.1070752 , -1.0923753 ,
       -4.295207  , -0.85243773,  0.22150937, -4.5358515 ,  0.56320465,
        1.3983339 , -0.50910294, -1.6693542 ,  1.0940903 ,  0.71832573,
       -0.57905185, -0.3277426 , -2.5568295 ,  2.3967576 , -0.6489626 ,
       -2.0277426 , -0.6960003 ,  1.7948511 , -0.06912716,  0.9393028 ,
        1.8855902 ,  1.4387033 ,  0.6683973 ,  3.4402504 ,  1.3752363 ,
        0.10065079,  2.1032493 ,  0.66247785,  0.07088572, -1.00

## 노래 시드가 없는 플레이리스트 확인

In [27]:
# Positional indices (usable with .iloc) of the rows of train_df whose song list is empty.
empty_songs = [pos for pos, songs in enumerate(train_df['songs']) if len(songs) == 0]

# Report whether any playlist has no seed songs.
print('빈 데이터 있음' if empty_songs else '빈 데이터 없음')

빈 데이터 있음


In [28]:
# Show the playlists that have no seed songs (selected by position).
train_df.iloc[empty_songs]

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
1,[],47308,1년 내내 듣는 좋은 노래,[],5,2018-01-27 15:57:04.000
6,[],122531,당신의 마음을 사로잡을 첼로 클래식,[],173,2017-12-11 10:50:07.000
7,[],141475,'멜로너 PICK!' 스테디 TRENDY POP (매주 업데이트),[],9973,2020-04-20 15:02:50.000
14,[],55946,좋은 꿈 꾸게 해주는 재즈,[],28,2017-07-30 10:47:48.000
16,[],19705,느낌적인 느낌 #2 EDM,[],0,2014-10-08 12:30:49.000
...,...,...,...,...,...,...
22972,[],152490,같이 들어볼래 실내악 작품들,[],0,2020-03-16 16:41:43.000
22973,[],3393,행복한 주말을 장식하는 부드러운 연주곡 모음,[],0,2019-04-29 13:04:30.000
22976,"[강렬한, 2010년대, 운동, 신나는]",18663,몸을 흔들게 하는 아이돌 댄스,[],310,2019-12-24 17:00:22.000
22998,[],74696,신나거나 감성적인 인디밴드.,[],14,2015-10-21 13:27:15.000


In [29]:
# Ids of the 100 playlists whose vectors are most similar to playlist 47308.
[x[0] for x in p2v_model.most_similar(47308, topn=100)]

[147188,
 13240,
 89876,
 46522,
 58230,
 25184,
 11170,
 69811,
 27630,
 148656,
 79282,
 110770,
 5041,
 107174,
 88440,
 66084,
 91758,
 53071,
 150984,
 136057,
 94139,
 5066,
 111140,
 55108,
 149833,
 25776,
 133891,
 115968,
 115746,
 13091,
 50220,
 36069,
 29658,
 81279,
 100125,
 54306,
 141029,
 66421,
 32590,
 54509,
 37354,
 38576,
 141284,
 58854,
 32686,
 134367,
 129593,
 43508,
 108128,
 113426,
 64487,
 118442,
 26292,
 137108,
 36070,
 1524,
 109504,
 6716,
 5956,
 62635,
 84259,
 31506,
 151501,
 9290,
 6974,
 138773,
 53874,
 128984,
 29517,
 123688,
 95564,
 34815,
 11652,
 20415,
 142687,
 3264,
 63126,
 124011,
 114835,
 69047,
 26152,
 23387,
 123981,
 79483,
 150795,
 96430,
 87853,
 79676,
 34746,
 54323,
 92594,
 121312,
 142780,
 5718,
 51804,
 61965,
 58702,
 151862,
 101903,
 48540]

In [30]:
# Playlists most similar to playlist 47308 ('1년 내내 듣는 좋은 노래'), shown in similarity order.
# The neighbour query is run once and reused for both the filter and the ordering.
similar_ids = [x[0] for x in p2v_model.most_similar(47308, topn=100)]
sample = train_df[train_df['id'].isin(similar_ids)].set_index('id')
sample.loc[similar_ids]

Unnamed: 0_level_0,tags,plylst_title,songs,like_cnt,updt_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
147188,"[힐링, 휴식, 기분전환]",요일별로 꺼내 들어요~ 1년 열두달을 노래한 팝과 가요,"[620240, 53892, 658780, 195379, 4564, 164590, ...",8,2015-05-27 18:14:25.000
13240,[기분전환],오랜만에듣는좋은곡,"[263857, 630826, 667123, 407082, 207432, 37020...",6,2016-02-03 21:23:51.000
89876,"[힐링, 추억, 휴식, 회상]",벌써 1년전 노래들,"[589242, 242411, 117470, 117546, 475515, 34820...",7,2015-09-11 21:51:46.000
46522,"[기분전환, 잔잔한]",좋아요 누르게 만드는 노래들,"[25547, 577814, 30028, 499277, 89620, 478730, ...",6,2015-07-15 23:58:36.000
58230,[],내가 듣는 노래,"[90557, 549178, 486705, 260696, 567991, 533410...",2,2015-04-20 01:12:59.000
...,...,...,...,...,...
61965,"[이별, 휴식, 가을]",가을날씨에 듣기 좋은 국내 노래들,"[367963, 120526, 305045, 232210, 218839, 21851...",2,2017-10-06 00:22:36.000
58702,"[추억, 가을, 회상]",가을에 차마시며 듣기 좋은 노래들,"[170206, 102445, 474751, 131909, 614240, 11759...",4,2014-09-02 19:08:03.000
151862,[],몇년 전 듣던 감성돋는 노래들,"[416865, 267452, 358818, 582849, 654764, 26987...",1,2014-08-13 14:17:25.000
101903,[안녕],내가 듣고 싶어 저장한 노래,"[333070, 254313, 671626, 216815, 540378, 25485...",5,2016-07-11 14:07:16.000


## Sample recommendation

In [31]:
# Recommend songs for playlist 47308 from its 100 most similar playlists.
most_id = [x[0] for x in p2v_model.most_similar(47308, topn=100)]
rec_result = {}
get_song = []
for ID in most_id:
    get_song += song_dic[ID]
# Keep the 200 songs that occur most often among the neighbours.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
get_song = list(pd.Series(get_song).value_counts().head(200).index)
# Drop songs the question playlist already contains, then keep the top 100.
rec_list = remove_seen(val_q_df[val_q_df['id']==47308]['songs'].values[0],get_song)[:100]
rec_result[47308] = rec_list
rec_result

{47308: [90557,
  549178,
  6546,
  643628,
  629738,
  133143,
  650494,
  407828,
  117470,
  449244,
  339802,
  481669,
  253571,
  25885,
  260220,
  675115,
  352313,
  261388,
  13198,
  183949,
  448618,
  153271,
  473570,
  152422,
  170317,
  348200,
  590012,
  346913,
  388487,
  477905,
  671830,
  200877,
  35703,
  699654,
  117546,
  529965,
  404995,
  237407,
  495557,
  567991,
  258919,
  222763,
  2410,
  320530,
  338622,
  400017,
  467615,
  324079,
  616212,
  682642,
  37748,
  59003,
  316497,
  609247,
  18900,
  243850,
  543606,
  448698,
  132117,
  474475,
  51612,
  694507,
  586464,
  508809,
  624299,
  29532,
  9033,
  464051,
  421683,
  567650,
  366786,
  188349,
  304720,
  57060,
  238525,
  453055,
  672550,
  586653,
  697364,
  150544,
  489337,
  610933,
  340679,
  238794,
  232305,
  173943,
  594592,
  595137,
  144663,
  170488,
  616285,
  579592,
  130955,
  358860,
  173686,
  508212,
  340253,
  83497,
  255658,
  248566]}

In [32]:
# Recommended songs for playlist 47308.
rec_list = rec_result.get(47308)
rec_list

[90557,
 549178,
 6546,
 643628,
 629738,
 133143,
 650494,
 407828,
 117470,
 449244,
 339802,
 481669,
 253571,
 25885,
 260220,
 675115,
 352313,
 261388,
 13198,
 183949,
 448618,
 153271,
 473570,
 152422,
 170317,
 348200,
 590012,
 346913,
 388487,
 477905,
 671830,
 200877,
 35703,
 699654,
 117546,
 529965,
 404995,
 237407,
 495557,
 567991,
 258919,
 222763,
 2410,
 320530,
 338622,
 400017,
 467615,
 324079,
 616212,
 682642,
 37748,
 59003,
 316497,
 609247,
 18900,
 243850,
 543606,
 448698,
 132117,
 474475,
 51612,
 694507,
 586464,
 508809,
 624299,
 29532,
 9033,
 464051,
 421683,
 567650,
 366786,
 188349,
 304720,
 57060,
 238525,
 453055,
 672550,
 586653,
 697364,
 150544,
 489337,
 610933,
 340679,
 238794,
 232305,
 173943,
 594592,
 595137,
 144663,
 170488,
 616285,
 579592,
 130955,
 358860,
 173686,
 508212,
 340253,
 83497,
 255658,
 248566]

In [33]:
# Answer row for playlist 47308.
val_a_df[val_a_df.id==47308]

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
1,"[브로콜리너마저리, 주말, 재즈, 드라이브브, 팝송, 차분한분, 조용한한, 인디]",47308,1년 내내 듣는 좋은 노래,"[194051, 511258, 595181, 204818, 91059, 453055...",5,2018-01-27 15:57:04.000


In [34]:
# Ground-truth songs for playlist 47308, in the {id: songs} shape that evaluate expects.
sample_answer = {}
sample_answer[47308] = val_a_df[val_a_df.id==47308].songs.values[0]
sample_answer

{47308: [194051,
  511258,
  595181,
  204818,
  91059,
  453055,
  248043,
  169945,
  512599,
  61159,
  555305,
  445984,
  5970,
  534818,
  339802,
  74131,
  357510,
  478754,
  555338,
  376360,
  75842,
  368069,
  669120,
  621690,
  667394,
  545089,
  134523,
  630395,
  347038,
  624607,
  705445,
  75971,
  520755,
  316742]}

In [35]:
# Recommendation result: metadata of the recommended songs, in recommendation order,
# keeping only the first row for each song name.
rec_df = meta[meta['song_id'].isin(rec_list)].set_index('song_id')
rec_df.loc[rec_list].drop_duplicates(['song_name'],keep='first')

Unnamed: 0_level_0,gnr_code,gnr_name,artist_name_basket,song_name
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
90557,GN0500,인디음악,['어쿠루브'],하고 싶은 말 (Feat. 한올)
549178,GN0100,발라드,['아이유'],금요일에 만나요 (Feat. 장이정 Of HISTORY)
6546,GN1500,OST,['어쿠스틱 콜라보'],"묘해, 너와"
643628,GN0100,발라드,['규현 (KYUHYUN)'],광화문에서 (At Gwanghwamun)
629738,GN0400,R&B/Soul,['브라운 아이즈'],벌써일년
...,...,...,...,...
508212,GN0400,R&B/Soul,['박효신'],바보
340253,GN0100,발라드,['노을'],하지 못한 말
83497,GN1100,일렉트로니카,['Icona Pop'],I Love It (Feat. Charli XCX)
255658,GN0400,R&B/Soul,['박진영'],너뿐이야


In [36]:
# Ground truth: metadata of the answer songs for playlist 47308, in answer order,
# keeping only the first row for each song name.
a_df = meta[meta['song_id'].isin(val_a_df[val_a_df.id==47308].songs.values[0])].set_index('song_id')
a_df.loc[val_a_df[val_a_df.id==47308].songs.values[0]].drop_duplicates(['song_name'],keep='first')

Unnamed: 0_level_0,gnr_code,gnr_name,artist_name_basket,song_name
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
194051,GN0600,록/메탈,['롤러 코스터'],어느 하루
511258,GN0100,발라드,['이문세'],옛사랑
595181,GN1800,뉴에이지,['Acoustic Cafe'],Last Carnival
204818,GN1400,포크/블루스/컨트리,['Don McLean'],Vincent
91059,GN0900,POP,['Connie Francis'],The Wedding Cake
453055,GN0500,인디음악,['뜨거운 감자'],고백
248043,GN0500,인디음악,['짙은'],TV Show
169945,GN0500,인디음악,['브로콜리너마저'],사랑한다는 말로도 위로가 되지 않는
512599,GN0500,인디음악,['브로콜리너마저'],울지마
61159,GN0500,인디음악,['짙은'],백야


In [37]:
# Score the single-playlist recommendation against its ground truth.
evaluate_func = evaluate(recs=rec_result, gt = sample_answer)
evaluate_func._evaluate()

MAP@100: 0.003447790599493386
NDCG@100: 0.04406585370576411
EntDiv@100: 4.605170185988082


## 전체 추천

In [38]:
# Ids of the validation question playlists to generate recommendations for.
val_q_df.id

0         75567
1         47308
2         45679
3         88612
4        117860
          ...  
23010    132039
23011     21475
23012     81196
23013    151600
23014      7706
Name: id, Length: 23015, dtype: int64

In [39]:
rec_result = {}
for i in tqdm(val_q_df.id):
    most_id = [x[0] for x in p2v_model.most_similar(i, topn=100)] # 제목과 비슷한 플레이리스트 상위 200개
    get_song = []
    for ID in most_id:
        get_song += song_dic[ID] # 제목과 비슷한 플레이르스 상위 200개 안에 등장한 곡 추출
    get_song = list(pd.value_counts(get_song)[:200].index) # 추출된 곡 중에 등장 횟수가 많은 곡 상위 200개
    # 중복된 곡 제거
    rec_list = remove_seen(val_q_df[val_q_df['id']==i]['songs'].values[0],get_song)[:100]
    rec_result[i] = rec_list
rec_result

100%|███████████████████████████████████████████████████████████████████████████| 23015/23015 [01:37<00:00, 236.80it/s]


{75567: [605317,
  495979,
  630653,
  92908,
  435471,
  386899,
  411147,
  29252,
  6054,
  517821,
  520755,
  321724,
  705515,
  16524,
  669668,
  136567,
  373387,
  596412,
  205251,
  466851,
  222610,
  82525,
  410482,
  165793,
  650367,
  335757,
  346967,
  645517,
  143092,
  390206,
  548623,
  173454,
  108731,
  453587,
  475142,
  264083,
  356141,
  272251,
  422547,
  379162,
  163319,
  125954,
  112823,
  533597,
  139639,
  146449,
  642065,
  230911,
  435258,
  502397,
  573942,
  4122,
  386341,
  45514,
  209838,
  143660,
  85096,
  628274,
  73963,
  697507,
  667356,
  364301,
  204794,
  474777,
  121607,
  55999,
  72881,
  282748,
  698012,
  676391,
  635566,
  237973,
  648813,
  534501,
  438778,
  450261,
  341026,
  58773,
  140951,
  138084,
  293716,
  20303,
  443513,
  477770,
  36623,
  176743,
  2979,
  649722,
  652247,
  84262,
  115256,
  180325,
  588606,
  608594,
  121458,
  428544,
  469288,
  690589,
  38467,
  25628],
 47308: [9055

In [40]:
def df2dict(df):
    """Map each playlist id in ``df`` to its song list.

    Args:
        df: DataFrame with 'id' and 'songs' columns.

    Returns:
        dict of ``{id: songs}``. When an id occurs more than once, the songs
        of its last row are kept.
    """
    # One pass over the two columns instead of filtering the frame once per row.
    return dict(zip(df['id'], df['songs']))

In [41]:
# Convert the answer data into a dictionary
# Ground-truth dictionary: {playlist id: answer songs}
answer_dict = df2dict(val_a_df)
answer_dict

{75567: [502397,
  568603,
  197626,
  345555,
  68157,
  116877,
  280153,
  565075,
  294480,
  115170],
 47308: [194051,
  511258,
  595181,
  204818,
  91059,
  453055,
  248043,
  169945,
  512599,
  61159,
  555305,
  445984,
  5970,
  534818,
  339802,
  74131,
  357510,
  478754,
  555338,
  376360,
  75842,
  368069,
  669120,
  621690,
  667394,
  545089,
  134523,
  630395,
  347038,
  624607,
  705445,
  75971,
  520755,
  316742],
 45679: [379787, 677020, 337541, 247563],
 88612: [542751,
  488425,
  1839,
  447058,
  688285,
  154095,
  599409,
  149760,
  315393,
  596055,
  23994,
  660956,
  17002,
  25696,
  233461,
  66416,
  532346,
  675773,
  339284,
  67521,
  78958,
  80801,
  639505,
  353562,
  4984,
  4706,
  510395],
 117860: [188132,
  304340,
  567886,
  367551,
  641531,
  506478,
  57784,
  194226,
  166707,
  163974,
  262651,
  513572,
  679061,
  428990,
  61972,
  650100,
  270141,
  345455,
  518555,
  150249,
  27756],
 135083: [318019,
  587299,
 

In [42]:
# Score the recommendations for all validation playlists against the answers.
evaluate_func = evaluate(recs=rec_result, gt = answer_dict)
evaluate_func._evaluate()

MAP@100: 0.0397196485581191
NDCG@100: 0.09029884178245796
EntDiv@100: 9.2391958406074


In [43]:
# Save the recommendations; write_json places the file under ./arena_data/.
write_json(rec_result, "fasttext_jamo_rec_result(epoch 10 (top100)).json")