In [1]:
import os
from tqdm import tqdm
import collections
import json
import io
import distutils.dir_util
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import gensim
import warnings
warnings.filterwarnings("ignore")

In [2]:
# arena_util.py
# -*- coding: utf-8 -*-

def write_json(data, fname):
    """Serialize ``data`` as UTF-8 JSON to ``../arena_data/results/<fname>``.

    The target directory (including any sub-directory contained in ``fname``)
    is created when it does not exist yet.

    Args:
        data: JSON-serializable object. NumPy integers, NumPy floats and
            NumPy arrays nested inside it are converted to plain Python
            values.
        fname: File name (optionally with sub-directories) relative to
            ``../arena_data/results/``.

    Raises:
        TypeError: If ``data`` contains an object that cannot be serialized.
    """
    def _conv(o):
        # json.dumps only calls this hook for objects it cannot encode itself.
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError(
            "Object of type {} is not JSON serializable".format(type(o).__name__)
        )

    # Serialize first so that a failure does not leave a truncated file behind.
    json_str = json.dumps(data, ensure_ascii=False, default=_conv)

    out_path = "../arena_data/results/" + fname
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with io.open(out_path, "w", encoding="utf-8") as f:
        f.write(json_str)


def load_json(fname):
    """Read the UTF-8 encoded JSON file at ``fname`` and return the decoded object."""
    with io.open(fname, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def debug_json(r):
    """Print ``r`` to stdout as JSON indented by 4 spaces, keeping non-ASCII characters readable."""
    pretty = json.dumps(r, indent=4, ensure_ascii=False)
    print(pretty)


def remove_seen(seen, l):
    """Return the items of ``l``, in their original order, that do not occur in ``seen``."""
    already_seen = set(seen)
    kept = []
    for item in l:
        if item in already_seen:
            continue
        kept.append(item)
    return kept


def most_popular(playlists, col, topk_count):
    """Count how often each value of ``col`` occurs across ``playlists``.

    Args:
        playlists: Iterable of dict-like playlists.
        col: Key whose value (an iterable of items) is counted for each playlist.
        topk_count: Number of most frequent items to return.

    Returns:
        A tuple ``(counter, topk)``: the full ``collections.Counter`` of item
        frequencies and the list of the ``topk_count`` most frequent items,
        most frequent first.
    """
    # The file imports the ``collections`` module, not the bare ``Counter`` name.
    c = collections.Counter()

    for doc in playlists:
        c.update(doc[col])

    topk = c.most_common(topk_count)
    return c, [k for k, _ in topk]

In [3]:
# Genre-based recommendations: load the previously saved results and display them.
results = load_json("../arena_data/results/genre_results.json")
results

[{'id': 75567,
  'songs': [705515,
   321724,
   335757,
   205939,
   650367,
   154648,
   495979,
   576262,
   165793,
   205251,
   517064,
   92908,
   216931,
   272251,
   517821,
   502397,
   356141,
   222610,
   424754,
   166705,
   6054,
   438778,
   264592,
   231334,
   111933,
   691652,
   679619,
   567046,
   596265,
   381403,
   144078,
   143660,
   411147,
   647421,
   529328,
   605317,
   190881,
   20303,
   85001,
   444796,
   13057,
   55999,
   104716,
   450120,
   113885,
   152394,
   275749,
   232143,
   520755,
   146449,
   29252,
   81484,
   507866,
   226901,
   353443,
   257300,
   311731,
   478504,
   447054,
   568603,
   16277,
   461291,
   300742,
   125954,
   32237,
   596412,
   635579,
   649722,
   695266,
   263199,
   573476,
   345737,
   366989,
   414711,
   517322,
   411273,
   454586,
   267194,
   435471,
   409667,
   200182,
   659377,
   698012,
   508382,
   252803,
   126538,
   533597,
   635566,
   564641,
   14532

## val_q, val_a 파일

In [4]:
val_q = load_json("../arena_data/questions/val.json") # questions: each playlist of the val data split at 50%
val_a = load_json("../arena_data/answers/val.json") # answer data

In [5]:
# Display the loaded question playlists.
val_q

[{'tags': ['로맨틱', '기분전환', '보컬재즈'],
  'id': 75567,
  'plylst_title': '진한 초콜릿처럼 달콤한 재즈보컬',
  'songs': [16641,
   93976,
   681892,
   481989,
   12662,
   430071,
   104363,
   158296,
   4298,
   642204],
  'like_cnt': 6,
  'updt_date': '2019-03-18 13:56:41.000'},
 {'tags': [],
  'id': 47308,
  'plylst_title': ' 1년 내내 듣는 좋은 노래',
  'songs': [],
  'like_cnt': 5,
  'updt_date': '2018-01-27 15:57:04.000'},
 {'tags': [],
  'id': 45679,
  'plylst_title': '모진이네 방(팝)',
  'songs': [690947, 156049, 18368, 135346],
  'like_cnt': 5,
  'updt_date': '2009-06-07 11:30:36.000'},
 {'tags': [],
  'id': 88612,
  'plylst_title': '처음부터 끝까지 다 좋아! 나만 알고싶은 팝송 노래들 :-)',
  'songs': [207390,
   112997,
   391057,
   138483,
   655745,
   684639,
   350034,
   331171,
   23381,
   391137,
   541764,
   121586,
   323722,
   185196,
   233927,
   552464,
   243579,
   680360,
   477501,
   506712,
   177414,
   256658,
   487867,
   67371,
   426365,
   47202],
  'like_cnt': 158,
  'updt_date': '2019-09-21 14:22:34

In [6]:
# Display the loaded answer playlists.
val_a

[{'tags': ['카페음악', '재즈', '보컬'],
  'id': 75567,
  'plylst_title': '진한 초콜릿처럼 달콤한 재즈보컬',
  'songs': [502397,
   568603,
   197626,
   345555,
   68157,
   116877,
   280153,
   565075,
   294480,
   115170],
  'like_cnt': 6,
  'updt_date': '2019-03-18 13:56:41.000'},
 {'tags': ['브로콜리너마저리', '주말', '재즈', '드라이브브', '팝송', '차분한분', '조용한한', '인디'],
  'id': 47308,
  'plylst_title': ' 1년 내내 듣는 좋은 노래',
  'songs': [194051,
   511258,
   595181,
   204818,
   91059,
   453055,
   248043,
   169945,
   512599,
   61159,
   555305,
   445984,
   5970,
   534818,
   339802,
   74131,
   357510,
   478754,
   555338,
   376360,
   75842,
   368069,
   669120,
   621690,
   667394,
   545089,
   134523,
   630395,
   347038,
   624607,
   705445,
   75971,
   520755,
   316742],
  'like_cnt': 5,
  'updt_date': '2018-01-27 15:57:04.000'},
 {'tags': ['팝'],
  'id': 45679,
  'plylst_title': '모진이네 방(팝)',
  'songs': [379787, 677020, 337541, 247563],
  'like_cnt': 5,
  'updt_date': '2009-06-07 11:30:36.000'},
 {'ta

## Evaluate

In [7]:
class ArenaEvaluator:
    """Score playlist recommendations (songs and tags) with nDCG.

    The combined score is ``0.85 * song nDCG + 0.15 * tag nDCG``, where each
    nDCG is averaged over all recommended playlists.
    """

    def _idcg(self, l):
        """Return the ideal DCG for ``l`` relevant items: sum of ``1 / ln(i + 2)`` for ``i < l``."""
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        # Cache the ideal DCG for 0..100 relevant items.
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        """Return the nDCG of the ranked list ``rec`` against the ground truth ``gt``.

        Args:
            gt: Ground-truth items (hashable, e.g. song ids or tag strings).
            rec: Recommended items, best first.

        Returns:
            DCG of ``rec`` divided by the ideal DCG for ``len(gt)`` items, or
            ``0.0`` when ``gt`` is empty (nothing could have been found, and
            the ideal DCG would be zero).
        """
        n_relevant = len(gt)
        if n_relevant == 0:
            return 0.0

        # Set membership keeps the loop linear in len(rec).
        relevant = set(gt)
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in relevant:
                dcg += 1.0 / np.log(i + 2)

        # Use the cached value when available; otherwise apply the same
        # formula so ground truths longer than the cache do not fail.
        if n_relevant < len(self._idcgs):
            idcg = self._idcgs[n_relevant]
        else:
            idcg = self._idcg(n_relevant)
        return dcg / idcg

    def _eval(self, gt_fname, rec_fname):
        """Validate a result file and compute its scores.

        Args:
            gt_fname: Path of the JSON file with the ground-truth playlists.
            rec_fname: Path of the JSON file with the recommended playlists.

        Returns:
            Tuple ``(music_ndcg, tag_ndcg, score)``.

        Raises:
            Exception: If the playlist ids differ between the two files, if
                any playlist does not have exactly 100 songs and 10 tags, or
                if a playlist contains duplicated songs or tags.
        """
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)

        gt_ids = set([g["id"] for g in gt_playlists])
        rec_ids = set([r["id"] for r in rec_playlists])

        # Both files must cover exactly the same playlists.
        if gt_ids != rec_ids:
            raise Exception("결과의 플레이리스트 수가 올바르지 않습니다.")

        rec_song_counts = [len(p["songs"]) for p in rec_playlists]
        rec_tag_counts = [len(p["tags"]) for p in rec_playlists]

        # Every playlist must recommend exactly 100 songs ...
        if set(rec_song_counts) != set([100]):
            raise Exception("추천 곡 결과의 개수가 맞지 않습니다.")

        # ... and exactly 10 tags.
        if set(rec_tag_counts) != set([10]):
            raise Exception("추천 태그 결과의 개수가 맞지 않습니다.")

        rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
        rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]

        # Duplicates inside a single playlist are not allowed.
        if set(rec_unique_song_counts) != set([100]):
            raise Exception("한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.")

        if set(rec_unique_tag_counts) != set([10]):
            raise Exception("한 플레이리스트에 중복된 태그 추천은 허용되지 않습니다.")

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        # Average over playlists, then weight songs 85% and tags 15%.
        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        """Print the song nDCG, tag nDCG and combined score for a result file.

        Any exception raised during evaluation is caught and its message is
        printed instead of being propagated.
        """
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)

## 메타 csv 파일

In [8]:
# Load the song metadata JSON into a DataFrame and display it.
# NOTE(review): absolute, machine-specific path; other cells use relative "../arena_data" paths.
song_meta = pd.read_json('D:/melon_playlist_continuation/res/song_meta.json', typ = 'frame')
song_meta

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2
3,"[GN1102, GN1101]",20151016,Feeling Right (Everything Is Nice) (Feat. Popc...,2644882,[838543],Feeling Right (Everything Is Nice) (Feat. Popc...,[GN1100],[Matoma],3
4,"[GN1802, GN1801]",20110824,그남자 그여자,2008470,[560160],그남자 그여자,[GN1800],[Jude Law],4
...,...,...,...,...,...,...,...,...,...
707984,[GN2001],19991219,The Best Best Of The Black President,65254,[166499],Coffin For Head Of State,[GN2000],[Fela Kuti],707984
707985,[GN0901],19860000,True Colors,44141,[11837],Change Of Heart,[GN0900],[Cyndi Lauper],707985
707986,"[GN0105, GN0101]",20160120,행보 2015 윤종신 / 작사가 윤종신 Live Part.1,2662866,[437],스치듯 안녕,[GN0100],[윤종신],707986
707987,"[GN1807, GN1801]",20131217,명상의 시간을 위한 뉴에이지 음악,2221722,[729868],숲의 빛,[GN1800],[Nature Piano],707987


In [9]:
# Load the meta CSV into a DataFrame and display it.
# NOTE(review): absolute, machine-specific path; other cells use relative "../arena_data" paths.
meta = pd.read_csv('D:/melon_playlist_continuation/res/meta.csv')
meta

Unnamed: 0,song_id,gnr_code,gnr_name,artist_name_basket,song_name
0,0,GN0900,POP,['Various Artists'],Feelings
1,1,GN1600,클래식,['Murray Perahia'],"Bach : Partita No. 4 In D Major, BWV 828 - II...."
2,2,GN0900,POP,['Peter Gabriel'],Solsbury Hill (Remastered 2002)
3,3,GN1100,일렉트로니카,['Matoma'],Feeling Right (Everything Is Nice) (Feat. Popc...
4,4,GN1800,뉴에이지,['Jude Law'],그남자 그여자
...,...,...,...,...,...
802854,707984,GN2000,월드뮤직,['Fela Kuti'],Coffin For Head Of State
802855,707985,GN0900,POP,['Cyndi Lauper'],Change Of Heart
802856,707986,GN0100,발라드,['윤종신'],스치듯 안녕
802857,707987,GN1800,뉴에이지,['Nature Piano'],숲의 빛


In [10]:
# Inspect the song_id column of the meta table.
meta['song_id']

0              0
1              1
2              2
3              3
4              4
           ...  
802854    707984
802855    707985
802856    707986
802857    707987
802858    707988
Name: song_id, Length: 802859, dtype: int64

## Word2Vec 기반 추천

In [11]:
# Load the saved Word2Vec model, then rebind the name to its KeyedVectors (.wv).
# From here on s2v_model is the vector store, not the full model.
s2v_model = Word2Vec.load("song2vec.model")
s2v_model = s2v_model.wv
print(s2v_model)

KeyedVectors<vector_size=100, 576124 keys>


In [12]:
# Inspect one example question playlist (index 15).
val_q[15]

{'tags': [],
 'id': 145150,
 'plylst_title': '이태원 클라쓰 OST 총집합',
 'songs': [671563, 568080, 574349, 80447, 206399, 310494, 98509],
 'like_cnt': 57,
 'updt_date': '2020-03-15 20:56:48.000'}

In [13]:
# Songs already present in the example question playlist.
q_list = val_q[15]['songs']
q_list

[671563, 568080, 574349, 80447, 206399, 310494, 98509]

In [14]:
# Look up the meta rows of the question songs, indexed by song_id.
q_df = meta[meta['song_id'].isin(q_list)].set_index('song_id')
# Show them in q_list order, keeping only the first row per song_name.
q_df.loc[q_list].drop_duplicates(['song_name'], keep='first')

Unnamed: 0_level_0,gnr_code,gnr_name,artist_name_basket,song_name
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
671563,GN1500,OST,['가호 (Gaho)'],시작
568080,GN1500,OST,['하현우 (국카스텐)'],돌덩이
574349,GN1500,OST,['Sondia'],Maybe
80447,GN2500,아이돌,['VERIVERY'],With Us
206399,GN1500,OST,['더 베인'],직진
310494,GN1500,OST,['Crush'],어떤 말도
98509,GN2500,아이돌,['V'],Sweet Night


In [15]:
# Inspect the answer playlist at the same index (15) as the example question.
val_a[15]

{'tags': ['조이서', '박새로이', '이태원클라쓰ost'],
 'id': 145150,
 'plylst_title': '이태원 클라쓰 OST 총집합',
 'songs': [506665, 480352, 164683, 296047, 205271, 414639, 254090],
 'like_cnt': 57,
 'updt_date': '2020-03-15 20:56:48.000'}

In [16]:
# Songs of the example answer playlist.
a_list = val_a[15]['songs']
a_list

[506665, 480352, 164683, 296047, 205271, 414639, 254090]

In [17]:
# Look up the meta rows of the answer songs, indexed by song_id.
a_df = meta[meta['song_id'].isin(a_list)].set_index('song_id')
# Show them in a_list order, keeping only the first row per song_name.
a_df.loc[a_list].drop_duplicates(['song_name'], keep='first')

Unnamed: 0_level_0,gnr_code,gnr_name,artist_name_basket,song_name
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
506665,GN1500,OST,['이찬솔'],Still Fighting It
480352,GN1500,OST,['Sondia'],우리의 밤
164683,GN1500,OST,['김우성 (The Rose)'],You Make Me Back
296047,GN1500,OST,['김필'],그때 그 아인
205271,GN1500,OST,['Sondia'],우린 친구뿐일까
414639,GN1500,OST,"['박성일', 'Fraktal']",Defence
254090,GN1500,OST,['윤미래'],Say


In [18]:
# Re-display the question songs that are used as the query below.
q_list

[671563, 568080, 574349, 80447, 206399, 310494, 98509]

In [19]:
# Query the song vectors for the 250 keys most similar to the question songs,
# passing all of them together as the positive examples.
topn = 250
positive_list = q_list
similar_song = s2v_model.most_similar(positive=positive_list, topn=topn)

In [20]:
# Display the (key, similarity) pairs returned by the query.
similar_song

[(164683, 0.9292504787445068),
 (506665, 0.9200552701950073),
 (194132, 0.9117345809936523),
 (414639, 0.9101796746253967),
 (296047, 0.9096884727478027),
 (480352, 0.893355667591095),
 (353209, 0.8893715143203735),
 (543609, 0.8874928951263428),
 (119920, 0.8857824802398682),
 (668448, 0.8857715725898743),
 (224541, 0.8844568729400635),
 (336942, 0.8842150568962097),
 (146623, 0.8812171220779419),
 (113313, 0.8804870843887329),
 (41275, 0.8804262280464172),
 (612750, 0.8803590536117554),
 (36680, 0.8803064823150635),
 (134937, 0.8799260258674622),
 (132274, 0.8798890113830566),
 (297118, 0.879626989364624),
 (445613, 0.8795228004455566),
 (338824, 0.8793085217475891),
 (397213, 0.8792327046394348),
 (634583, 0.8791577816009521),
 (477298, 0.8789722919464111),
 (471746, 0.878024697303772),
 (633830, 0.8780097365379333),
 (257777, 0.8778691291809082),
 (649288, 0.8777303695678711),
 (326149, 0.8775144219398499),
 (210154, 0.8774664402008057),
 (431215, 0.8774554133415222),
 (573215, 0.8

In [21]:
# Keep only the integer keys of the neighbours, best match first, capped at 100.
[song_id for song_id, _ in similar_song if isinstance(song_id, int)][:100]

[164683,
 506665,
 194132,
 414639,
 296047,
 480352,
 353209,
 543609,
 119920,
 668448,
 224541,
 336942,
 146623,
 113313,
 41275,
 612750,
 36680,
 134937,
 132274,
 297118,
 445613,
 338824,
 397213,
 634583,
 477298,
 471746,
 633830,
 257777,
 649288,
 326149,
 210154,
 431215,
 573215,
 240777,
 498040,
 172291,
 92977,
 608050,
 91460,
 242439,
 205271,
 396278,
 329306,
 160383,
 197141,
 469869,
 347012,
 34641,
 175525,
 207793,
 586663,
 199532,
 180001,
 498059,
 259016,
 444989,
 16284,
 609567,
 285649,
 533251,
 366620,
 636535,
 376404,
 133437,
 671779,
 166139,
 583605,
 459204,
 689782,
 510550,
 437049,
 94866,
 33072,
 443891,
 251271,
 144290,
 54019,
 20071,
 479035,
 704340,
 692507,
 561324,
 158870,
 480724,
 705390,
 104617,
 222240,
 323813,
 601199,
 254090,
 73455,
 215803,
 114409,
 432882,
 216351,
 668207,
 554051,
 471227,
 8488,
 632928]

In [22]:
# Top 100 integer keys among the neighbours, best match first.
similar_song_list = [song_id for song_id, _ in similar_song if isinstance(song_id, int)][:100]
# Fetch their song_meta rows, indexed by id and ordered like similar_song_list.
similar_song_df = song_meta[song_meta['id'].isin(similar_song_list)].set_index('id').loc[similar_song_list]
# Display them, keeping only the last row per song_name.
similar_song_df.drop_duplicates('song_name', keep='last')

Unnamed: 0_level_0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
194132,"[GN0105, GN1501, GN0101, GN1504]",20200302,날씨가 좋으면 찾아가겠어요 OST Part.2,10396689,[265067],시간의 문,"[GN1500, GN0100]",[정엽]
480352,"[GN0105, GN1501, GN0101, GN1504]",20200208,이태원 클라쓰 OST Part.4,10385902,[1023977],우리의 밤,"[GN1500, GN0100]",[Sondia]
353209,[],20200320,이태원 클라쓰 OST,10406113,[2860620],Defence,[],[Studio Curiosity]
543609,[],20200320,이태원 클라쓰 OST,10406113,[1866266],You Make Me Back,[],[김우성 (The Rose)]
119920,"[GN1501, GN1504]",20200313,하이에나 OST Part.5,10402582,[46751],FREAK (English Ver.),[GN1500],[이보람]
...,...,...,...,...,...,...,...,...
668207,[GN0401],20200227,Yearbook 2019,10394446,"[100148, 663878]",Murky Time,[GN0400],"[015B, 최예근]"
554051,[],20200320,이태원 클라쓰 OST,10406113,[2860620],이 남자의 밤,[],[Studio Curiosity]
471227,"[GN1501, GN1504]",20200215,간택 - 여인들의 전쟁 OST,10388456,[1627442],이렇게 돌아서면 안돼요,"[GN1500, GN0100]",[체리베리 (CherryBerry)]
8488,"[GN1501, GN1504]",20200215,드라마 &#39;터치&#39; OST,10388587,[2762956],For You,"[GN1500, GN0100]",[김소임]


In [23]:
# nDCG of the 100 similar songs against the answer songs of the example playlist.
evaluator = ArenaEvaluator()
evaluator._ndcg(a_list, similar_song_list)

0.8641499525490947

# 전체 추천

In [24]:
# Load the original train/val playlists and the genre-based results.
# NOTE(review): absolute, machine-specific paths; other cells use relative "../arena_data" paths.
train = load_json("D:/melon_playlist_continuation/arena_data/orig/train.json")
val = load_json("D:/melon_playlist_continuation/arena_data/orig/val.json")
# val_q = load_json("../arena_data/questions/val.json") # questions: each playlist of the val data split at 50%
# val_a = load_json("../arena_data/answers/val.json") # answer data
most_results = load_json("D:/melon_playlist_continuation/arena_data/results/genre_results.json")

In [25]:
# 데이터셋 만들기
def get_dic(train, val):
    """Build per-playlist lookup tables and the combined item lists.

    Args:
        train: List of playlist dicts with ``id``, ``songs`` and ``tags``.
        val: List of playlist dicts with the same keys.

    Returns:
        Tuple ``(song_dic, tag_dic, total)``:
        ``song_dic`` maps ``str(id)`` to the playlist's songs,
        ``tag_dic`` maps ``str(id)`` to the playlist's tags, and
        ``total`` holds, for every playlist with more than one item,
        its songs followed by its tags as one list.
    """
    playlists = train + val
    song_dic, tag_dic = {}, {}
    total = []
    for playlist in tqdm(playlists):
        key = str(playlist['id'])
        song_dic[key] = playlist['songs']
        tag_dic[key] = playlist['tags']

        # Songs first, then tags; lists with a single item (or none) are skipped.
        items = list(playlist['songs']) + list(playlist['tags'])
        if len(items) > 1:
            total.append(items)
    return song_dic, tag_dic, total

In [26]:
# Build the lookup tables from the training playlists plus the validation questions.
song_dic, tag_dic, total = get_dic(train, val_q)

100%|█████████████████████████████████████████████████████████████████████| 115071/115071 [00:00<00:00, 1018334.64it/s]


In [27]:
# Display the playlist-id -> songs mapping.
song_dic

{'147668': [663185,
  649626,
  6855,
  188486,
  348451,
  169945,
  512599,
  532114,
  454528,
  418935,
  124485,
  517372,
  549950,
  540588,
  500931,
  233641,
  331055,
  490266,
  268515,
  531820,
  413762,
  422713,
  215080,
  413189,
  577903,
  352228,
  630395,
  539109,
  152475,
  111865,
  7460,
  72432,
  572480,
  348092,
  324208,
  186039,
  376140,
  270269,
  622615,
  35001,
  444706,
  491303,
  408698,
  325979,
  25538,
  549392,
  473514,
  666814,
  118223,
  697100,
  333034,
  359279,
  421124,
  403253,
  27784,
  118049,
  339124,
  175073,
  522895,
  6925,
  615815,
  672550,
  379112,
  80972,
  227036,
  112153],
 '50422': [627035,
  256438,
  603324,
  200889,
  441319,
  216892,
  398848,
  477226,
  686809,
  397131,
  337151,
  439874,
  363226,
  256479],
 '116432': [129204,
  369497,
  649743,
  344619,
  110281,
  632662,
  32123,
  14636,
  545493,
  428001,
  406581,
  121077,
  589420,
  181312,
  40782],
 '55076': [677591,
  420396,
  1

In [28]:
# Display the playlist-id -> tags mapping.
tag_dic

{'147668': ['힐링', '휴식', '밤', '새벽'],
 '50422': ['팝'],
 '116432': ['뉴에이지'],
 '55076': ['하드락', '록스피릿', '댄스'],
 '125064': ['힐링', '휴식', '기분전환'],
 '5747': ['요즘듣는노래'],
 '95441': ['카페', '인디'],
 '36913': ['로우파이', '감성', '자기전에듣기좋은', '알앤비힙합'],
 '60923': ['댄스', 'EDM'],
 '41197': ['발라드'],
 '43789': ['휴식', '힐링', '여행', '산책'],
 '23330': ['밤', '까페', '새벽'],
 '28845': ['발라드'],
 '3019': ['일렉'],
 '34706': ['내한', '감성', '취향저격', '팝', '인디팝', '밴드', '레이니', '음색', '록', 'LANY'],
 '20353': ['카페', '피아노', '뉴에이지', '봄', '재즈'],
 '97508': ['슬픔', '이별', '설렘', '사랑'],
 '20064': ['감성', '밤', '알앤비'],
 '89500': ['몽롱',
  'Rock',
  '명곡',
  '락발라드',
  '해외락',
  '기분전환',
  'Soft_Rock',
  '락음악',
  '부드러운',
  '감성락'],
 '35350': ['히피'],
 '6004': ['발라드'],
 '109952': ['기분전환', '여행', '드라이브', '신나는'],
 '89216': ['아침', '운동', '드라이브', '여행', '기분전환', '스트레스', '신나는'],
 '43481': ['설렘', '사랑'],
 '124000': ['휴식', '힐링', '설렘', '사랑'],
 '38271': ['록메탈', 'Pop'],
 '42681': ['크리스마스', '재즈'],
 '42347': ['발라드'],
 '101751': ['잔잔한', '감성', '밤', '새벽', '취향저격', '발라드', '사랑'],

In [29]:
def update_p2v(train, val, w2v_model):
    """Build playlist vectors by summing the song vectors of each playlist.

    Args:
        train: List of playlist dicts with ``id`` and ``songs``.
        val: List of playlist dicts with the same keys.
        w2v_model: Song vector store exposing ``get_vector(key)``, which
            raises ``KeyError`` for unknown keys. Its ``vector_size`` is used
            for the playlist vectors (100 if the attribute is missing).

    Returns:
        A ``KeyedVectors`` keyed by ``str(playlist id)``. Playlists without
        any song known to ``w2v_model`` are left out.
    """
    # Match the dimensionality of the song vectors instead of assuming 100.
    p2v_model = KeyedVectors(getattr(w2v_model, "vector_size", 100))
    ids = []
    vecs = []
    for q in tqdm(train + val):
        playlist_vec = None
        for song in q['songs']:
            try:
                song_vec = w2v_model.get_vector(song)
            except KeyError:
                # Song has no vector in the model; it does not contribute.
                continue
            if playlist_vec is None:
                # Copy so the accumulator never shares memory with the model.
                playlist_vec = np.array(song_vec)
            else:
                playlist_vec = playlist_vec + song_vec
        # Only playlists with at least one embedded song get a vector.
        if playlist_vec is not None:
            ids.append(str(q['id']))
            vecs.append(playlist_vec)
    if ids:
        p2v_model.add_vectors(ids, vecs)
    return p2v_model

In [30]:
# Build playlist vectors for the training playlists plus the validation questions.
p2v_model = update_p2v(train, val_q, s2v_model)

100%|███████████████████████████████████████████████████████████████████████| 115071/115071 [00:07<00:00, 16382.21it/s]


In [31]:
# Display the training playlists.
train

[{'tags': ['힐링', '휴식', '밤', '새벽'],
  'id': 147668,
  'plylst_title': 'To. 힘들고 지친 분들에게',
  'songs': [663185,
   649626,
   6855,
   188486,
   348451,
   169945,
   512599,
   532114,
   454528,
   418935,
   124485,
   517372,
   549950,
   540588,
   500931,
   233641,
   331055,
   490266,
   268515,
   531820,
   413762,
   422713,
   215080,
   413189,
   577903,
   352228,
   630395,
   539109,
   152475,
   111865,
   7460,
   72432,
   572480,
   348092,
   324208,
   186039,
   376140,
   270269,
   622615,
   35001,
   444706,
   491303,
   408698,
   325979,
   25538,
   549392,
   473514,
   666814,
   118223,
   697100,
   333034,
   359279,
   421124,
   403253,
   27784,
   118049,
   339124,
   175073,
   522895,
   6925,
   615815,
   672550,
   379112,
   80972,
   227036,
   112153],
  'like_cnt': 12,
  'updt_date': '2016-06-23 10:06:27.000'},
 {'tags': ['팝'],
  'id': 50422,
  'plylst_title': '130807-7',
  'songs': [627035,
   256438,
   603324,
   200889,
   441319,


In [32]:
# Show the vector stored for playlist id '147668'.
p2v_model['147668']

array([-11.576088  ,  24.381552  ,   0.10528946,  -9.622889  ,
       -15.108481  ,   0.61264426, -13.112927  ,  37.628998  ,
        12.462299  , -27.288605  , -52.598152  , -49.133743  ,
        21.480146  ,  -6.686712  ,  -6.075652  , -17.695211  ,
       -59.87676   , -13.960207  ,  11.085902  ,  30.372698  ,
        -3.8226593 ,  -9.049601  ,  40.794464  ,  19.859266  ,
        -7.7476807 , -19.308197  ,  18.202553  ,  -8.750472  ,
         5.0270195 ,  12.436199  ,  29.956387  ,  -0.67459905,
       -10.004473  ,  25.825489  , -32.05513   ,  28.056665  ,
         7.383151  ,   1.1549448 ,  -9.9628935 ,  10.596948  ,
        29.608868  , -29.705307  ,   5.973928  ,  -1.2710578 ,
         1.4463532 , -12.306611  ,  -6.093837  ,   2.974916  ,
        24.091404  ,  25.675406  , -26.125843  , -11.6440935 ,
       -12.454374  , -11.837076  , -25.809643  ,  -8.515902  ,
       -37.916756  ,   8.935354  ,  -8.772457  ,  14.346536  ,
        15.347422  ,  14.126703  , -37.20977   , -10.84

In [33]:
# Query the 200 playlist ids most similar to playlist '114575'.
p2v_model.most_similar('114575', topn=200)

[('99015', 0.9633772969245911),
 ('152593', 0.9279089570045471),
 ('92499', 0.9229923486709595),
 ('36146', 0.9118280410766602),
 ('44030', 0.9109042882919312),
 ('50288', 0.9077237248420715),
 ('103001', 0.9063662886619568),
 ('104955', 0.9056950807571411),
 ('123191', 0.9041216969490051),
 ('46295', 0.9027281999588013),
 ('134295', 0.9019865393638611),
 ('96191', 0.901592493057251),
 ('10472', 0.9015558958053589),
 ('45229', 0.9008762240409851),
 ('123900', 0.9002885222434998),
 ('48420', 0.9001287817955017),
 ('1309', 0.8993522524833679),
 ('44114', 0.8987641334533691),
 ('5931', 0.8985286951065063),
 ('33503', 0.8981605768203735),
 ('39423', 0.8973594307899475),
 ('116323', 0.8973559141159058),
 ('40355', 0.8973527550697327),
 ('21218', 0.8973256945610046),
 ('29143', 0.8973255753517151),
 ('55918', 0.8949992656707764),
 ('28311', 0.8948661088943481),
 ('39352', 0.8945904970169067),
 ('54195', 0.8945444226264954),
 ('132150', 0.8941131234169006),
 ('94143', 0.8939981460571289),
 ('

In [34]:
def _most_frequent(items, k):
    """Return up to ``k`` distinct values of ``items``, most frequent first."""
    if not items:
        return []
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    return pd.Series(items).value_counts().head(k).index.tolist()


def get_result(p2v_model, song_dic, tag_dic, most_results, val):
    """Recommend 100 songs and 10 tags for every playlist in ``val``.

    For each playlist the 100 most similar playlists are looked up in
    ``p2v_model``; the songs and tags occurring most often among them are
    recommended, excluding those the playlist already contains. Playlists
    that cannot be looked up fall back to their entry in ``most_results``,
    and lists that are still too short are topped up from that same entry.

    Args:
        p2v_model: Playlist vector store exposing
            ``most_similar(key, topn=...)`` and raising ``KeyError`` for
            unknown keys.
        song_dic: Mapping ``str(playlist id)`` -> list of songs.
        tag_dic: Mapping ``str(playlist id)`` -> list of tags.
        most_results: List of fallback dicts with ``id``, ``songs``, ``tags``.
            Entries are matched to playlists by ``id``, not by position.
        val: List of playlist dicts with ``id``, ``songs`` and ``tags``.

    Returns:
        List of ``{"id", "songs", "tags"}`` dicts in the order of ``val``.
        A list may be shorter than 100 / 10 when the fallback entry is
        missing or cannot supply enough unseen items.
    """
    # Index the fallback once; keep the first entry when an id repeats.
    fallback_by_id = {}
    for item in most_results:
        fallback_by_id.setdefault(item["id"], item)

    answers = []
    for q in tqdm(val, total=len(val)):
        fallback = fallback_by_id.get(q["id"], {})
        try:
            # Ids of the most similar playlists.
            similar_ids = [x[0] for x in p2v_model.most_similar(str(q["id"]), topn=100)]
            pooled_songs = []
            pooled_tags = []
            for playlist_id in similar_ids:
                pooled_songs += song_dic[playlist_id]
                pooled_tags += tag_dic[playlist_id]
        except KeyError:
            # Playlist (or one of its neighbours) is unknown: use the fallback.
            # Copy so that topping up below never mutates ``most_results``.
            songs = list(fallback.get("songs", []))
            tags = list(fallback.get("tags", []))
        else:
            # Most frequent songs / tags among the similar playlists,
            # without the ones already present in the playlist itself.
            songs = remove_seen(q["songs"], _most_frequent(pooled_songs, 200))[:100]
            tags = remove_seen(q["tags"], _most_frequent(pooled_tags, 20))[:10]

        # Top up short lists from this playlist's own fallback entry.
        if len(songs) < 100:
            songs += remove_seen(songs, fallback.get("songs", []))[:100 - len(songs)]
        if len(tags) < 10:
            tags += remove_seen(tags, fallback.get("tags", []))[:10 - len(tags)]

        answers.append({
            "id": q["id"],
            "songs": songs,
            "tags": tags,
        })
    return answers

In [35]:
# Generate recommendations for every validation question playlist.
answers = get_result(p2v_model,song_dic, tag_dic, most_results, val_q)

100%|███████████████████████████████████████████████████████████████████████████| 23015/23015 [01:35<00:00, 241.19it/s]


## Evaluate

In [36]:
# Save the recommendations; write_json places the file under ../arena_data/results/.
write_json(answers, "s2v_results (songs only).json")

In [37]:
# Score the saved recommendations against the validation answers
# (prints music nDCG, tag nDCG and the combined score).
evaluator.evaluate("../arena_data/answers/val.json", "../arena_data/results/s2v_results (songs only).json")

Music nDCG: 0.178413
Tag nDCG: 0.406895
Score: 0.212686


In [38]:
# Load the validation answers and show the first 50 playlists.
# NOTE(review): this rebinds val_q to the *answers* file; from here on val_q no
# longer holds the questions, so re-running an earlier cell that uses val_q
# without restarting would feed it the answers. Confirm this is intended.
val_q = load_json("../arena_data/answers/val.json")
val_q[:50]

[{'tags': ['카페음악', '재즈', '보컬'],
  'id': 75567,
  'plylst_title': '진한 초콜릿처럼 달콤한 재즈보컬',
  'songs': [502397,
   568603,
   197626,
   345555,
   68157,
   116877,
   280153,
   565075,
   294480,
   115170],
  'like_cnt': 6,
  'updt_date': '2019-03-18 13:56:41.000'},
 {'tags': ['브로콜리너마저리', '주말', '재즈', '드라이브브', '팝송', '차분한분', '조용한한', '인디'],
  'id': 47308,
  'plylst_title': ' 1년 내내 듣는 좋은 노래',
  'songs': [194051,
   511258,
   595181,
   204818,
   91059,
   453055,
   248043,
   169945,
   512599,
   61159,
   555305,
   445984,
   5970,
   534818,
   339802,
   74131,
   357510,
   478754,
   555338,
   376360,
   75842,
   368069,
   669120,
   621690,
   667394,
   545089,
   134523,
   630395,
   347038,
   624607,
   705445,
   75971,
   520755,
   316742],
  'like_cnt': 5,
  'updt_date': '2018-01-27 15:57:04.000'},
 {'tags': ['팝'],
  'id': 45679,
  'plylst_title': '모진이네 방(팝)',
  'songs': [379787, 677020, 337541, 247563],
  'like_cnt': 5,
  'updt_date': '2009-06-07 11:30:36.000'},
 {'ta

In [39]:
# Inspect the playlist at index 56 (val_q was loaded from the answers file in the previous cell).
val_q[56]

{'tags': ['감성', '발라드', '슬픔', '인디', '새벽'],
 'id': 39565,
 'plylst_title': '네 생각 끝자락을 쥐고 듣는 몽그러운 밤수성',
 'songs': [465914,
  640993,
  337962,
  333143,
  358492,
  254664,
  15418,
  110014,
  564603,
  91162,
  382632,
  105383,
  18100,
  234953,
  84296,
  191723,
  403279,
  494759,
  53506,
  360099,
  127443,
  432146,
  323937,
  552651,
  353473,
  589974,
  576085,
  4975,
  396924,
  317833,
  70851,
  493864,
  481802,
  292823,
  434717,
  370191,
  305678,
  595427,
  255047,
  115222,
  392443,
  27441,
  647734,
  705356,
  422111,
  641041,
  534615,
  146609,
  110232,
  584978,
  478793,
  280614,
  478287],
 'like_cnt': 10,
 'updt_date': '2019-11-30 19:57:16.000'}

In [40]:
# Songs of the playlist at index 56 and their song_meta rows, indexed by id.
q_list = val_q[56]['songs']
q_df = song_meta[song_meta['id'].isin(q_list)].set_index('id')
# Show them in q_list order, keeping only the last row per song_name.
q_df.loc[q_list].drop_duplicates('song_name', keep='last')

Unnamed: 0_level_0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
465914,"[GN0105, GN0101, GN2505, GN2503, GN1501, GN250...",20130613,여왕의 교실 OST `두 번째 서랍`,2188763,[257449],두번째 서랍 (The 2nd Drawer),"[GN2500, GN1500, GN0100]",[써니 (SUNNY)]
640993,"[GN1501, GN1504, GN0303, GN0301]",20150624,가면 OST Part.4,2325826,"[730650, 27652]",죽은 듯이 지낼게,"[GN1500, GN0300]","[챈슬러 (Chancellor), BILL STAX (빌스택스)]"
337962,"[GN0401, GN0403, GN0402]",20151022,무표정,2646047,[880881],무표정,[GN0400],[레트로펑키]
333143,"[GN0509, GN0502, GN0801, GN0501]",20151102,기쁨이 돼줄게,2647953,[782798],기쁨이 돼줄게,"[GN0500, GN0800]",[피그말리온]
358492,"[GN0105, GN1501, GN0101, GN1504]",20160317,돌아와요 아저씨 OST Part.4,2673644,[629831],사랑이니까,"[GN1500, GN0100]",[에일리]
254664,"[GN0105, GN2505, GN2501, GN0101, GN2503]",20160601,STREET,2688323,[630454],"여름, 가을, 겨울, 봄","[GN2500, GN0100]",[EXID]
15418,"[GN0105, GN0101]",20160701,COLOR,2694960,[693951],비가 내리면 (Feat. 라비 Of 빅스),[GN0100],[멜로디데이 (MelodyDay)]
110014,"[GN0303, GN0301]",20160723,네가 보여,2699636,[735541],네가 보여 (Feat. JUNIK),[GN0300],[딘딘]
564603,"[GN0303, GN0301]",20160908,감정기복 II Part.3 : 심리치료 (Psychotherapy),2709981,[236970],24,[GN0300],[스윙스]
91162,"[GN0401, GN0403]",20161121,Talking To The Moon,10016593,[744140],선물,[GN0400],[KREAM (크림)]


In [42]:
# Reload the recommendations that were saved above.
s2v_results = load_json("../arena_data/results/s2v_results (songs only).json")

In [43]:
# Inspect the saved recommendation at index 56.
s2v_results[56]

{'id': 39565,
 'songs': [144663,
  116573,
  357367,
  366786,
  654757,
  133143,
  675115,
  349492,
  463173,
  396828,
  42155,
  461341,
  174749,
  701557,
  610933,
  520093,
  13281,
  418935,
  449244,
  650494,
  680366,
  485155,
  549178,
  11657,
  169984,
  523521,
  648628,
  422915,
  187047,
  547967,
  422077,
  625875,
  350309,
  215411,
  442014,
  132994,
  427724,
  300087,
  627363,
  581799,
  253755,
  668128,
  339802,
  348200,
  663256,
  26083,
  505036,
  643628,
  582252,
  448116,
  37748,
  199262,
  235773,
  339124,
  140867,
  341513,
  68348,
  407828,
  209135,
  209993,
  493762,
  105140,
  487911,
  509998,
  531820,
  672550,
  27469,
  157055,
  232874,
  152422,
  75842,
  473514,
  519391,
  377243,
  224921,
  295250,
  446812,
  678762,
  351342,
  464051,
  246531,
  146989,
  117595,
  15318,
  205179,
  108004,
  645489,
  152475,
  302646,
  590012,
  95323,
  13198,
  343974,
  236393,
  333595,
  6546,
  88503,
  443914,
  459256,
 

In [44]:
# First 100 recommended songs at index 56 and their song_meta rows, indexed by id.
# Note: this reuses the name q_df, replacing the frame built for the playlist's own songs.
q_result = s2v_results[56]['songs'][:100]
q_df = song_meta[song_meta['id'].isin(q_result)].set_index('id')
# Show them in recommendation order, keeping only the first row per song_name.
q_df.loc[q_result].drop_duplicates(['song_name'],keep='first')

Unnamed: 0_level_0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
144663,"[GN0105, GN0101]",20170324,밤편지,10047890,[261143],밤편지,[GN0100],[아이유]
116573,"[GN0501, GN0601, GN0503, GN0606, GN0509]",20111123,Lo9ve3r4s,2038488,[242988],안아줘,"[GN0500, GN0600]",[정준일]
357367,"[GN0401, GN0403]",20160621,비,2692501,[752425],비,[GN0400],[폴킴]
366786,"[GN0805, GN0509, GN0502, GN0801, GN0501]",20101007,가을방학,1035872,[437760],가끔 미치도록 네가 안고 싶어질 때가 있어,"[GN0500, GN0800]",[가을방학]
654757,"[GN1501, GN1504]",20041115,미안하다 사랑한다 OST,43841,[1191],눈의 꽃,[GN1500],[박효신]
...,...,...,...,...,...,...,...,...
6546,"[GN0105, GN1501, GN0101, GN1504]",20140902,연애의 발견 OST Part 4,2279896,[192827],"묘해, 너와","[GN1500, GN0100]",[어쿠스틱 콜라보]
88503,"[GN0104, GN1501, GN0101, GN1504]",20050120,쾌걸춘향 OST,48736,[164914],응급실,"[GN1500, GN0100]",[Izi]
443914,"[GN2503, GN0205, GN2501, GN2506, GN0201]",20150723,여자친구 2nd Mini Album `Flower Bud`,2330981,[828478],오늘부터 우리는 (Me Gustas Tu),"[GN2500, GN0200]",[여자친구 (GFRIEND)]
459256,[GN0901],20141110,Uptown Funk (Feat. Bruno Mars),2290446,[45077],Uptown Funk (Feat. Bruno Mars),[GN0900],[Mark Ronson]
