In [1]:
import os
import json
import pandas as pd
import seaborn as sns
import scipy
from scipy.io import mmwrite
from scipy import sparse
import numpy as np
import random
from matplotlib import pyplot as plt
from datetime import datetime
from tqdm import tqdm
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.data import Dataset
%matplotlib inline

from IPython.display import Image

import warnings
warnings.filterwarnings("ignore")



In [2]:
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py

class evaluate():
    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt 
        self.topn = topn 
        
    def _ndcg(self):
        Q, S = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            dcg = 0.0
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            for i, r in enumerate(rec):
                if r not in seen:
                    continue
                rank = i + 1
                dcg += 1.0 / math.log(rank + 1, 2)
            ndcg = dcg / idcg
            S += ndcg
            Q += 1
        return S / Q


    def _map(self):
        n, ap = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            ap += _ap
            n += 1.0
        return ap / n


    def _entropy_diversity(self):
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for u, rec in six.iteritems(self.recs):
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in six.itervalues(freq)])
        return ent
    
    def _evaluate(self):
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))

In [3]:
song_meta = pd.read_json('D:/melon_playlist_continuation/res/song_meta.json', typ = 'frame')
song_meta

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2
3,"[GN1102, GN1101]",20151016,Feeling Right (Everything Is Nice) (Feat. Popc...,2644882,[838543],Feeling Right (Everything Is Nice) (Feat. Popc...,[GN1100],[Matoma],3
4,"[GN1802, GN1801]",20110824,그남자 그여자,2008470,[560160],그남자 그여자,[GN1800],[Jude Law],4
...,...,...,...,...,...,...,...,...,...
707984,[GN2001],19991219,The Best Best Of The Black President,65254,[166499],Coffin For Head Of State,[GN2000],[Fela Kuti],707984
707985,[GN0901],19860000,True Colors,44141,[11837],Change Of Heart,[GN0900],[Cyndi Lauper],707985
707986,"[GN0105, GN0101]",20160120,행보 2015 윤종신 / 작사가 윤종신 Live Part.1,2662866,[437],스치듯 안녕,[GN0100],[윤종신],707986
707987,"[GN1807, GN1801]",20131217,명상의 시간을 위한 뉴에이지 음악,2221722,[729868],숲의 빛,[GN1800],[Nature Piano],707987


In [4]:
song_meta = song_meta[['id','song_gn_gnr_basket','artist_name_basket','issue_date']]
song_meta['genre'] = [', '.join(map(str, l)) for l in song_meta['song_gn_gnr_basket']]
song_meta['artist'] = [', '.join(map(str, l)) for l in song_meta['artist_name_basket']]
song_meta['issue_date'] = song_meta['issue_date'].apply(lambda x : str(x)[:4])
song_meta = song_meta[['id','genre']] 
song_meta

Unnamed: 0,id,genre
0,0,GN0900
1,1,GN1600
2,2,GN0900
3,3,GN1100
4,4,GN1800
...,...,...
707984,707984,GN2000
707985,707985,GN0900
707986,707986,GN0100
707987,707987,GN1800


In [6]:
meta = pd.read_csv('D:/melon_playlist_continuation/res/meta.csv')
meta

Unnamed: 0,song_id,gnr_code,gnr_name,artist_name_basket,song_name
0,0,GN0900,POP,['Various Artists'],Feelings
1,1,GN1600,클래식,['Murray Perahia'],"Bach : Partita No. 4 In D Major, BWV 828 - II...."
2,2,GN0900,POP,['Peter Gabriel'],Solsbury Hill (Remastered 2002)
3,3,GN1100,일렉트로니카,['Matoma'],Feeling Right (Everything Is Nice) (Feat. Popc...
4,4,GN1800,뉴에이지,['Jude Law'],그남자 그여자
...,...,...,...,...,...
802854,707984,GN2000,월드뮤직,['Fela Kuti'],Coffin For Head Of State
802855,707985,GN0900,POP,['Cyndi Lauper'],Change Of Heart
802856,707986,GN0100,발라드,['윤종신'],스치듯 안녕
802857,707987,GN1800,뉴에이지,['Nature Piano'],숲의 빛


In [7]:
genre_gn_all = pd.read_json('D:/melon_playlist_continuation/res/genre_gn_all.json', typ = 'series')
genre_gn_all

GN0100       발라드
GN0101    세부장르전체
GN0102       '80
GN0103       '90
GN0104       '00
           ...  
GN2900       뮤지컬
GN2901    세부장르전체
GN2902     국내뮤지컬
GN2903     국외뮤지컬
GN3000     크리스마스
Length: 254, dtype: object

In [8]:
# 장르코드 : gnr_code, 장르명 : gnr_name
genre_gn_all = pd.DataFrame(genre_gn_all, columns = ['gnr_name']).reset_index().rename(columns = {'index' : 'gnr_code'})
genre_gn_all

Unnamed: 0,gnr_code,gnr_name
0,GN0100,발라드
1,GN0101,세부장르전체
2,GN0102,'80
3,GN0103,'90
4,GN0104,'00
...,...,...
249,GN2900,뮤지컬
250,GN2901,세부장르전체
251,GN2902,국내뮤지컬
252,GN2903,국외뮤지컬


In [9]:
# 장르코드 뒷자리 두 자리가 00인 코드를 필터링
gnr_code = genre_gn_all[genre_gn_all['gnr_code'].str[-2:] == '00']
gnr_code

Unnamed: 0,gnr_code,gnr_name
0,GN0100,발라드
6,GN0200,댄스
12,GN0300,랩/힙합
18,GN0400,R&B/Soul
22,GN0500,인디음악
32,GN0600,록/메탈
39,GN0700,성인가요
50,GN0800,포크/블루스
56,GN0900,POP
65,GN1000,록/메탈


In [10]:
# 총 30개 대분류 장르 코드가 존재함
gnr_code['gnr_code'].nunique()

30

In [16]:
train = pd.read_json("D:/melon_playlist_continuation/arena_data/orig/train.json")
val_q_df = pd.read_json("D:/melon_playlist_continuation/arena_data/questions/val.json") # val 데이터의 각 플레이스트를 50%로 분할
val_a_df = pd.read_json("D:/melon_playlist_continuation/arena_data/answers/val.json") # 정답 데이터

In [17]:
train

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[힐링, 휴식, 밤, 새벽]",147668,To. 힘들고 지친 분들에게,"[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27.000
1,[팝],50422,130807-7,"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11.000
2,[뉴에이지],116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50.000
3,"[하드락, 록스피릿, 댄스]",55076,당신을 하얗게 불태울 곡들,"[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25.000
4,"[힐링, 휴식, 기분전환]",125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50.000
...,...,...,...,...,...,...
92051,"[90년생, 회상, 추억, 좋은노래, 80년생, 옛날노래]",149690,옛날노래 * 좋은노래 8090년생 노래 모음,"[292099, 513963, 174225, 287212, 140444, 62469...",155,2020-01-15 15:15:45.000
92052,[팝],35004,LOVE 1,"[62596, 359718, 596004, 668790, 291212, 148977...",8,2010-03-23 00:03:00.000
92053,"[여행, 발라드, 기분전환, 사랑]",59765,추억의 2004년 발라드 베스트,"[214372, 145150, 407082, 160552, 102445, 50845...",3,2019-05-15 13:26:07.000
92054,"[소울, 알앤비]",9867,All Music Guide 선정 90s R&B: 1997,"[561958, 397574, 250915, 110345, 426772, 10698...",51,2013-12-24 14:40:01.000


In [18]:
val_q_df

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[로맨틱, 기분전환, 보컬재즈]",75567,진한 초콜릿처럼 달콤한 재즈보컬,"[16641, 93976, 681892, 481989, 12662, 430071, ...",6,2019-03-18 13:56:41.000
1,[],47308,1년 내내 듣는 좋은 노래,[],5,2018-01-27 15:57:04.000
2,[],45679,모진이네 방(팝),"[690947, 156049, 18368, 135346]",5,2009-06-07 11:30:36.000
3,[],88612,처음부터 끝까지 다 좋아! 나만 알고싶은 팝송 노래들 :-),"[207390, 112997, 391057, 138483, 655745, 68463...",158,2019-09-21 14:22:34.000
4,"[CCM, withyou]",117860,WOMEN of 워십,"[544728, 572867, 228845, 48898, 364551, 226863...",8,2018-04-12 14:14:30.000
...,...,...,...,...,...,...
23010,[],132039,이것만 알아도 브릿팝 안다는 소리듣는다.,"[8778, 5725, 234307, 64368, 691112, 581349, 40...",232,2014-03-10 19:08:16.000
23011,[],21475,make015,"[470489, 72225, 387567, 328452, 517417, 690761...",0,2016-05-16 15:34:18.000
23012,"[힘들때, 지칠때, 힘내]",81196,"힘든 걸 알아, 말해주고 싶었어요","[448250, 131741, 531820, 616260, 639943, 13129...",13,2017-12-20 16:32:07.000
23013,[],151600,♥Christmas Music♥,[],8,2015-12-23 09:38:19.000


In [19]:
# query 안에 곡 리스트가 없는 플레이리스트 개수
len(val_q_df[[val_q_df['songs'][i]==[] for i in range(len(val_q_df))]])

4606

In [20]:
val_a_df

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[카페음악, 재즈, 보컬]",75567,진한 초콜릿처럼 달콤한 재즈보컬,"[502397, 568603, 197626, 345555, 68157, 116877...",6,2019-03-18 13:56:41.000
1,"[브로콜리너마저리, 주말, 재즈, 드라이브브, 팝송, 차분한분, 조용한한, 인디]",47308,1년 내내 듣는 좋은 노래,"[194051, 511258, 595181, 204818, 91059, 453055...",5,2018-01-27 15:57:04.000
2,[팝],45679,모진이네 방(팝),"[379787, 677020, 337541, 247563]",5,2009-06-07 11:30:36.000
3,"[감성, 잔잔한, 드라이브, 여행, 유명한, 휴식, 기분전환, 사랑, 팝송, 신나는]",88612,처음부터 끝까지 다 좋아! 나만 알고싶은 팝송 노래들 :-),"[542751, 488425, 1839, 447058, 688285, 154095,...",158,2019-09-21 14:22:34.000
4,"[워십, 여성]",117860,WOMEN of 워십,"[188132, 304340, 567886, 367551, 641531, 50647...",8,2018-04-12 14:14:30.000
...,...,...,...,...,...,...
23010,[락],132039,이것만 알아도 브릿팝 안다는 소리듣는다.,"[637170, 217266, 131578, 681882, 560854, 69906...",232,2014-03-10 19:08:16.000
23011,[이승기],21475,make015,"[241087, 370470, 315382, 184658, 27758, 538124...",0,2016-05-16 15:34:18.000
23012,"[다시일어나요, 슬픔, 12월]",81196,"힘든 걸 알아, 말해주고 싶었어요","[267760, 207672, 196295, 683700, 77547, 471716...",13,2017-12-20 16:32:07.000
23013,"[겨울, 설렘, 사랑]",151600,♥Christmas Music♥,"[3559, 688721, 74901, 54047, 94991, 424136, 17...",8,2015-12-23 09:38:19.000


In [21]:
len(val_a_df.iloc[3]['songs'])

27

In [22]:
def plylist_song_map(df, col, col1):
    # 플레이리스트 아이디(id)와 수록곡(songs) 추출
    plylst_song_map = df[[col, col1]]

    # unnest songs
    plylst_song_map_unnest = np.dstack(
        (
            np.repeat(plylst_song_map.id.values, list(map(len, plylst_song_map.songs))), 
            np.concatenate(plylst_song_map.songs.values)
        )
    )

    # unnested 데이터프레임 생성 : plylst_song_map
    plylst_song_map = pd.DataFrame(data = plylst_song_map_unnest[0], columns = plylst_song_map.columns)
    plylst_song_map[col] = plylst_song_map[col].astype('int')
    plylst_song_map[col1] = plylst_song_map[col1].astype('int')

    # unnest 객체 제거
    del plylst_song_map_unnest
    return plylst_song_map

In [23]:
train_df = pd.concat([train, val_q_df])
train_df

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[힐링, 휴식, 밤, 새벽]",147668,To. 힘들고 지친 분들에게,"[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27.000
1,[팝],50422,130807-7,"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11.000
2,[뉴에이지],116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50.000
3,"[하드락, 록스피릿, 댄스]",55076,당신을 하얗게 불태울 곡들,"[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25.000
4,"[힐링, 휴식, 기분전환]",125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50.000
...,...,...,...,...,...,...
23010,[],132039,이것만 알아도 브릿팝 안다는 소리듣는다.,"[8778, 5725, 234307, 64368, 691112, 581349, 40...",232,2014-03-10 19:08:16.000
23011,[],21475,make015,"[470489, 72225, 387567, 328452, 517417, 690761...",0,2016-05-16 15:34:18.000
23012,"[힘들때, 지칠때, 힘내]",81196,"힘든 걸 알아, 말해주고 싶었어요","[448250, 131741, 531820, 616260, 639943, 13129...",13,2017-12-20 16:32:07.000
23013,[],151600,♥Christmas Music♥,[],8,2015-12-23 09:38:19.000


In [24]:
train_df = plylist_song_map(train_df, 'id','songs')
train_df

Unnamed: 0,id,songs
0,147668,663185
1,147668,649626
2,147668,6855
3,147668,188486
4,147668,348451
...,...,...
4653524,81196,22991
4653525,7706,243082
4653526,7706,383812
4653527,7706,218279


## 전처리

In [25]:
play_source = [(train_df['id'][i], train_df['songs'][i]) for i in range(train_df.shape[0])]
play_source

[(147668, 663185),
 (147668, 649626),
 (147668, 6855),
 (147668, 188486),
 (147668, 348451),
 (147668, 169945),
 (147668, 512599),
 (147668, 532114),
 (147668, 454528),
 (147668, 418935),
 (147668, 124485),
 (147668, 517372),
 (147668, 549950),
 (147668, 540588),
 (147668, 500931),
 (147668, 233641),
 (147668, 331055),
 (147668, 490266),
 (147668, 268515),
 (147668, 531820),
 (147668, 413762),
 (147668, 422713),
 (147668, 215080),
 (147668, 413189),
 (147668, 577903),
 (147668, 352228),
 (147668, 630395),
 (147668, 539109),
 (147668, 152475),
 (147668, 111865),
 (147668, 7460),
 (147668, 72432),
 (147668, 572480),
 (147668, 348092),
 (147668, 324208),
 (147668, 186039),
 (147668, 376140),
 (147668, 270269),
 (147668, 622615),
 (147668, 35001),
 (147668, 444706),
 (147668, 491303),
 (147668, 408698),
 (147668, 325979),
 (147668, 25538),
 (147668, 549392),
 (147668, 473514),
 (147668, 666814),
 (147668, 118223),
 (147668, 697100),
 (147668, 333034),
 (147668, 359279),
 (147668, 421124),


In [26]:
song_meta

Unnamed: 0,id,genre
0,0,GN0900
1,1,GN1600
2,2,GN0900
3,3,GN1100
4,4,GN1800
...,...,...
707984,707984,GN2000
707985,707985,GN0900
707986,707986,GN0100
707987,707987,GN1800


In [27]:
                        # 개별 곡에 대한 id
song_features_source = [(song_meta['id'][i],
                        # 여기서부터 곡별 feature
                        [song_meta['genre'][i]]) for i in range(song_meta.shape[0])]
                         # song_meta['issue_date'][i]]) for i in range(song_meta.shape[0])]
song_features_source

[(0, ['GN0900']),
 (1, ['GN1600']),
 (2, ['GN0900']),
 (3, ['GN1100']),
 (4, ['GN1800']),
 (5, ['GN1700']),
 (6, ['GN1600']),
 (7, ['GN1600']),
 (8, ['GN0300']),
 (9, ['GN0100']),
 (10, ['GN1200']),
 (11, ['GN1300']),
 (12, ['GN1700']),
 (13, ['GN1700']),
 (14, ['GN2700, GN1100']),
 (15, ['GN1600']),
 (16, ['GN0900']),
 (17, ['GN2500, GN0200']),
 (18, ['GN2800']),
 (19, ['GN0500, GN0800']),
 (20, ['GN1600']),
 (21, ['']),
 (22, ['GN0900']),
 (23, ['GN1600']),
 (24, ['GN1100']),
 (25, ['GN0300']),
 (26, ['GN0900']),
 (27, ['GN1900']),
 (28, ['GN0900']),
 (29, ['GN1500']),
 (30, ['GN2500, GN1500, GN0200']),
 (31, ['GN1800']),
 (32, ['GN1500']),
 (33, ['GN2600']),
 (34, ['GN1100']),
 (35, ['GN0500, GN0800']),
 (36, ['GN1500']),
 (37, ['GN1700']),
 (38, ['GN1700']),
 (39, ['GN1300']),
 (40, ['GN1000']),
 (41, ['GN1700']),
 (42, ['GN1000']),
 (43, ['GN1400']),
 (44, ['GN1000']),
 (45, ['GN2000']),
 (46, ['GN1000']),
 (47, ['GN0500, GN0300']),
 (48, ['GN0800']),
 (49, ['GN0100']),
 (50, ['GN

In [28]:
song_meta[song_meta.columns[1:]].values.flatten()

array(['GN0900', 'GN1600', 'GN0900', ..., 'GN0100', 'GN1800', 'GN0600'],
      dtype=object)

In [29]:
song_meta

Unnamed: 0,id,genre
0,0,GN0900
1,1,GN1600
2,2,GN0900
3,3,GN1100
4,4,GN1800
...,...,...
707984,707984,GN2000
707985,707985,GN0900
707986,707986,GN0100
707987,707987,GN1800


In [30]:
# Fit the user/item id and feature name mappings.
dataset = Dataset()
            # playlist id
dataset.fit(users=train_df['id'].unique(),
            # 노래 아이디
            items=song_meta['id'].unique(),
            item_features=song_meta[song_meta.columns[1:]].values.flatten()
            )

In [31]:
song_meta[song_meta.columns[1:]].values.flatten()

array(['GN0900', 'GN1600', 'GN0900', ..., 'GN0100', 'GN1800', 'GN0600'],
      dtype=object)

In [32]:
# Build an interaction matrix.
interactions, weights = dataset.build_interactions(play_source)
# Build a item features matrix out of an iterable of the form (item id, [list of feature names]) 
# or (item id, {feature name: feature weight}).
song_features = dataset.build_item_features(song_features_source)

In [33]:
# 115071(총 플레이리스트 개수) - 4606(곡 리스트가 없는 플레이리스트 개수)
interactions

<110465x707989 sparse matrix of type '<class 'numpy.int32'>'
	with 4653529 stored elements in COOrdinate format>

In [34]:
weights

<110465x707989 sparse matrix of type '<class 'numpy.float32'>'
	with 4653529 stored elements in COOrdinate format>

In [35]:
# song_meta에 있는 곡 개수 * feature 개수
song_features

<707989x708211 sparse matrix of type '<class 'numpy.float32'>'
	with 1415978 stored elements in Compressed Sparse Row format>

In [36]:
original_id2map = dataset.mapping()[0]
original_id2map

{147668: 0,
 50422: 1,
 116432: 2,
 55076: 3,
 125064: 4,
 5747: 5,
 95441: 6,
 36913: 7,
 60923: 8,
 41197: 9,
 43789: 10,
 23330: 11,
 28845: 12,
 3019: 13,
 34706: 14,
 20353: 15,
 97508: 16,
 20064: 17,
 89500: 18,
 35350: 19,
 6004: 20,
 109952: 21,
 89216: 22,
 43481: 23,
 124000: 24,
 38271: 25,
 42681: 26,
 42347: 27,
 101751: 28,
 94448: 29,
 89484: 30,
 102414: 31,
 61915: 32,
 44216: 33,
 8568: 34,
 76132: 35,
 49418: 36,
 67989: 37,
 122330: 38,
 132109: 39,
 8548: 40,
 23969: 41,
 139412: 42,
 95604: 43,
 11062: 44,
 130384: 45,
 65093: 46,
 105805: 47,
 21342: 48,
 117238: 49,
 77730: 50,
 150859: 51,
 52277: 52,
 127931: 53,
 32325: 54,
 74: 55,
 146788: 56,
 111144: 57,
 74414: 58,
 10777: 59,
 104242: 60,
 27544: 61,
 23100: 62,
 63949: 63,
 51971: 64,
 65928: 65,
 51847: 66,
 75485: 67,
 100754: 68,
 84879: 69,
 14509: 70,
 51386: 71,
 6289: 72,
 49028: 73,
 95226: 74,
 96578: 75,
 17421: 76,
 76748: 77,
 54721: 78,
 54898: 79,
 13673: 80,
 87046: 81,
 133367: 82,
 12

In [37]:
song_id2map = dataset.mapping()[2]
song_id2map

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 11: 11,
 12: 12,
 13: 13,
 14: 14,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 41,
 42: 42,
 43: 43,
 44: 44,
 45: 45,
 46: 46,
 47: 47,
 48: 48,
 49: 49,
 50: 50,
 51: 51,
 52: 52,
 53: 53,
 54: 54,
 55: 55,
 56: 56,
 57: 57,
 58: 58,
 59: 59,
 60: 60,
 61: 61,
 62: 62,
 63: 63,
 64: 64,
 65: 65,
 66: 66,
 67: 67,
 68: 68,
 69: 69,
 70: 70,
 71: 71,
 72: 72,
 73: 73,
 74: 74,
 75: 75,
 76: 76,
 77: 77,
 78: 78,
 79: 79,
 80: 80,
 81: 81,
 82: 82,
 83: 83,
 84: 84,
 85: 85,
 86: 86,
 87: 87,
 88: 88,
 89: 89,
 90: 90,
 91: 91,
 92: 92,
 93: 93,
 94: 94,
 95: 95,
 96: 96,
 97: 97,
 98: 98,
 99: 99,
 100: 100,
 101: 101,
 102: 102,
 103: 103,
 104: 104,
 105: 105,
 106: 106,
 107: 107,
 108: 108,
 109: 109,
 110: 110,

In [38]:
# Save
# mmwrite('interactions(장르)+POP.mtx', interactions)
# mmwrite('item_features(장르)+POP.mtx', song_features)
# mmwrite('weights(장르)+POP.mtx', weights)

## 모델 학습

In [36]:
# Set the number of threads; you can increase this
# if you have more physical cores available.
NUM_THREADS = 1000
NUM_COMPONENTS = 100
NUM_EPOCHS = 200
ITEM_ALPHA = 1e-6

# Let's fit a WARP model: these generally have the best performance.
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

model = model.fit(interactions,
                  item_features=song_features,
                  # sample_weight=weights,
                  epochs=NUM_EPOCHS,
                  num_threads=NUM_THREADS,
                  verbose=True)

Epoch: 100%|███████████████████████████████████████████████████████████████████████| 200/200 [1:26:15<00:00, 25.88s/it]


## 개별 추천

In [46]:
plylist_mapid = original_id2map[88612]
plylist_mapid

92058

In [47]:
def remove_seen(seen, l):
    seen = set(seen)
    return [x for x in l if not (x in seen)]


def sample_recommendation(model, interactions, user_id, top_k):
    
    labels = song_meta.id.values
    n_users, n_items = interactions.shape
    plylist_mapid = original_id2map[user_id]
    scores = model.predict(plylist_mapid, np.arange(n_items),song_features) #,user_features)
    rec_result = labels[np.argsort(-scores)]
    rec_result = remove_seen(val_q_df[val_q_df['id']==user_id]['songs'].values[0],rec_result)[:top_k]
    
    return rec_result

In [48]:
sample_reclist = sample_recommendation(model, interactions, 88612, 100)
sample_reclist

[675593,
 175720,
 279922,
 23590,
 4984,
 553846,
 381874,
 432499,
 318046,
 72002,
 113160,
 628572,
 111563,
 643692,
 430508,
 14637,
 460823,
 87803,
 678161,
 481160,
 507711,
 517472,
 258646,
 668512,
 662266,
 677277,
 24553,
 595295,
 316305,
 135506,
 333027,
 415210,
 553430,
 85683,
 567438,
 168867,
 81523,
 632699,
 232263,
 450760,
 477989,
 645498,
 248095,
 457552,
 80801,
 353562,
 418694,
 225461,
 451803,
 635425,
 168031,
 435501,
 635450,
 146584,
 652045,
 208345,
 596058,
 213364,
 411150,
 99193,
 74920,
 98832,
 457390,
 581888,
 339843,
 580349,
 239673,
 656843,
 191620,
 390934,
 294743,
 106500,
 532600,
 124705,
 108013,
 8749,
 56074,
 145026,
 295130,
 143345,
 369439,
 427174,
 511562,
 572392,
 429377,
 106559,
 415389,
 343835,
 168528,
 227306,
 217091,
 274556,
 477206,
 397222,
 488946,
 535791,
 449321,
 66416,
 511298,
 146668]

In [49]:
# test 88612 playlist id 입력 데이터
q_df = meta[meta['song_id'].isin(val_q_df.iloc[3]['songs'])].set_index('song_id')
q_df.loc[val_q_df.iloc[3]['songs']].drop_duplicates(['song_name'],keep='first')

Unnamed: 0_level_0,gnr_code,gnr_name,artist_name_basket,song_name
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
207390,GN0900,POP,['Lana Del Rey'],Summertime Sadness
112997,GN0900,POP,['Ed Sheeran'],Supermarket Flowers
391057,GN0900,POP,['Quinn XCII'],Always Been You
138483,GN1300,R&B/Soul,['NIKI'],I Like U
655745,GN0900,POP,['Baby Ariel'],Perf
684639,GN1200,랩/힙합,['XXXTENTACION'],Moonlight
350034,GN0900,POP,['Bulow'],You & Jennifer
331171,GN1300,R&B/Soul,"['88rising', 'Joji', 'BlocBoy JB']",Peach Jam
23381,GN1300,R&B/Soul,"['88rising', 'Rich Brian']",History
391137,GN1300,R&B/Soul,"['88rising', 'AUGUST 08', 'NIKI']",Poolside Manor


In [50]:
# 추천 결과
rec_df = meta[meta['song_id'].isin(sample_reclist)].set_index('song_id')
rec_df.loc[sample_reclist].drop_duplicates(['song_name'],keep='first')

Unnamed: 0_level_0,gnr_code,gnr_name,artist_name_basket,song_name
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
675593,GN1300,R&B/Soul,"['Khalid', 'Disclosure']",Talk
175720,GN0900,POP,"['Lauv', 'Troye Sivan']",i`m so tired...
279922,GN1100,일렉트로니카,['HONNE'],Day 1 ◑
23590,GN1300,R&B/Soul,['Kiana Lede'],EX
4984,GN1300,R&B/Soul,['Pink Sweat$'],Honesty
...,...,...,...,...
535791,GN0900,POP,['The 1975'],Sincerity Is Scary
449321,GN0900,POP,['Coldplay'],Orphans
66416,GN0900,POP,['Abir'],Tango
511298,GN1300,R&B/Soul,['The Weeknd'],Blinding Lights


In [51]:
# test 정답 데이터
a_df = meta[meta['song_id'].isin(val_a_df.iloc[3]['songs'])].set_index('song_id')
a_df.loc[val_a_df.iloc[3]['songs']].drop_duplicates(['song_name'],keep='first')

Unnamed: 0_level_0,gnr_code,gnr_name,artist_name_basket,song_name
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
542751,GN1300,R&B/Soul,['Kehlani'],Jealous (Feat. Lexii Alijai)
488425,GN0900,POP,['Selena Gomez'],Sober
1839,GN0900,POP,['Birdy'],Keeping Your Head Up
447058,GN1300,R&B/Soul,['Kehlani'],Personal
688285,GN1300,R&B/Soul,['Kehlani'],Honey
154095,GN0900,POP,"['Selena Gomez', 'Marshmello']",Wolves
599409,GN0900,POP,['Freya Ridings'],Lost Without You
149760,GN1300,R&B/Soul,"['SZA', 'Calvin Harris']",The Weekend (Funk Wav Remix)
315393,GN1200,랩/힙합,['XXXTENTACION'],SAD!
596055,GN0900,POP,"['Vax', 'Sorana']",Bubble Gum


In [52]:
rec_dict = {}
rec_dict[val_a_df.iloc[3].id] = sample_reclist
rec_dict

{88612: [675593,
  175720,
  279922,
  23590,
  4984,
  553846,
  381874,
  432499,
  318046,
  72002,
  113160,
  628572,
  111563,
  643692,
  430508,
  14637,
  460823,
  87803,
  678161,
  481160,
  507711,
  517472,
  258646,
  668512,
  662266,
  677277,
  24553,
  595295,
  316305,
  135506,
  333027,
  415210,
  553430,
  85683,
  567438,
  168867,
  81523,
  632699,
  232263,
  450760,
  477989,
  645498,
  248095,
  457552,
  80801,
  353562,
  418694,
  225461,
  451803,
  635425,
  168031,
  435501,
  635450,
  146584,
  652045,
  208345,
  596058,
  213364,
  411150,
  99193,
  74920,
  98832,
  457390,
  581888,
  339843,
  580349,
  239673,
  656843,
  191620,
  390934,
  294743,
  106500,
  532600,
  124705,
  108013,
  8749,
  56074,
  145026,
  295130,
  143345,
  369439,
  427174,
  511562,
  572392,
  429377,
  106559,
  415389,
  343835,
  168528,
  227306,
  217091,
  274556,
  477206,
  397222,
  488946,
  535791,
  449321,
  66416,
  511298,
  146668]}

In [53]:
answer = {}
answer[val_a_df.iloc[3].id] = val_a_df.iloc[3].songs
answer

{88612: [542751,
  488425,
  1839,
  447058,
  688285,
  154095,
  599409,
  149760,
  315393,
  596055,
  23994,
  660956,
  17002,
  25696,
  233461,
  66416,
  532346,
  675773,
  339284,
  67521,
  78958,
  80801,
  639505,
  353562,
  4984,
  4706,
  510395]}

In [54]:
evaluate_func = evaluate(recs=rec_dict, gt = answer)
evaluate_func._evaluate()

MAP@100: 0.012980672677014983
NDCG@100: 0.10511829972969745
EntDiv@100: 4.605170185988082


## 전체 추천

In [55]:
def df2dict(df):
    dict_ = {}
    for i in df['id']:
        for j in df[df['id']==i].songs:
            dict_[i] = j
    return dict_

In [56]:
pop_rec = pd.read_json("pop_result.json")
pop_rec

Unnamed: 0,id,songs,tags
0,14464,"[144663, 116573, 357367, 366786, 654757, 13314...","[기분전환, 감성, 휴식, 발라드, 잔잔한, 드라이브, 힐링, 사랑, 새벽, 밤]"
1,77691,"[144663, 116573, 357367, 366786, 654757, 13314...","[기분전환, 감성, 휴식, 발라드, 잔잔한, 드라이브, 힐링, 사랑, 새벽, 밤]"
2,13968,"[116573, 357367, 366786, 654757, 133143, 67511...","[기분전환, 감성, 휴식, 발라드, 잔잔한, 드라이브, 힐링, 사랑, 새벽, 밤]"
3,87849,"[144663, 116573, 357367, 366786, 654757, 13314...","[기분전환, 감성, 휴식, 발라드, 잔잔한, 드라이브, 힐링, 사랑, 새벽, 밤]"
4,18778,"[144663, 116573, 357367, 366786, 654757, 13314...","[기분전환, 감성, 휴식, 발라드, 잔잔한, 드라이브, 힐링, 사랑, 새벽, 밤]"
...,...,...,...
23010,150006,"[144663, 116573, 357367, 366786, 654757, 13314...","[기분전환, 감성, 휴식, 발라드, 드라이브, 힐링, 사랑, 새벽, 밤, 카페]"
23011,22484,"[144663, 116573, 357367, 366786, 654757, 13314...","[기분전환, 감성, 휴식, 발라드, 잔잔한, 드라이브, 힐링, 사랑, 새벽, 밤]"
23012,139148,"[144663, 116573, 357367, 366786, 654757, 13314...","[기분전환, 감성, 휴식, 발라드, 잔잔한, 드라이브, 힐링, 사랑, 새벽, 밤]"
23013,60897,"[144663, 116573, 357367, 366786, 654757, 13314...","[기분전환, 감성, 휴식, 발라드, 잔잔한, 드라이브, 힐링, 사랑, 새벽, 밤]"


In [57]:
# pop_rec dictionary 변환
pop_dict = df2dict(pop_rec)
pop_dict

{14464: [144663,
  116573,
  357367,
  366786,
  654757,
  133143,
  675115,
  349492,
  463173,
  396828,
  42155,
  461341,
  174749,
  701557,
  610933,
  520093,
  13281,
  418935,
  449244,
  650494,
  680366,
  485155,
  549178,
  11657,
  169984,
  523521,
  648628,
  422915,
  187047,
  547967,
  422077,
  625875,
  350309,
  215411,
  442014,
  132994,
  427724,
  300087,
  627363,
  581799,
  253755,
  668128,
  339802,
  348200,
  663256,
  26083,
  505036,
  643628,
  582252,
  448116,
  37748,
  199262,
  235773,
  339124,
  140867,
  341513,
  68348,
  407828,
  209135,
  209993,
  493762,
  105140,
  487911,
  509998,
  531820,
  672550,
  27469,
  157055,
  232874,
  152422,
  75842,
  473514,
  519391,
  377243,
  224921,
  295250,
  446812,
  678762,
  351342,
  464051,
  246531,
  146989,
  117595,
  15318,
  205179,
  108004,
  645489,
  152475,
  302646,
  590012,
  95323,
  13198,
  343974,
  236393,
  333595,
  6546,
  88503,
  443914,
  459256,
  640657],
 77691

In [58]:
# 정답 데이터 딕셔너리 변환
# 정답 dictionary
answer_dict = df2dict(val_a_df)
answer_dict

{75567: [502397,
  568603,
  197626,
  345555,
  68157,
  116877,
  280153,
  565075,
  294480,
  115170],
 47308: [194051,
  511258,
  595181,
  204818,
  91059,
  453055,
  248043,
  169945,
  512599,
  61159,
  555305,
  445984,
  5970,
  534818,
  339802,
  74131,
  357510,
  478754,
  555338,
  376360,
  75842,
  368069,
  669120,
  621690,
  667394,
  545089,
  134523,
  630395,
  347038,
  624607,
  705445,
  75971,
  520755,
  316742],
 45679: [379787, 677020, 337541, 247563],
 88612: [542751,
  488425,
  1839,
  447058,
  688285,
  154095,
  599409,
  149760,
  315393,
  596055,
  23994,
  660956,
  17002,
  25696,
  233461,
  66416,
  532346,
  675773,
  339284,
  67521,
  78958,
  80801,
  639505,
  353562,
  4984,
  4706,
  510395],
 117860: [188132,
  304340,
  567886,
  367551,
  641531,
  506478,
  57784,
  194226,
  166707,
  163974,
  262651,
  513572,
  679061,
  428990,
  61972,
  650100,
  270141,
  345455,
  518555,
  150249,
  27756],
 135083: [318019,
  587299,
 

In [59]:
# test input dictionary
q_dict = df2dict(val_q_df)
q_dict

{75567: [16641,
  93976,
  681892,
  481989,
  12662,
  430071,
  104363,
  158296,
  4298,
  642204],
 47308: [],
 45679: [690947, 156049, 18368, 135346],
 88612: [207390,
  112997,
  391057,
  138483,
  655745,
  684639,
  350034,
  331171,
  23381,
  391137,
  541764,
  121586,
  323722,
  185196,
  233927,
  552464,
  243579,
  680360,
  477501,
  506712,
  177414,
  256658,
  487867,
  67371,
  426365,
  47202],
 117860: [544728,
  572867,
  228845,
  48898,
  364551,
  226863,
  525212,
  452533,
  12917,
  557720,
  695964,
  500767,
  654750,
  85940,
  648201,
  508317,
  345689,
  417889,
  543697,
  620246,
  184399],
 135083: [284738,
  449446,
  24087,
  191915,
  231320,
  575649,
  579817,
  257119,
  581276,
  75469,
  69648],
 122531: [],
 141475: [],
 7547: [80656,
  487721,
  243178,
  127008,
  282125,
  355223,
  26083,
  130199,
  236393,
  164508,
  527478,
  704752,
  77781,
  352879,
  201145,
  205065,
  462965,
  523439,
  367459,
  201366,
  412767,
  344327

In [60]:
def recommend(q_dict, model, interactions, answer_dict, top_k):
    total_rec_dict = {}
    cold_start = 0
    labels = song_meta.id.values
    n_users, n_items = interactions.shape
    for k in tqdm(q_dict.keys()):
        if k in original_id2map:
            plylist_id = original_id2map[k]
            scores = model.predict(plylist_id, np.arange(n_items), song_features, num_threads=5000) #,user_features)
            rec_result = labels[np.argsort(-scores)]
            rec_result = remove_seen(val_q_df[val_q_df['id']==k]['songs'].values[0],rec_result)[:top_k]
            
        # 플레이리스트 안에 구성 곡이 없는 경우 popularity기반 추천
        else:
            cold_start += 1
            rec_result = pop_dict.get(k)
        total_rec_dict[k] = rec_result
    evaluate_func = evaluate(recs=total_rec_dict, gt = answer_dict)
    print(evaluate_func._evaluate())
    print('cold_start_user: ', cold_start)
    return total_rec_dict

In [61]:
total_rec_dict = recommend(q_dict, model, interactions, answer_dict, 100)

100%|██████████████████████████████████████████████████████████████████████████| 23015/23015 [1:16:08<00:00,  5.04it/s]


MAP@100: 0.0423412181797174
NDCG@100: 0.12011680609620567
EntDiv@100: 9.22052824557922
None
cold_start_user:  4606


In [62]:
import pickle

# save data
with open('LightFM(장르) + POP 추천.pickle','wb') as fw:
    pickle.dump(total_rec_dict, fw)

In [63]:
# load data
with open('LightFM(장르) + POP 추천.pickle', 'rb') as fr:
    user_loaded = pickle.load(fr)

In [56]:
# 모델 저장
with open('LightFM(장르) + POP rec_model.pickle', 'wb') as fle:
    pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)