<a href="https://colab.research.google.com/github/ivoryRabbit/kakao_arena/blob/master/2_Preprocessing(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, sys
from google.colab import drive
drive.mount('/content/drive')

# Link the my_env directory stored on Drive to a short local path and put that
# path first on sys.path.
my_path = '/content/notebooks'
try:
  os.symlink('/content/drive/My Drive/Colab Notebooks/my_env', my_path)
except FileExistsError:
  # Only a pre-existing link means "already linked"; any other OS error
  # (permissions, missing parent directory, ...) is allowed to surface.
  print('\nAlready linked...')
sys.path.insert(0, my_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Already linked...


In [None]:
import re, glob, copy
import json
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from khaiii import KhaiiiApi
import sentencepiece as spm

In [None]:
display(glob.glob('drive/My Drive/kakao_arena/data/*'))
base_path = 'drive/My Drive/kakao_arena/'

['drive/My Drive/kakao_arena/data/val.json',
 'drive/My Drive/kakao_arena/data/test.json',
 'drive/My Drive/kakao_arena/data/genre_gn_all.json',
 'drive/My Drive/kakao_arena/data/train.json',
 'drive/My Drive/kakao_arena/data/song_meta.json',
 'drive/My Drive/kakao_arena/data/genre_v1.json',
 'drive/My Drive/kakao_arena/data/test_v1.json',
 'drive/My Drive/kakao_arena/data/val_v1.json',
 'drive/My Drive/kakao_arena/data/train_v1.json',
 'drive/My Drive/kakao_arena/data/song_meta_v1.json',
 'drive/My Drive/kakao_arena/data/val_v2.json',
 'drive/My Drive/kakao_arena/data/test_v2.json',
 'drive/My Drive/kakao_arena/data/train_v2.json',
 'drive/My Drive/kakao_arena/data/song_meta_v2.json']

In [None]:
train = pd.read_json(base_path + 'data/train.json')
valid = pd.read_json(base_path + 'data/val.json')
test = pd.read_json(base_path + 'data/test.json')
genre = pd.read_json(base_path + 'data/genre_gn_all.json', typ = 'series')
genre = pd.DataFrame(genre, columns = ['gnr_name']).reset_index().rename(columns = {'index' : 'gnr_code'})
song_meta = pd.read_json(base_path + 'data/song_meta.json', typ = 'frame')

### 1.1 플레이리스트 업데이트 날짜 전처리

In [None]:
def date_format(df):
  """Return a copy of `df` whose `updt_date` column is an integer in YYYYMMDD form.

  The column is parsed with `pd.to_datetime`, formatted as '%Y%m%d' and cast
  to int; the input frame is not modified.
  """
  parsed = pd.to_datetime(df.updt_date)
  as_yyyymmdd = parsed.dt.strftime('%Y%m%d').astype(int)
  return df.assign(updt_date = as_yyyymmdd)

In [None]:
train, valid, test = map(date_format, (train, valid, test))

In [None]:
train.head(3)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,20131219
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,20141202
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,20170828


### 1.2 플레이리스트 자연어 처리

In [None]:
def word_ppc(s):
  """Normalise a Series of free text and return the cleaned Series.

  Steps, in order: lowercase, then replace with a single space (a) stand-alone
  Hangul consonants in the range ㄱ-ㅎ, (b) every character that is neither a
  word character nor whitespace, (c) underscores, and (d) runs of two or more
  spaces; finally strip leading/trailing whitespace.

  Returns None when `s` is None.
  """
  if s is None:
    return None
  substitutions = [
      (r'[ㄱ-ㅎ]', r' '),    # lone consonants
      (r'[^\w\s]', r' '),   # special characters
      (r'[_]', r' '),       # underscores (kept by \w above)
      (r'[ ]{2,}', r' '),   # collapse repeated spaces
  ]
  cleaned = s.str.lower()
  for pattern, replacement in substitutions:
    cleaned = cleaned.str.replace(pat = pattern, repl = replacement, regex = True)
  return cleaned.str.strip()

In [None]:
train, valid, test = map(lambda df: df.assign(plylst_title = lambda x: word_ppc(x.plylst_title)), (train, valid, test))
train.head(3)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,20131219
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,20141202
2,"[까페, 잔잔한]",76951,편하게 잔잔하게 들을 수 있는 곡,"[83116, 276692, 166267, 186301, 354465, 256598...",17,20170828


In [None]:
# Write the cleaned playlists next to the raw data. Paths are built from
# base_path so the data directory is defined in exactly one place.
train.to_json(base_path + 'data/train_v1.json', orient = 'records', force_ascii = False)
valid.to_json(base_path + 'data/val_v1.json', orient = 'records', force_ascii = False)
test.to_json(base_path + 'data/test_v1.json', orient = 'records', force_ascii = False)

### 2.1 장르 국내/국외

In [None]:
def add_oversea(df):
  """Return a copy of `df` with a `gnr_group` column of '국내', '국외' or ''.

  A row is classified first by characters 2-3 of `gnr_code`; when that pair is
  in neither code list, by whether `gnr_name` contains '국내' or '국외'
  (checked in that order). Rows matching nothing get an empty string.
  """
  domestic_codes = ['01', '02', '03', '04', '05', '06', '07', '08', '24', '25', '26']
  overseas_codes = ['09', '10', '11', '12', '13', '14', '19', '20']

  def classify(row):
    major = row.gnr_code[2:4]
    if major in domestic_codes:
      return '국내'
    if major in overseas_codes:
      return '국외'
    # Fall back to the label embedded in the genre name, if any.
    for label in ('국내', '국외'):
      if label in row.gnr_name:
        return label
    return ''

  return df.assign(gnr_group = lambda frame: frame.apply(classify, axis = 1))

def remove_oversea(df):
  """Return a copy of `df` with '국내' and '국외' removed from `gnr_name`.

  The pattern is an alternation, so it must be applied as a regular
  expression. `regex = True` is passed explicitly because pandas >= 2.0 treats
  the pattern as a literal string by default, which would remove nothing.
  """
  return df.assign(gnr_name = lambda x: x.gnr_name.str.replace('국내|국외', '', regex = True))

In [None]:
genre = add_oversea(genre)
genre = remove_oversea(genre)

### 2.2 장르 자연어 처리

In [None]:
def gnr_ppc(df):
    """Clean the `gnr_name` column of `df` in place and return `df`.

    Steps, in order:
      1. lowercase and replace '/', '|', "'" and '-' with a space;
      2. replace every '세부장르전체' entry with the name of the row whose index
         label is one less;
      3. apply a fixed sequence of literal substitutions: English genre words
         to Korean, genre words padded with spaces, and two-digit decades
         suffixed with '년대';
      4. strip leading/trailing whitespace.
    """
    lowered = df.gnr_name.str.lower()
    df['gnr_name'] = lowered.str.replace(pat = r"[/|'|-]", repl = r' ', regex = True)

    # Placeholder rows take the name of the row just before them (label - 1).
    placeholder_idx = df[df.gnr_name == '세부장르전체'].index
    df.loc[placeholder_idx, 'gnr_name'] = df.loc[placeholder_idx - 1, 'gnr_name'].values

    # Order matters: translations first, then padding, then decade suffixes.
    swaps = [('pop', '팝'), ('newage', '뉴에이지'), ('jazz', '재즈'), ('릴렉싱&힐링', '릴렉싱 힐링')]
    swaps += [(gnr, ' ' + gnr + ' ')
              for gnr in ['동요', '동화', '팝', '록', '메탈', '랩', '트로트', '가요', '재즈', '클래식', '힙합']]
    swaps += [(decade, decade + '년대')
              for decade in ['50', '60', '70', '80', '90', '00', '10']]
    for old, new in swaps:
        df['gnr_name'] = df.gnr_name.str.replace(old, new)

    df['gnr_name'] = df.gnr_name.str.strip()
    return df

def gnr_tokenizing(df):
    """Tokenise `gnr_name` with a SentencePiece model and store the pieces in `gnr_basket`.

    Writes every genre name to 'genre_names.txt' (one per line), trains a
    SentencePiece model from that file (prefix 'genre', vocab size 150, model
    type 'word'), encodes each name into pieces and strips the '▁' character
    from every piece. `df` is modified in place and returned.
    """
    corpus = df.gnr_name.to_list()
    with open('genre_names.txt', 'w', encoding = 'utf-8') as handle:
        handle.writelines(line + '\n' for line in corpus)

    spm.SentencePieceTrainer.Train('--input=genre_names.txt --model_prefix=genre --vocab_size=150 --model_type=word')
    processor = spm.SentencePieceProcessor(model_file = 'genre.model')

    baskets = []
    for line in tqdm(corpus):
        pieces = processor.EncodeAsPieces(line)
        baskets.append([piece.replace('▁', '') for piece in pieces])
    df['gnr_basket'] = baskets
    return df

In [None]:
genre = gnr_ppc(genre)
genre = gnr_tokenizing(genre)
genre = genre.assign(gnr_basket = lambda x: x.apply(lambda y: y.gnr_basket + [y.gnr_group] if y.gnr_group != '' else y.gnr_basket, axis = 1))
genre.head(3)

HBox(children=(FloatProgress(value=0.0, max=254.0), HTML(value='')))




Unnamed: 0,gnr_code,gnr_name,gnr_group,gnr_basket
0,GN0100,발라드,국내,"[발라드, 국내]"
1,GN0101,발라드,국내,"[발라드, 국내]"
2,GN0102,80년대,국내,"[80년대, 국내]"


In [None]:
# Path built from base_path so the data directory is defined in one place.
genre.to_json(base_path + 'data/genre_v1.json', orient = 'records', force_ascii = False)

### 3.1 곡 메타데이터 날짜 전처리

- 곡 발매 일자보다 플레이리스트 업데이트 일자가 빠른 경우
- 곡 발매 일자가 0인 경우
- 전처리 후에도 곡 발매 일자가 여전히 0인 경우가 있는데, 이들 모두 플레이리스트에 등장하지 않는 곡들이므로 더이상 고려하지 않음

In [None]:
song_meta = pd.read_json(base_path + 'data/song_meta.json', typ = 'frame')

In [None]:
def replace_date(df):
  """Repair `issue_date` in `df` using playlist update dates and album-level dates.

  Reads the notebook-level `train` frame (columns `songs`, `updt_date`).
  `df` is modified in place and returned.
  """
  # Earliest updt_date among the training playlists that contain each song.
  song_updt_date = train[['songs', 'updt_date']].explode('songs').groupby('songs')['updt_date'].agg('min')
  # Rows whose issue_date is 0 or later than that earliest playlist date.
  # The join aligns on df's index -- NOTE(review): assumes the index equals the song id; confirm.
  # dropna() discards rows with any missing value, which includes songs that never
  # appear in train (no updt_date after the join).
  anormal_date_idx = df.join(song_updt_date).query('issue_date > updt_date | issue_date == 0').dropna().index
  df.loc[anormal_date_idx, 'issue_date'] = song_updt_date.loc[anormal_date_idx]

  # Per album: 0 when the album's largest date is 0, otherwise its smallest non-zero date.
  album_date = df.groupby('album_id')['issue_date'].agg(lambda x: 0 if max(x) == 0 else min([y for y in x if y != 0]))
  album_date_dic = album_date.to_dict()
  # Every song takes its album's date.
  df['issue_date'] = df.album_id.map(album_date_dic)
  # Floor at 19400000; dates that are still 0 at this point are raised to the floor too.
  df.loc[df.issue_date < 19400000, 'issue_date'] = 19400000
  return df

In [None]:
song_meta = replace_date(song_meta)
song_meta.head(3)

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2


### 3.2 곡 메타데이터 장르 전처리

- song_gn_gnr_basket에서 비정상적인 코드 식별
  - 코드: GN9000
  - 특징: 
    - 다른 장르코드와 함께 있지는 않음
    - 다양한 장르의 곡에 걸쳐서 나타남
- song_gn_dtl_gnr_basket에는 문제 없음

In [None]:
genre_in_meta = song_meta.song_gn_gnr_basket.explode().unique()
genre_in_meta[~np.isin(genre_in_meta, genre.gnr_code.values)]

array([nan, 'GN9000'], dtype=object)

In [None]:
anormal_gnr_idx = song_meta[song_meta.song_gn_gnr_basket.astype('str').str.contains('GN9000')].index
song_meta.loc[anormal_gnr_idx].head(3)

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
262,[],20150318,노년의 건강을 지켜주는 음악 처방전 (부제 : 휴식 때 들으면 좋은 음악친구),2309524,[726909],Bio Sound `Breathing & Big Bell`,[GN9000],[차병원],262
916,[],20170825,크리스마스의 기적,10090652,[750416],Springtime In December,[GN9000],[Larry Warren],916
1074,[],20131128,Open It Up,10008901,[1221466],Open It Up,[GN9000],[Tchengiz],1074


In [None]:
song_meta.loc[anormal_gnr_idx, 'song_gn_gnr_basket'] = song_meta.loc[anormal_gnr_idx, 'song_gn_gnr_basket'].apply(lambda x: [])

empty_gnr_idx = song_meta[song_meta.song_gn_gnr_basket.apply(len) == 0].index
empty_gnr = song_meta.loc[empty_gnr_idx]

print(empty_gnr.shape)
empty_gnr.head(3)

(2893, 9)


Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
21,[],20200331,WHY,10410508,[417985],WHY,[],[4minute],21
262,[],20150318,노년의 건강을 지켜주는 음악 처방전 (부제 : 휴식 때 들으면 좋은 음악친구),2309524,[726909],Bio Sound `Breathing & Big Bell`,[],[차병원],262
399,[],20200106,뽀로로 겨울 동요,10372098,[1703695],초코 초코 핫초코,[],[아이코닉스 (ICONIX)],399


In [None]:
extract_words = lambda df: df.album_name + ' ' + df.song_name + ' ' + df.artist_name_basket.apply(' '.join)
words = extract_words(empty_gnr)

In [None]:
words = word_ppc(words)
words.head(3)

21                                       why why 4minute
262    노년의 건강을 지켜주는 음악 처방전 부제 휴식 때 들으면 좋은 음악친구 bio so...
399                     뽀로로 겨울 동요 초코 초코 핫초코 아이코닉스 iconix
dtype: object

In [None]:
key_words = '|'.join(
    ['어린이|아기|아가|태교|자장가|동요|귤|산모|kid|child',
     '노년|테라피|휴식|명상|숙면|건강|새소리|자살|요가|명상|힐링|바람|asmr', 
     '크리스마스|캐롤|christmas|carol|징글벨|xmas|루돌프|rudolph|santa', 
     'ost|시네마|영화|toystory|toy story|disney|낭만닥터', # ost
     '클래식|베토벤|쇼팽|piano|체르니|악보|orchestra|major|minor|전주곡', 
     '뉴에이지|피아노|이루마|닐케이|선샤인|오르골|마음|여행|아름|고요|분위기|기분|자연|계절|추억|편안|빗소리|상쾌|불안|성찰|바람', 
     '재즈|jazz',
     '뮤지컬', 
     '성인|가요|트롯|트로트|조용필|태진아', # 성인가요
     '흥부|놀부|궁중', # 국악
     '골방라이브|maria|모세|주님|그리스도', # CCM
     '스님|그리스도|예수|목탁|찬송가|성경', # 종교
     '댄스|dance|party|응원|ioah|이효리|생일|대학|대회|중계|뮤직섬', # 댄스
     'pop|마이클잭슨|michael jackson', # 팝
     '팝|최유리', # 팝
     '지미 브라운|브라운 아이드 소울', # R&B'
     '브금|bgm|몽환|환상', # 브금
     '김필|정준일|스텔라장|박원|윤종신|박혜원|소유|한동근|에릭남|조관우|백지영|어반자카파|성시경|폴킴|임한별|길구봉구|테이|정승환|김장훈|김민종|엠씨더맥스|박경태', # 발라드
     'coldplay|노브레인|오월|호피폴라|혁오|넬|상상밴드|밴디지|엔플라잉|윤하|수호|새소년|rock', # 록
     '스윙스|식케이|딘딘|슬리피|블랙나인|딥플로우|보이비|딥플로우|노엘|noel|우원재|mc몽|기리보이|프라이머리|펀치|hash swan', # 힙합
     '어쿠스틱|acoustic|데이식스|윤딴딴|심규선|박새별|선우정아|스웨덴세탁소|크루셜스타|빈센트블루|백예린|장재인|봄여름가을겨울|브로콜리너마저|멜로망스', # 인디
     '아이돌|kard|방탄소년단|아스트로|펜타곤|서사무엘|여자아이들|비투비|강다니엘|2pm|브라운아이드걸스|여자친구|에버글로우|원더나인|핫펠트|뉴이스트|박봄|온리원오브|갓세븐|갓세븐|최강창민|4minute|에이프릴|헨리|exo|김동준|젝스키스|온앤오프|henry|에이티즈' # 아이돌
    ]
)
words[~words.str.contains(key_words, regex = True)].unique()[:10]

array(['tropical lullaby kalua lullaby the moonlighters',
       'open it up open it up tchengiz', '66 how are you 유노이아 eunoia',
       'festivals the collection streaming ver stuck on repeat little boots',
       'sing casey jones and other railroad songs choo choo charlie choo choo charlie and his family',
       'episode ii who feat chillin homie bully da ba tard oceanfromtheblue',
       'running running pihai ryan allyson chen yonien rgry',
       'only jesus new york sessions only jesus new york sessions casting crowns',
       'a night at the met and the future robin williams',
       'righteous wrath 7 the righteous wrath of an honorable man colin stetson'],
      dtype=object)

In [None]:
def genre_replace(df):
  """Append keyword-inferred genre codes to `song_gn_gnr_basket` and return `df`.

  Reads the notebook-level `words` Series (cleaned album/song/artist text of
  the songs whose genre basket is empty). For each (code, regex) pair, the
  code is appended to the basket of every row of `words` whose text matches
  the regex. `df` is modified in place.

  NOTE(review): assumes the index labels of `words` are row labels of `df`.
  """
  code_regex_pair = [
      ('GN0100', '김필|정준일|스텔라장|박원|윤종신|박혜원|소유|한동근|에릭남|조관우|백지영|어반자카파|성시경|폴킴|임한별|길구봉구|테이|정승환|김장훈|김민종|엠씨더맥스|박경태'),
      ('GN0200', '댄스|dance|party|응원|ioah|이효리|생일|대학|대회|중계|뮤직섬'),
      ('GN0300', '스윙스|식케이|딘딘|슬리피|블랙나인|딥플로우|보이비|딥플로우|노엘|noel|우원재|mc몽|기리보이|프라이머리|펀치|hash swan'),
      ('GN0400', '지미 브라운|브라운 아이드 소울'),
      ('GN0500', '어쿠스틱|acoustic|데이식스|윤딴딴|심규선|박새별|선우정아|스웨덴세탁소|크루셜스타|빈센트블루|백예린|장재인|봄여름가을겨울|브로콜리너마저|멜로망스|최유리'),
      ('GN0600', '노브레인|오월|호피폴라|혁오|넬|상상밴드|밴디지|엔플라잉|윤하|수호|새소년'),
      ('GN1000', 'rock|coldplay'),
      ('GN1500', 'ost|시네마|영화|toystory|toy story|disney|낭만닥터'),
      ('GN1600', '클래식|베토벤|쇼팽|piano|체르니|악보|orchestra|major|minor|전주곡'),
      ('GN1800', '뉴에이지|피아노|이루마|닐케이|선샤인|오르골|마음|여행|아름|고요|분위기|기분|추억|편안|빗소리|상쾌|불안|성찰'),
      ('GN1100', '브금|bgm|몽환|환상'),
      ('GN1700', '재즈|jazz'),
      ('GN2100', '골방라이브|maria|모세|주님|그리스도'),
      ('GN2200', '어린이|아기|아가|태교|자장가|동요|귤|산모|kid|child'),
      ('GN2300', '스님|그리스도|예수|목탁|찬송가|성경'),
      ('GN2400', '흥부|놀부|궁중'),
      ('GN2500', '아이돌|kard|방탄소년단|아스트로|펜타곤|서사무엘|여자아이들|비투비|강다니엘|2pm|브라운아이드걸스|여자친구|에버글로우|원더나인|핫펠트|뉴이스트|박봄|온리원오브|갓세븐|갓세븐|최강창민|4minute|에이프릴|헨리|exo|김동준|젝스키스|온앤오프|henry|에이티즈'),
      ('GN2800', '노년|테라피|휴식|명상|숙면|건강|새소리|자살|요가|명상|힐링|자연|계절|바람|asmr'),
      ('GN2900', '뮤지컬'),
      ('GN3000', '크리스마스|캐롤|christmas|carol|징글벨|xmas|루돌프|rudolph|santa'),
      ('GN0700', '성인|가요|트롯|트로트|조용필|태진아')
  ]
  for code, regex in tqdm(code_regex_pair):
    matched = words.str.contains(regex, regex = True, na = False)
    # Keep only rows where the mask is True. Taking `.index` of the mask itself
    # would select every row of `words` and tag each of them with every code.
    idx = matched[matched].index
    if idx.empty:
      continue
    df.loc[idx, 'song_gn_gnr_basket'] = df.loc[idx, 'song_gn_gnr_basket'].apply(lambda x: x + [code])
  return df

In [None]:
song_meta = genre_replace(song_meta)
song_meta.head(3)

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2


### 3.3 곡 메타데이터 아티스트 전처리

In [None]:
anormal_artist = song_meta.assign(
    artist_id_cnt = lambda x: x.artist_id_basket.apply(len),
    artist_cnt = lambda x: x.artist_name_basket.apply(len)
).query('artist_id_cnt != artist_cnt')

anormal_artist.head(3)

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id,artist_id_cnt,artist_cnt
6238,[GN0901],20010403,Liberty City Fla.,45928,[56936],I Met Her In Miami,[GN0900],"[Liberty, City]",6238,1,2
15100,[GN0901],20071214,아름다운 드라이브 Cinema & Musical,362455,[232053],Think Of Me,[GN0900],"[Los, Dioses]",15100,1,2
64162,"[GN1601, GN1602, GN1613]",20180216,50 Violin Masterworks,4442865,"[174433, 1543263, 1459364, 2240313]","Richter: Recomposed By Max Richter: Vivaldi, T...",[GN1600],"[Daniel Hope, Christian Badzura, Jane Berthe, ...",64162,4,5


In [None]:
def artist_replace(df):
  """Overwrite known-broken `artist_name_basket` values with corrected lists.

  Each entry pairs a pattern with a replacement list. The pattern is searched
  for in the string form of the basket as it was before any replacement, and
  every matching row has its basket replaced by the corrected list. `df` is
  modified in place and returned.
  """
  artist_replace_pair = [("'Liberty', 'City'", ['Liberty City']),
                         ("'Los', 'Dioses'", ['Los Dioses']),
                         ("'Cast', 'Of The Lodge'", ['Cast Of The Lodge']),
                         ("'Anna Palina', '', 'Draupner'", ['Anna Palina', 'Draupner']),
                         ("'A`Typisk', ''", ['A`Typisk']),
                         ("'The', 'Beat Junkies'", ['The Beat Junkies']),
                         ("'The Randy Watson Experience &', 'Bilal'", ['The Randy Watson Experience & Bilal']),
                         ("'Daniel Hope', 'Christian Badzura', 'Jane Berthe', 'Z&#252;rcher', 'Kammerorchester'", ['Daniel Hope', 'Christian Badzura', 'Jane Berthe', 'Kammerorchester']),
                         ("'Victoria De Los', 'Angeles, Orchestre De L`Opera De Rome, Giuseppe Morelli'", ['Victoria De Los Angeles & Orchestre De L`Opera De Rome & Giuseppe Morelli']),
                         ("'Henri Lanz', 'Skinny Williams', 'WILLIAM RAPPAPORT', ''", ['Henri Lanz', 'Skinny Williams', 'WILLIAM RAPPAPORT'])]
  # Stringify once, up front, so later patterns are matched against the original baskets.
  rendered = df.artist_name_basket.astype(str)
  for pattern, corrected in tqdm(artist_replace_pair):
    hit_idx = df[rendered.str.contains(pattern)].index
    df.loc[hit_idx, 'artist_name_basket'] = df.loc[hit_idx, 'artist_name_basket'].apply(lambda _: corrected)
  return df

In [None]:
song_meta = artist_replace(song_meta)
song_meta.head(3)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2


### 3.4 곡 메타데이터 결측 처리

In [None]:
print(song_meta.isna().sum(axis = 0))

song_gn_dtl_gnr_basket    0
issue_date                0
album_name                4
album_id                  0
artist_id_basket          0
song_name                 0
song_gn_gnr_basket        0
artist_name_basket        0
id                        0
dtype: int64


In [None]:
def fill_missing(df):
  """Replace missing `album_name` values with '' in place and return `df`."""
  album_names = df['album_name']
  df['album_name'] = album_names.fillna('')
  return df

In [None]:
song_meta = fill_missing(song_meta)

### 3.5 곡 메타데이터 자연어 처리

In [None]:
artist_id = song_meta.artist_id_basket.explode()
artist_name = song_meta.artist_name_basket.explode()

In [None]:
artist_name = artist_name.str.lower()
artist_dic = {i: n for i, n in zip(artist_id, artist_name)}

In [None]:
def song_ppc(df):
  """Clean the text columns of the song metadata in place and return `df`.

  `album_name` and `song_name` are passed through `word_ppc`;
  `artist_name_basket` is rebuilt by looking up every id of
  `artist_id_basket` in the notebook-level `artist_dic`.
  """
  for text_col in ('album_name', 'song_name'):
    df[text_col] = word_ppc(df[text_col])
  df['artist_name_basket'] = df.artist_id_basket.apply(
      lambda ids: [artist_dic[artist] for artist in ids]
  )
  return df

In [None]:
song_meta = song_ppc(song_meta)
song_meta.head(3)

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 7080 추억의 얄개시대 팝송베스트,2255639,[2727],feelings,[GN0900],[various artists],0
1,"[GN1601, GN1606]",20080421,bach partitas nos 2 3 4,376431,[29966],bach partita no 4 in d major bwv 828 ii allemande,[GN1600],[murray perahia],1
2,[GN0901],20180518,hit,4698747,[3361],solsbury hill remastered 2002,[GN0900],[peter gabriel],2


### 3.6 곡 메타데이터 장르 추가

In [None]:
def add_gnr_keyword(df):
  """Add a `song_gnr_keywords` column listing each song's distinct genre keywords.

  Reads the notebook-level `genre` frame and maps every code in
  `song_gn_dtl_gnr_basket` + `song_gn_gnr_basket` to its `gnr_basket`
  keywords. `df` is modified in place and returned.

  Keywords are de-duplicated and sorted, so the output no longer depends on
  set iteration order and is reproducible between runs.

  Raises:
    KeyError: if a song carries a genre code that is absent from `genre`.
  """
  genre_dic = dict(zip(genre.gnr_code, genre.gnr_basket))
  song_gnr_keywords = []
  for idx in tqdm(df.index):
    gnr_info = df.at[idx, 'song_gn_dtl_gnr_basket'] + df.at[idx, 'song_gn_gnr_basket']
    song_gnr_keywords.append(sorted({n for g in gnr_info for n in genre_dic[g]}))
  # Align explicitly on df's index: a default RangeIndex would misassign (or
  # yield NaN) whenever df is not indexed 0..n-1.
  df['song_gnr_keywords'] = pd.Series(song_gnr_keywords, index = df.index, dtype = object)
  return df

In [None]:
song_meta = add_gnr_keyword(song_meta)
song_meta.head(3)

HBox(children=(FloatProgress(value=0.0, max=707989.0), HTML(value='')))




Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id,song_gnr_keywords
0,[GN0901],20140512,불후의 명곡 7080 추억의 얄개시대 팝송베스트,2255639,[2727],feelings,[GN0900],[various artists],0,"[국외, 팝]"
1,"[GN1601, GN1606]",20080421,bach partitas nos 2 3 4,376431,[29966],bach partita no 4 in d major bwv 828 ii allemande,[GN1600],[murray perahia],1,"[독주곡, 클래식]"
2,[GN0901],20180518,hit,4698747,[3361],solsbury hill remastered 2002,[GN0900],[peter gabriel],2,"[국외, 팝]"


### 3.7 곡 메타데이터 선호도 및 인기도 추가

In [None]:
def add_like_cnt(df, playlists = None):
    """Return `df` merged with a normalised per-song `like_cnt` score.

    Each playlist's `like_cnt` is split evenly across its songs and summed per
    song; the sums are passed through log1p and divided by their maximum.
    Songs that appear in no playlist get 0.

    Args:
      df: song metadata with an `id` column.
      playlists: frame with `songs` (list of song ids) and `like_cnt` columns.
        Defaults to the notebook-level `train` frame.

    Returns:
      A new frame: `df` plus a `like_cnt` column. Only that column is
      zero-filled; missing values in other columns are left untouched.
    """
    if playlists is None:
        playlists = train
    like_cnt = playlists[['songs', 'like_cnt']].assign(like_cnt = lambda x: x.like_cnt / x.songs.apply(len))
    # Empty playlists explode to a NaN song and are dropped here.
    like_cnt = like_cnt.explode('songs').dropna()
    like_cnt = like_cnt.groupby('songs', as_index = False)[['like_cnt']].agg('sum')
    like_cnt['like_cnt'] = like_cnt.like_cnt.apply(np.log1p)
    like_cnt['like_cnt'] = like_cnt.like_cnt / np.max(like_cnt.like_cnt)
    merged = pd.merge(
        df, like_cnt,
        how = 'left', left_on = 'id', right_on = 'songs'
    )
    merged['like_cnt'] = merged.like_cnt.fillna(0)
    return merged.drop(columns = 'songs')

def add_popular(df, playlists = None):
    """Return `df` merged with a normalised per-song `popular` score.

    Counts the playlists each song appears in, applies log1p and divides by
    the maximum. Songs that appear in no playlist get 0.

    Args:
      df: song metadata with an `id` column.
      playlists: frame with `songs` (list of song ids) and `id` columns.
        Defaults to the notebook-level `train` frame.

    Returns:
      A new frame: `df` plus a `popular` column. Only that column is
      zero-filled; missing values in other columns are left untouched.
    """
    if playlists is None:
        playlists = train
    popular = playlists[['songs', 'id']].explode('songs').dropna()
    popular = popular.groupby('songs', as_index = False)[['id']].agg('count')
    popular = popular.rename(columns = {'id': 'popular'})
    popular['popular'] = popular.popular.apply(np.log1p)
    popular['popular'] = popular.popular / np.max(popular.popular)
    merged = pd.merge(
        df, popular,
        how = 'left', left_on = 'id', right_on = 'songs'
    )
    merged['popular'] = merged.popular.fillna(0)
    return merged.drop(columns = 'songs')

In [None]:
song_meta = add_like_cnt(song_meta)
song_meta = add_popular(song_meta)
song_meta.head(3)

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id,song_gnr_keywords,like_cnt,popular
0,[GN0901],20140512,불후의 명곡 7080 추억의 얄개시대 팝송베스트,2255639,[2727],feelings,[GN0900],[various artists],0,"[국외, 팝]",0.058775,0.142951
1,"[GN1601, GN1606]",20080421,bach partitas nos 2 3 4,376431,[29966],bach partita no 4 in d major bwv 828 ii allemande,[GN1600],[murray perahia],1,"[독주곡, 클래식]",0.0,0.0
2,[GN0901],20180518,hit,4698747,[3361],solsbury hill remastered 2002,[GN0900],[peter gabriel],2,"[국외, 팝]",0.0,0.0


In [None]:
song_meta.to_json(base_path + 'data/song_meta_v1.json', orient = 'records', force_ascii = False)