In [19]:
import os, sys
from google.colab import drive
# Mount Google Drive so files stored there are reachable under /content/drive.
drive.mount('/content/drive')

# Link the 'my_env' Drive folder to a path without spaces and put that path first on
# sys.path so modules stored in it can be imported.
my_path = '/content/notebooks'
try:
    os.symlink('/content/drive/My Drive/Colab Notebooks/my_env', my_path)
except FileExistsError:
    # The link is left over from an earlier run of this cell. Any other OSError
    # (e.g. a permission problem) now propagates instead of being reported as
    # "already linked".
    print('\nAlready linked...')
sys.path.insert(0, my_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Already linked...


In [20]:
import re, glob
import json
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from khaiii import KhaiiiApi
import sentencepiece as spm

from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *

In [21]:
# Project root, relative to the current working directory; list the data files it holds.
base_path = 'drive/My Drive/kakao_arena/'
display(glob.glob(base_path + 'data/*'))

['drive/My Drive/kakao_arena/data/val.json',
 'drive/My Drive/kakao_arena/data/test.json',
 'drive/My Drive/kakao_arena/data/genre_gn_all.json',
 'drive/My Drive/kakao_arena/data/train.json',
 'drive/My Drive/kakao_arena/data/song_meta.json',
 'drive/My Drive/kakao_arena/data/genre_v1.json',
 'drive/My Drive/kakao_arena/data/test_v1.json',
 'drive/My Drive/kakao_arena/data/val_v1.json',
 'drive/My Drive/kakao_arena/data/train_v1.json',
 'drive/My Drive/kakao_arena/data/song_meta_v1.json',
 'drive/My Drive/kakao_arena/data/val_v2.json',
 'drive/My Drive/kakao_arena/data/test_v2.json',
 'drive/My Drive/kakao_arena/data/train_v2.json',
 'drive/My Drive/kakao_arena/data/song_meta_v2.json']

In [22]:
# Load the "_v1" playlist splits, the song metadata and the genre table from the data folder.
train = pd.read_json(base_path + 'data/train_v1.json')
valid = pd.read_json(base_path + 'data/val_v1.json')
test = pd.read_json(base_path + 'data/test_v1.json')
song_meta = pd.read_json(base_path + 'data/song_meta_v1.json')
genre = pd.read_json(base_path + 'data/genre_v1.json')

1. 곡 장르 키워드와 태그 popularity를 활용하여 곡 키워드 추출
2. album name, song name, plylst title에서 tokenizing의 가장 큰 문제는 불용어(the, a, and 등)가 많다는 것
3. popularity가 낮은 artist 제거, album_id는?

In [23]:
# total = pd.concat([train.assign(group = 'train'),
#                    valid.assign(group = 'valid'),
#                    test.assign(group = 'test')], axis = 0).reset_index(drop = True)
# total.head(3)

In [24]:
# Rename the song id column to 'songs' and shorten the two genre-basket column names.
song_meta = song_meta.rename(columns = {'id': 'songs', 'song_gn_dtl_gnr_basket': 'song_gnr_dtl_basket', 'song_gn_gnr_basket': 'song_gnr_basket'})
song_meta.head(3)

Unnamed: 0,song_gnr_dtl_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gnr_basket,artist_name_basket,songs,song_gnr_keywords,like_cnt,popular
0,[GN0901],20140512,불후의 명곡 7080 추억의 얄개시대 팝송베스트,2255639,[2727],feelings,[GN0900],[various artists],0,"[국외, 팝]",0.058775,0.142951
1,"[GN1601, GN1606]",20080421,bach partitas nos 2 3 4,376431,[29966],bach partita no 4 in d major bwv 828 ii allemande,[GN1600],[murray perahia],1,"[독주곡, 클래식]",0.0,0.0
2,[GN0901],20180518,hit,4698747,[3361],solsbury hill remastered 2002,[GN0900],[peter gabriel],2,"[국외, 팝]",0.0,0.0


In [25]:
# Flatten the raw text sources that the keyword vocabulary is built from.
tags = train.tags.explode().dropna().str.lower()  # one lower-cased tag per row
titles = train.plylst_title.dropna()
album_names = song_meta.album_name.dropna()
song_names = song_meta.song_name.dropna()
artist_names = song_meta.artist_name_basket.explode().dropna()  # one artist name per row
gnr_keywords = song_meta.song_gnr_keywords.explode().dropna().unique()  # distinct genre keywords

In [26]:
def remove_blanks(List):
    """Strip surrounding whitespace from every string and drop those left empty.

    List: iterable of strings.
    Returns a new list, in the original order, of the non-empty stripped strings.
    """
    stripped = (item.strip() for item in List)
    return [item for item in stripped if item != '']

In [27]:
# Clean every text source; each name is rebound to a plain list of non-empty strings.
tags, titles, album_names, song_names, artist_names, gnr_keywords = map(remove_blanks, (tags, titles, album_names, song_names, artist_names, gnr_keywords))

## 1.1 Tokenizing

In [28]:
def remove_tails(List):
    """Remove a trailing Korean suffix (들, 만, 은, 는, 한) from each word.

    List: iterable of strings. Returns a new list of the same length; a word that
    consists only of one of the suffixes becomes the empty string.
    """
    tail_pattern = re.compile('|'.join(suffix + '$' for suffix in ('들', '만', '은', '는', '한')))
    trimmed = []
    for word in List:
        trimmed.append(tail_pattern.sub('', word))
    return trimmed

def remove_heads(List):
    """Drop a leading '19' or '20' from each word (e.g. '1990s' -> '90s').

    List: iterable of strings. Returns a new list of the same length.
    """
    head_pattern = re.compile('^19|^20')
    return list(map(lambda word: head_pattern.sub('', word), List))

def replace_specifics(List):
    """Rewrite known genre/instrument words inside each string to one canonical spelling.

    Replacement is plain substring substitution (``str.replace``), so a key also
    matches inside longer words.

    List: iterable of strings.
    Returns a new list of the same length with every key replaced by its value.
    """
    specifics = {'ballad': '발라드', 'dance': '댄스', 'rab': '랩', 'hiphop': '힙합', 'vocal': '보컬', 'newage': '뉴에이지', 'jazz': '재즈', 'classic': '클래식',
                 'rock': '록', '락': '록', 'metal': '메탈', 'alternative': '얼터너티브', 'pop': '팝', 'nu metal': '뉴 메탈', 'hard rock': '하드 록', 'indie': '인디',
                 'post': '포스트', 'club': '클럽', 'electronic': '일렉트로니카', 'musical': '뮤지컬', 'r&b': 'rnb', 'billboard': '빌보드', 'christmas': '크리스마스',
                 'cello': '첼로', 'piano': '피아노'}
    # NOTE(review): 'rab' -> '랩' looks like a typo for 'rap'; left unchanged because
    # switching the key would alter which substrings get rewritten -- confirm intent.

    # Apply longer keys first. In dict order 'rock' and 'metal' were substituted before
    # 'hard rock' and 'nu metal', so the multi-word entries could never match.
    # sorted() is stable, so keys of equal length keep their dict order.
    ordered = sorted(specifics.items(), key = lambda pair: len(pair[0]), reverse = True)

    replaced = []
    for w in List:
        for old, new in ordered:
            w = w.replace(old, new)
        replaced.append(w)
    return replaced

In [29]:
tokenizer = KhaiiiApi()
def get_token(words):
    """Return the surface forms of the morphemes in `words` tagged NNG, NNP or SL.

    NOTE(review): in the tag set used by Khaiii these are presumably general nouns,
    proper nouns and foreign-language tokens -- confirm against the Khaiii docs.

    words: a string; empty or otherwise falsy input yields [] without calling the analyzer.
    Returns a list of strings in analysis order.
    """
    # `not words` already covers the empty string, so the former `words == ''`
    # check was unreachable and has been removed.
    if not words:
        return []
    keep_tags = {'NNG', 'NNP', 'SL'}
    return [morph.lex
            for split in tokenizer.analyze(words)
            for morph in split.morphs
            if morph.tag in keep_tags]

def tokenizing(sentences):
    """Tokenise every sentence with get_token and return all tokens as one flat list.

    sentences: iterable of strings; progress is shown with a tqdm bar.
    """
    flat_tokens = []
    for words in tqdm(sentences):
        flat_tokens.extend(get_token(words))
    return flat_tokens

In [30]:
# Normalise the tags (strip trailing suffixes, drop leading 19/20, canonicalise genre
# words), then count how often each normalised tag occurs.
tags = remove_tails(tags)
tags = remove_heads(tags)
tags = replace_specifics(tags)
# Series.value_counts replaces the deprecated top-level pd.value_counts.
tag_cnts = pd.Series(tags).value_counts()

In [32]:
# Keep only the tags whose count is more than one standard deviation above the mean count.
common_tags = tag_cnts[tag_cnts > tag_cnts.mean() + tag_cnts.std()].index
print(len(common_tags))

185


In [33]:
# Merge the frequent tags with the genre keywords, without duplicates. Longest entries
# come first so that, when the list is joined into a regex alternation, a longer
# keyword is tried before its own substrings. Ties are broken alphabetically: plain
# set order depends on string hashing and changes between kernel restarts.
tags_gnrs = set(common_tags) | set(gnr_keywords)
tags_gnrs = sorted(tags_gnrs, key = lambda w: (-len(w), w))
print(len(tags_gnrs))

295


In [34]:
# vsw ("valid single words"): the one-character tag/genre keywords. remove_stopwords
# reads this list to keep them while dropping every other one-character token.
vsw = [w for w in tags_gnrs if len(w) == 1]
print(vsw)

['랩', '썸', 'j', '봄', '록', '팝', '비', '눈', '뉴', '밤']


In [35]:
def remove_stopwords(List, valid_single_words = None):
    """Drop stop words and uninformative one-character tokens.

    List: iterable of strings.
    valid_single_words: one-character tokens that are allowed to survive. Defaults to
        the notebook-level `vsw` list, which was the only behaviour before this
        parameter existed.
    Returns a list, in input order, of tokens that are not stop words and are either
    longer than one character or listed in `valid_single_words`.
    """
    # A frozenset gives constant-time membership tests instead of scanning a list
    # for every token.
    stopwords = frozenset([
        'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for',
        'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until',
        'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above',
        'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves',
        'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself',
        'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than',
        '의', '안', '며', '을', '를', '한', '은', '는', '이', '가', '과', '와', '할', '수', '로', '도', '으로', '것', '야', '로서', '로써', '아', '된', '면',
        '그', '그리고', '그런데', '그렇게', '그러나', '이러한', '이나', '이는', '이며', '이고', '이지만', '인데', '에', '에는', '에도', '에서', '곧', '및', '까지',
        '같', '같이', '같다', '또는', '것임', '전', '중인', '부터', '되', '되어', '되서', '되었', '되는', '돼', '않', '않은', '않는', '있', '있다', '있어', '있는', '있으',
        '따르', '따라서', '따른', '따라', '다만', '다고', '다음', '이다', '더', '때문', '보다', '보여', '보인', '위해서', '위하여', '관한',
        '하자', '하려', '하지만', '하는', '하더', '하여', '하지', '하기', '하게', '하고', '없으', '위하', '때', '등', '거나', '동', '호', '권', '일부', '인', '위', '주', '음'])
    if valid_single_words is None:
        # Looked up at call time, so the notebook-level list only has to exist when
        # the default is actually used.
        valid_single_words = vsw
    return [w for w in List
            if w not in stopwords and (len(w) > 1 or w in valid_single_words)]

def remove_specifics(List):
    """Drop tokens that appear on a hand-picked block list.

    The list holds short foreign function words, roman numerals, ordinal fragments
    and generic plural/album words. List: iterable of strings. Returns a new list
    with the blocked tokens removed, order preserved.
    """
    blocked = frozenset((
        'mixed', 'remixes', 'chris', 'turn', 'golden', 'et', 'di', 'al', 'con', 'la', 'un', 'da', 'ep', 'ii', 'vii', 'iii', 'iv', 'vi', 'en', 've', 'll', 'er', 'und', 'les', 'alla', 'je', 'sa',
        'ma', 'il', 'lil', 'oy', 'th', 'nd', 'mi', 'du', 'st', 'op', 'le', 'de', 'te', 'yo', 'rd', 'el', 're', 'vs', 'oh', 'fi', 'ay', 'ain', 'des', 'ver', 'ave', 'que', 'lee', 'der', 'us', 'rv',
        'hi', 'nos', 'arr', 'hits', 'missing', 'stars', 'children', 'boys', 'nights', 'remastered', 'lights', 'classical', 'classics', 'recordings', 'waiting', 'girls', 'strings', 'works',
        'sounds', 'volume', 'lovers', 'colors', 'sonatas', 'presents', 'kids', 'sings', 'tracks', 'concertos', 'days', 'friends', 'plays', 'years', 'etudes', 'things', 'singles', 'seasons',
        'times', 'songs', 'loves', 'dreams', 'always', 'comes'))
    kept = []
    for token in List:
        if token not in blocked:
            kept.append(token)
    return kept

def tkz_col(List):
    """Return the frequent, non-stop-word tokens of a text column.

    Steps: canonicalise genre words, blank out multi-character tag/genre keywords,
    tokenise what remains, keep tokens whose count is more than one standard
    deviation above the mean count, then drop stop words.

    List: iterable of strings.
    Returns a list of tokens.
    """
    # The keywords are literal text, so escape them: an unescaped '.', '+', '(' etc.
    # in a tag would otherwise be read as regex syntax.
    p = re.compile('|'.join(re.escape(w) for w in tags_gnrs if len(w) > 1))
    List = replace_specifics(List)
    List = [p.sub('', w).strip() for w in List]
    List = tokenizing(List)

    # Series.value_counts replaces the deprecated top-level pd.value_counts;
    # dtype=object keeps an empty token list from being inferred as numeric.
    w_cnts = pd.Series(List, dtype = object).value_counts()
    cut = w_cnts.mean() + w_cnts.std()
    List = w_cnts[w_cnts > cut].index
    List = remove_stopwords(List)
    return List

In [36]:
# Frequent tokens found in the training playlist titles.
common_titles = tkz_col(titles)

HBox(children=(FloatProgress(value=0.0, max=114770.0), HTML(value='')))




In [37]:
# Frequent tokens found in the album names.
common_albums = tkz_col(album_names)

HBox(children=(FloatProgress(value=0.0, max=707835.0), HTML(value='')))




In [38]:
# Frequent tokens found in the song names.
common_songs = tkz_col(song_names)

HBox(children=(FloatProgress(value=0.0, max=707913.0), HTML(value='')))




In [39]:
# Union of the frequent tokens from titles, album names and song names, without duplicates.
tokens = list(common_titles) + list(common_albums) + list(common_songs)
tokens = list(set(tokens))
print(len(tokens))

876


In [40]:
# Apply the same suffix/prefix normalisation used for the tags, then filter stop words
# again (trimming can produce new ones) and drop the hand-picked noise tokens.
tokens = remove_tails(tokens)
tokens = remove_heads(tokens)
tokens = remove_stopwords(tokens)
tokens = remove_specifics(tokens)
len(tokens)

778

In [41]:
# Final vocabulary: cleaned tokens plus the tag/genre keywords, without duplicates.
# Longest first so that a regex alternation built from it tries longer keywords before
# their substrings. Ties are broken alphabetically because plain set order depends on
# string hashing and changes between kernel restarts.
keywords = set(tokens) | set(tags_gnrs)
keywords = sorted(keywords, key = lambda w: (-len(w), w))
len(keywords)

1065

In [42]:
def extract_keywords(Series, col_name):
    """Find every vocabulary keyword occurring in each string of `Series`.

    Series: iterable of strings (genre words are canonicalised first).
    col_name: name given to the regex capture group and to the returned Series.
    Returns a Series of lists of matched keywords, keyed by the 0-based position of
    the input string. Positions with no match are absent from the result.
    """
    # dtype=object keeps the .str accessor usable when the input is empty.
    cleaned = pd.Series(replace_specifics(Series), dtype = object)
    # Keywords are literal text, so escape regex metacharacters. Empty keywords are
    # skipped because an empty alternative would match at every position.
    pattern = '|'.join(re.escape(w) for w in keywords if w)
    matches = cleaned.str.extractall(f'(?P<{col_name}>{pattern})')
    return matches.groupby(level = 0)[col_name].agg(list)

In [43]:
%%time
# Extract vocabulary keywords from every album name and song name
# (slow: the recorded run of this cell took about three minutes).
album_name_keywords = extract_keywords(song_meta.album_name, 'album_name_basket')
song_name_keywords = extract_keywords(song_meta.song_name, 'song_name_basket')

CPU times: user 3min 5s, sys: 294 ms, total: 3min 5s
Wall time: 3min 5s


In [44]:
def make_song_keyword(df):
    """Attach the song-name and album-name keyword baskets to `df`.

    Joins the notebook-level `song_name_keywords` and `album_name_keywords` Series on
    the row index, then replaces every missing value in the frame with an empty list
    (rows without any matched keyword get `[]`).

    NOTE(review): as before, missing values in *all* columns are replaced, not only
    in the two new basket columns -- confirm that this is intended.

    Returns a new DataFrame; the input frame is not modified.
    """
    df = df.join(song_name_keywords, how = 'left')
    df = df.join(album_name_keywords, how = 'left')
    # Fill via a missing-value mask instead of a 'NAN' sentinel string: the sentinel
    # also turned a genuine 'NAN' string into []. This also avoids the deprecated
    # DataFrame.applymap.
    for col in df.columns:
        missing = df[col].isna()
        if missing.any():
            filled = [[] if is_missing else value for value, is_missing in zip(df[col], missing)]
            df[col] = pd.Series(filled, index = df.index, dtype = object)
    return df

In [45]:
# Add the song_name_basket / album_name_basket columns to the song metadata.
song_meta = make_song_keyword(song_meta)
song_meta.head(3)

Unnamed: 0,song_gnr_dtl_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gnr_basket,artist_name_basket,songs,song_gnr_keywords,like_cnt,popular,song_name_basket,album_name_basket
0,[GN0901],20140512,불후의 명곡 7080 추억의 얄개시대 팝송베스트,2255639,[2727],feelings,[GN0900],[various artists],0,"[국외, 팝]",0.058775,0.142951,[feeling],"[불후, 명곡, 추억, 시대, 팝송, 베스트]"
1,"[GN1601, GN1606]",20080421,bach partitas nos 2 3 4,376431,[29966],bach partita no 4 in d major bwv 828 ii allemande,[GN1600],[murray perahia],1,"[독주곡, 클래식]",0.0,0.0,"[bach, part, major, bwv, man]","[bach, part]"
2,[GN0901],20180518,hit,4698747,[3361],solsbury hill remastered 2002,[GN0900],[peter gabriel],2,"[국외, 팝]",0.0,0.0,[remaster],[hit]


In [46]:
def make_title_basket(df):
    """Add a `title_basket` column holding the keywords found in each playlist title.

    Keywords are extracted from `df.plylst_title` and joined back on the row index;
    afterwards every missing value in the frame is replaced with an empty list
    (titles without any keyword get `[]`).

    NOTE(review): as before, missing values in *all* columns are replaced, not only
    in `title_basket` -- confirm that this is intended.

    Returns a new DataFrame; the input frame is not modified.
    """
    plylist_keywords = extract_keywords(df.plylst_title, 'title_basket')
    df = df.join(plylist_keywords, how = 'left')
    # Fill via a missing-value mask instead of a 'NAN' sentinel string: the sentinel
    # also turned a playlist literally titled 'NAN' into []. This also avoids the
    # deprecated DataFrame.applymap.
    for col in df.columns:
        missing = df[col].isna()
        if missing.any():
            filled = [[] if is_missing else value for value, is_missing in zip(df[col], missing)]
            df[col] = pd.Series(filled, index = df.index, dtype = object)
    return df

In [47]:
def make_tag_basket(df):
    """Add a `tag_basket` column: the vocabulary keywords found in each playlist's tags.

    Keywords are extracted once per distinct tag and then looked up for every
    playlist, so a tag shared by many playlists is only matched once.

    The column is assigned on `df` itself (the caller's frame is modified) and the
    same frame is returned.
    """
    tag_org = pd.Series(df.tags.explode().dropna().unique())
    tag_kwd = extract_keywords(tag_org, 'tag_basket')
    # tag_kwd is keyed by position in tag_org and has no entry for tags without any
    # keyword. dict.get supplies the empty default explicitly; the earlier bare
    # `except:` did the same but would also have hidden unrelated errors.
    found = tag_kwd.to_dict()
    tag_kwd_dic = {tag: found.get(i, []) for i, tag in enumerate(tqdm(tag_org))}
    df['tag_basket'] = df.tags.apply(lambda x: [z for y in x for z in tag_kwd_dic[y]])
    return df

In [48]:
# Add the title_basket and then the tag_basket column to each of the three splits.
train, valid, test = map(make_title_basket, (train, valid, test))
train, valid, test = map(make_tag_basket, (train, valid, test))

HBox(children=(FloatProgress(value=0.0, max=29160.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4875.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2824.0), HTML(value='')))




In [49]:
# Write the enriched splits as "_v2" files; force_ascii=False keeps non-ASCII text
# (e.g. Korean) unescaped in the output.
train.to_json(base_path + 'data/train_v2.json', orient = 'records', force_ascii = False)
valid.to_json(base_path + 'data/val_v2.json', orient = 'records', force_ascii = False)
test.to_json(base_path + 'data/test_v2.json', orient = 'records', force_ascii = False)

## 1.2 Remove low-frequency artists

In [50]:
# Count how many artist_id_basket entries each artist id has across song_meta and keep
# the ids that occur more than once, excluding the "various artists" placeholder id.
artist_cnts = song_meta.artist_id_basket.explode().dropna().value_counts()
common_artists = artist_cnts[artist_cnts > 1].index
common_artists = [artist for artist in common_artists if artist != 2727] # 2727 is various artists
print(len(common_artists))

58956


In [51]:
# Map artist id -> artist name by pairing the two exploded basket columns element by element.
# NOTE(review): assumes each row's id list and name list have the same length and order;
# zip would silently misalign or truncate otherwise -- confirm.
artist_id_dic = {i: n for i, n in zip(song_meta.artist_id_basket.explode(), song_meta.artist_name_basket.explode())}

In [52]:
def remove_artists(df):
    """Restrict each song's artist baskets to the artists listed in `common_artists`.

    `artist_id_basket` is filtered first; `artist_name_basket` is then rebuilt from
    the surviving ids via `artist_id_dic`, so both columns stay aligned.

    Both columns are assigned on `df` itself (the caller's frame is modified) and the
    same frame is returned.
    """
    # Build the lookup set once: `common_artists` is a long list, and testing
    # membership against it for every id of every song is needlessly slow.
    keep = set(common_artists)
    df['artist_id_basket'] = df.artist_id_basket.apply(lambda ids: [i for i in ids if i in keep])
    # The ids were filtered on the line above, so no second membership test is needed.
    df['artist_name_basket'] = df.artist_id_basket.apply(lambda ids: [artist_id_dic[i] for i in ids])
    return df

In [53]:
# Drop the excluded artists from the song metadata baskets (affected rows get empty lists).
song_meta = remove_artists(song_meta)
song_meta.head(3)

Unnamed: 0,song_gnr_dtl_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gnr_basket,artist_name_basket,songs,song_gnr_keywords,like_cnt,popular,song_name_basket,album_name_basket
0,[GN0901],20140512,불후의 명곡 7080 추억의 얄개시대 팝송베스트,2255639,[],feelings,[GN0900],[],0,"[국외, 팝]",0.058775,0.142951,[feeling],"[불후, 명곡, 추억, 시대, 팝송, 베스트]"
1,"[GN1601, GN1606]",20080421,bach partitas nos 2 3 4,376431,[29966],bach partita no 4 in d major bwv 828 ii allemande,[GN1600],[murray perahia],1,"[독주곡, 클래식]",0.0,0.0,"[bach, part, major, bwv, man]","[bach, part]"
2,[GN0901],20180518,hit,4698747,[3361],solsbury hill remastered 2002,[GN0900],[peter gabriel],2,"[국외, 팝]",0.0,0.0,[remaster],[hit]


In [54]:
# Write the enriched song metadata as the "_v2" file, keeping non-ASCII text unescaped.
song_meta.to_json(base_path + 'data/song_meta_v2.json', orient = 'records', force_ascii = False)