# Kakao Arena Brunch Posts Recommendation: First Look

In [2]:
import os
import json
from glob import glob
import urllib
import requests

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [70]:
def get_read(path: str) -> pd.DataFrame:
    """Parse one read-log file into a DataFrame with one row per log line.

    Each non-blank line is split on whitespace: the first token becomes
    ``user_private`` and the remaining tokens (possibly none) become the
    ``sequence`` list. ``start_time`` is the integer that precedes the first
    ``_`` in the file's base name and is repeated on every row.

    Args:
        path: Path to a log file whose base name starts with an integer
            followed by ``_``.

    Returns:
        DataFrame with columns ``start_time``, ``user_private`` and
        ``sequence``. A file without any log line yields an empty frame
        with the same columns.

    Raises:
        ValueError: If the base-name prefix is not an integer.
    """
    start_time = int(os.path.basename(path).split('_')[0])

    user_ids = []
    sequences = []
    # Read plain lines instead of going through the CSV parser: the log is
    # whitespace-delimited, so commas, quotes or NA-like tokens must not be
    # interpreted, and an empty file must not raise.
    with open(path, 'r', encoding='utf-8') as log_file:
        for line in log_file:
            tokens = line.split()  # split once, reuse for both columns
            if not tokens:
                continue  # blank line: nothing to record
            user_ids.append(tokens[0])
            sequences.append(tokens[1:])

    return pd.DataFrame({
        # Explicit dtype keeps the column integer-typed even when empty.
        'start_time': pd.Series([start_time] * len(user_ids), dtype='int64'),
        'user_private': user_ids,
        'sequence': sequences,
    })


def load(name: str = 'magazine', root_dir: str = '../raw/') -> pd.DataFrame:
    """Load one of the raw dataset tables as a DataFrame.

    Args:
        name: Which table to load. One of ``'magazine'``, ``'metadata'``,
            ``'users'`` (files with one JSON object per line), ``'dev'``,
            ``'test'`` (one user id per line) or ``'read'`` (every file
            under ``read/``, parsed with ``get_read`` and concatenated).
        root_dir: Directory that contains the raw files.

    Returns:
        The requested table. The ``id`` column is renamed to
        ``magazine_id`` / ``post_id`` / ``user_private`` for the JSON
        tables; ``users`` is reduced to ``user_private``,
        ``following_list`` and ``keyword_list``.

    Raises:
        NotImplementedError: If ``name`` is not one of the known tables.
    """
    paths = {
        'magazine': os.path.join(root_dir, 'magazine.json'),
        'metadata': os.path.join(root_dir, 'metadata.json'),
        'users': os.path.join(root_dir, 'users.json'),
        'dev': os.path.join(root_dir, 'predict/dev.users'),
        'test': os.path.join(root_dir, 'predict/test.users'),
        'read': os.path.join(root_dir, 'read/*'),
    }

    if name in ('magazine', 'metadata', 'users'):
        # Context manager closes the handle; blank lines are skipped so a
        # trailing newline does not break json.loads.
        with open(paths[name], 'r', encoding='utf-8') as json_file:
            records = [json.loads(line) for line in json_file if line.strip()]
        data = pd.DataFrame(records)

        if name == 'magazine':
            data = data.rename(columns={'id': 'magazine_id'})
        elif name == 'metadata':
            data = data.rename(columns={'id': 'post_id'})
        else:
            data = data.rename(columns={'id': 'user_private'})
            data = data[['user_private', 'following_list', 'keyword_list']]

    elif name in ('dev', 'test'):
        data = pd.read_csv(paths[name], header=None, names=['user_private'])

    elif name == 'read':
        # glob returns files in arbitrary order; sort for a reproducible
        # row order across runs and machines.
        frames = [get_read(path) for path in sorted(glob(paths[name]))]
        data = pd.concat(frames, axis=0, ignore_index=True)

    else:
        raise NotImplementedError('unknown dataset name: {!r}'.format(name))

    return data



In [71]:
# Load the magazine, metadata, users, dev and test tables (in that order)
# from the loader's default root directory.
magazine, metadata, users, dev, test = (
    load(name=table)
    for table in ('magazine', 'metadata', 'users', 'dev', 'test')
)


# # these take a fairly long time to load
# contents = [json.loads(line) for line in open("../raw/contents/data.0", 'r', encoding='utf-8')]
# read = load(name='read')

# Users
- user_private
- following_list
- keyword_list: 최근 해당 작가의 유입 키워드
    - keyword_list가 하나도 없는 유저는 작가가 아닐 가능성이 높음
    - 유입 키워드가 많을 수록 인기 있는 작가일 가능성이 높음

In [74]:
# Number of entries in each user's keyword_list.
users['num_keywords'] = users['keyword_list'].map(len)

In [98]:
# Display the users table, which now includes the num_keywords column.
users

Unnamed: 0,user_private,following_list,keyword_list,num_keywords
0,#901985d8bc4c481805c4a4f911814c4a,"[@perytail, @brunch]",[],0
1,#1fd89e9dcfa64b45020d9eaca54e0eed,"[@holidaymemories, @wadiz, @sciforus, @dailydu...",[],0
2,#1d94baaea71a831e1f33e1c6bd126ed5,"[@commerceguy, @sunsutu, @kakao-it, @joohoonja...",[],0
3,#04641c01892b12dc018b1410e4928c0d,"[@amberjeon48, @forsy20, @nemotokki, @hawann, ...",[],0
4,#65bcaff862aadff877e461f54187ab62,"[@dwcha7342, @iammento, @kakao-it, @dkam, @ant...",[],0
...,...,...,...,...
310753,#2863e47d50f1640df6dac10b7bad94fb,"[@login002, @kkonal, @leeraha, @tobeme, @sohyu...",[],0
310754,#4341a155d1966e5618e310c45386aea4,"[@simplelife-1p, @mint5051, @thecapitalist, @s...",[],0
310755,#0d70f397a78d2ef638f812592fa8e6ba,"[@cometseeker, @bijou, @suhanjang, @brunch]",[],0
310756,#1bbf5e3d1e4c373103981cdd819812da,"[@taekangk, @cielbleu, @yongisa, @joongheekim,...","[{'cnt': 1, 'keyword': '꽃병 꽃꽂이'}, {'cnt': 1, '...",16


# Predict

In [97]:
# Rows of the read log that belong to users listed in the dev set.
# NOTE: `read` must be loaded first; the `read = load(name='read')` call
# earlier in this notebook is commented out.
read.loc[read['user_private'].isin(dev['user_private'])]

Unnamed: 0,start_time,user_private,sequence
3,2018100100,#b8b9d09fe2961fd62edc94912bf75a90,"[@hyejinchoi_122, @hyejinchoi_86, @hyejinchoi_..."
9,2018100100,#748e26f7662012146a77f589dbbd8d69,"[@onefineday_225, @varo_634, @gradure_871, @on..."
10,2018100100,#76ddc76b38b3a6efa48a63fadf05b62b,[@pliossun_45]
18,2018100100,#9ec904c24c42df16f558f5c4407302e1,"[@pizzakim_20, @pizzakim_20]"
34,2018100100,#38a8b6e043685216c1240cddf481e6b4,[@hee072794_118]
...,...,...,...
3507074,2019022823,#4b0824bc3b78df120fad874dca5fbfec,"[@tenbody_1418, @tenbody_1418, @tenbody_1743, ..."
3507075,2019022823,#e93b4cf38313236426195b6a4e46ece8,[@jijuyeo_13]
3507080,2019022823,#b8f98df48ac08a95665c1e13d680d01e,[]
3507084,2019022823,#f40308439fc936f375a6c399db65e6bd,"[@shrainy80_3, @book-writer_37]"
