## Kakao arena 2nd Competition
# "브런치 사용자를 위한 글 추천 대회"
### brunch 데이터를 활용해 사용자의 취향에 맞는 글을 예측하는 대회
* 공식 홈페이지: https://arena.kakao.com/c/2
* 베이스 코드: https://github.com/kakao-arena/brunch-article-recommendation

### BrunchRec 
* designed by **datartist**
* 깃헙 주소: https://github.com/jihoo-kim/BrunchRec  

## 1. 라이브러리 및 원본데이터

In [1]:
## 라이브러리 불러오기

import numpy as np
import pandas as pd
import os
import glob
import time
import pickle
import datetime
import warnings
from itertools import chain
from collections import Counter

directory = './res/'
warnings.filterwarnings(action='ignore')

In [2]:
## Load the raw data
# Every path is built from `directory` so the data location is configured in one place.

# users  // DataFrame (310758, 3)
users = pd.read_json(os.path.join(directory, 'users.json'), lines=True)

# magazine  // DataFrame (27967, 2)
magazine = pd.read_json(os.path.join(directory, 'magazine.json'), lines=True)

# metadata  // DataFrame (643104, 9)
metadata = pd.read_json(os.path.join(directory, 'metadata.json'), lines=True)

# dev.users  // List (3000)
# `with` guarantees the handle is closed even if reading fails
with open(os.path.join(directory, 'predict', 'dev.users')) as f:
    dev_users = f.read().splitlines()

# test.users  // List (5000)
with open(os.path.join(directory, 'predict', 'test.users')) as f:
    test_users = f.read().splitlines()

# read  // DataFrame (3507097, 5)
# glob returns paths in arbitrary order; sort them so the row order of `read`
# is reproducible from run to run
read_file_lst = sorted(glob.glob(os.path.join(directory, 'read', '*')))
exclude_file_lst = ['read.tar']
read_df_lst = []
for f in read_file_lst:
    file_name = os.path.basename(f)
    if file_name in exclude_file_lst:
        # Not a read log: report it and skip
        print(file_name)
    else:
        # Each line becomes one 'raw' string: "<user_id> <article_id> <article_id> ..."
        df_temp = pd.read_csv(f, header=None, names=['raw'])
        # The file name carries the timestamp: chars 0-7 go to 'dt', chars 8-9 to 'hr'
        df_temp['dt'] = file_name[:8]
        df_temp['hr'] = file_name[8:10]
        # Split once and reuse: first token is the user, the rest are article ids
        tokens = df_temp['raw'].str.split(' ')
        df_temp['user_id'] = tokens.str[0]
        df_temp['article_id'] = tokens.str[1:].str.join(' ').str.strip()
        read_df_lst.append(df_temp)
read = pd.concat(read_df_lst)

# read_raw  // DataFrame (22110706, 4)
def chainer(s):
    """Flatten a Series of space-separated strings into one list of tokens.

    Each element of ``s`` is split on single spaces and the resulting tokens
    are appended, in order, to a single flat list.
    """
    flattened = []
    for tokens in s.str.split(' '):
        flattened.extend(tokens)
    return flattened

# Number of space-separated article ids in each row of `read`
read_cnt_by_user = read['article_id'].str.split(' ').apply(len)

# Explode `read` to one row per article id: the dt / hr / user_id value of a row
# is repeated once for every article id listed on that row
read_raw = pd.DataFrame({
    **{column: np.repeat(read[column], read_cnt_by_user)
       for column in ('dt', 'hr', 'user_id')},
    'article_id': chainer(read['article_id']),
})

## 2. 전처리 관련 함수

In [3]:
## read_raw 'dt' preprocessing (convert the date column from string to int)
def read_raw_dt_preprocessing(read_raw_df):
    """Convert the ``dt`` column of ``read_raw_df`` from strings to integers.

    The conversion is applied in place and only when the first ``dt`` value is
    a string; otherwise (already converted, or empty frame) nothing changes.

    Args:
        read_raw_df: DataFrame with a ``dt`` column.

    Returns:
        The same DataFrame object that was passed in.
    """
    dt_values = read_raw_df['dt'].values

    # Inspect the frame that was passed in, not a module-level global, and
    # tolerate an empty frame (which has no first element to look at).
    if len(dt_values) > 0 and isinstance(dt_values[0], str):
        read_raw_df['dt'] = [int(value) for value in dt_values]
        print('preprocessing completed!')
    else:
        print('already preprocessed!')

    return read_raw_df

In [4]:
## metadata 'view' preprocessing (store the view count over the whole period)
def metadata_view_preprocessing(metadata_df, read_raw_df):
    """Add a ``view`` column to ``metadata_df`` counted from ``read_raw_df``.

    ``view`` is the number of rows in ``read_raw_df`` (with a non-null
    ``user_id``) whose ``article_id`` equals the article's ``id``. Articles
    that never appear get 0. If ``metadata_df`` already has a ``view`` column
    it is returned unchanged.

    Args:
        metadata_df: DataFrame with an ``id`` column.
        read_raw_df: DataFrame with ``article_id`` and ``user_id`` columns.

    Returns:
        ``metadata_df`` itself when already processed, otherwise a new merged
        DataFrame with the extra ``view`` column.
    """
    # Check the frame that was passed in, not the module-level `metadata`
    if 'view' in metadata_df.columns:
        print('already preprocessed!')
        return metadata_df

    view = read_raw_df.groupby('article_id')['user_id'].count()
    view_df = pd.DataFrame({'id': view.index, 'view': view.values})
    # Left merge keeps every article; unread ones come back as NaN -> 0
    metadata_df = pd.merge(metadata_df, view_df, how='left', on='id')
    metadata_df['view'] = metadata_df['view'].fillna(0)
    print("preprocessing completed!")

    return metadata_df

In [5]:
## Build target_df (add the user_ids that are missing from users)
def target_df_generator(target_users_list, users_df):
    """Return the rows of ``users_df`` for the target users, adding missing ones.

    Target users that do not appear in ``users_df['id']`` are appended (once
    each, in the order they occur in ``target_users_list``) with empty
    ``following_list`` and ``keyword_list``.

    Args:
        target_users_list: list of user ids to build the frame for.
        users_df: DataFrame with ``id``, ``following_list`` and
            ``keyword_list`` columns.

    Returns:
        A new DataFrame. The index is not reset, so labels may repeat; select
        rows through the ``id`` column.
    """
    # Copy so that columns added later do not write into a slice of users_df
    target_df = users_df[users_df['id'].isin(target_users_list)].copy()

    # Set membership instead of rebuilding a list for every target user
    known_ids = set(target_df['id'].tolist())
    missing_users = []
    for target_user in target_users_list:
        if target_user not in known_ids:
            known_ids.add(target_user)
            missing_users.append(target_user)

    if missing_users:
        new_df = pd.DataFrame({
            # A separate empty list per row so rows never share one object
            'following_list': [[] for _ in missing_users],
            'id': missing_users,
            'keyword_list': [[] for _ in missing_users],
        })
        # DataFrame.append was removed in pandas 2.0; concat is the replacement
        target_df = pd.concat([target_df, new_df])
    print("preprocessing completed!")

    return target_df

In [6]:
## target_df 'read' preprocessing (store the articles each target user read over the whole period)
def target_read_article(target_df, read_raw_df, file):
    """Attach a ``read`` column listing every article each target user read.

    If ``file`` exists, the per-user lists are loaded from that pickle;
    otherwise they are computed from ``read_raw_df`` (duplicates kept, in the
    row order of ``read_raw_df``) and pickled to ``file``.

    Args:
        target_df: DataFrame with an ``id`` column; modified in place.
        read_raw_df: DataFrame with ``user_id`` and ``article_id`` columns.
        file: path of the pickle cache.

    Returns:
        ``target_df`` with the added ``read`` column.
    """
    if os.path.isfile(file):
        # NOTE: pickle executes arbitrary code on load; only use cache files
        # that were produced locally by this function.
        with open(file, "rb") as fr:
            read_lists = pickle.load(fr)
        print("target_read_article file loaded..")

    else:
        user_ids = target_df['id'].values.tolist()

        # Group the reads in a single pass instead of scanning the whole of
        # read_raw_df once per target user.
        reads_by_user = {user_id: [] for user_id in user_ids}
        target_rows = read_raw_df[read_raw_df['user_id'].isin(user_ids)]
        for user_id, article_id in zip(target_rows['user_id'].values.tolist(),
                                       target_rows['article_id'].values.tolist()):
            bucket = reads_by_user.get(user_id)
            if bucket is not None:
                bucket.append(article_id)

        read_lists = []
        total = len(user_ids)
        for done, user_id in enumerate(user_ids, start=1):
            # Copy so duplicate ids in target_df do not share one list object
            read_lists.append(list(reads_by_user[user_id]))

            # Progress report
            print(done, '/', str(total), 'completed')

        # Make sure the cache directory exists before writing
        cache_dir = os.path.dirname(file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(file, "wb") as fw:
            pickle.dump(read_lists, fw)
        print("target_read_article file saved..")

    target_df['read'] = read_lists
    print("preprocessing completed!")

    return target_df

In [7]:
# target_df 'magazine' preprocessing (store the magazines each target user read over the whole period)
def target_read_magazine(target_df, metadata_df, file):
    """Attach a ``read_magazine`` column with per-user magazine counts.

    When ``file`` exists, the counters are loaded from that pickle. Otherwise,
    for every user, the ``magazine_id`` of each ``metadata_df`` row whose
    ``id`` is in the user's ``read`` list is counted, the key ``0`` is
    dropped, and the list of counters is pickled to ``file``.

    Returns ``target_df`` (modified in place).
    """
    if os.path.isfile(file):
        with open(file, "rb") as fr:
            magazines_per_user = pickle.load(fr)
        print("target_read_magazine file loaded..")

    else:
        magazines_per_user = []

        for done, user_id in enumerate(target_df['id'].values.tolist(), start=1):
            # Copy of the article ids this user has read
            own_rows = target_df[target_df['id'] == user_id]
            read_articles = own_rows['read'].values[0][:]

            # magazine_id of every metadata row that the user has read
            was_read = metadata_df['id'].isin(read_articles)
            counts = Counter(metadata_df[was_read]['magazine_id'].tolist())

            # 0 marks "not part of a magazine": exclude it from the counts
            counts.pop(0, None)
            magazines_per_user.append(counts)

            # Progress report
            print(done, '/', str(len(target_df['id'].values.tolist())), 'completed')

        with open(file, "wb") as fw:
            pickle.dump(magazines_per_user, fw)
        print("target_read_magazine file saved..")

    target_df['read_magazine'] = magazines_per_user
    print("preprocessing completed!")

    return target_df

In [8]:
## target_df 'tag' preprocessing (store the tag frequencies of the articles each target user read in a period)
def target_read_tag(target_df, metadata_df, read_raw_df, from_dt, to_dt, file):
    """Attach a ``read_tag`` column with per-user tag frequencies.

    If ``file`` exists the counters are loaded from that pickle. Otherwise,
    for every target user, the ``keyword_list`` of each article read with
    ``from_dt <= dt <= to_dt`` is counted (repeated reads count again;
    articles absent from ``metadata_df`` are skipped) and the result is
    pickled to ``file``.

    Args:
        target_df: DataFrame with an ``id`` column; modified in place.
        metadata_df: DataFrame with ``id`` and ``keyword_list`` columns.
        read_raw_df: DataFrame with ``dt``, ``user_id`` and ``article_id``.
        from_dt: inclusive lower bound on ``dt``.
        to_dt: inclusive upper bound on ``dt``.
        file: path of the pickle cache.

    Returns:
        ``target_df`` with the added ``read_tag`` column.
    """
    if os.path.isfile(file):
        # NOTE: pickle executes arbitrary code on load; only use cache files
        # that were produced locally by this function.
        with open(file, "rb") as fr:
            tag_counters = pickle.load(fr)
        print("target_read_tag file loaded..")

    else:
        user_ids = target_df['id'].values.tolist()

        # Reads by target users inside the period (from_dt ~ to_dt)
        in_period = (read_raw_df['dt'] >= from_dt) & (read_raw_df['dt'] <= to_dt)
        partial_read = read_raw_df[in_period & read_raw_df['user_id'].isin(user_ids)]

        # Group the reads per user in one pass (row order preserved)
        reads_by_user = {user_id: [] for user_id in user_ids}
        for user_id, article_id in zip(partial_read['user_id'].values.tolist(),
                                       partial_read['article_id'].values.tolist()):
            bucket = reads_by_user.get(user_id)
            if bucket is not None:
                bucket.append(article_id)

        # Build an article -> keyword_list lookup once, instead of scanning
        # metadata_df twice for every single read. The first matching row
        # wins, as with `.values[0]` on a filtered frame.
        needed_ids = set(partial_read['article_id'].values.tolist())
        needed = metadata_df[metadata_df['id'].isin(needed_ids)]
        tags_by_article = {}
        for article_id, tags in zip(needed['id'].values.tolist(),
                                    needed['keyword_list'].values.tolist()):
            tags_by_article.setdefault(article_id, tags)

        tag_counters = []
        total = len(user_ids)
        for done, user_id in enumerate(user_ids, start=1):
            # Tag frequencies over everything this user read in the period
            frequency = Counter()
            for article_id in reads_by_user[user_id]:
                if article_id in tags_by_article:
                    frequency.update(tags_by_article[article_id])
            tag_counters.append(frequency)

            # Progress report
            print(done, '/', str(total), 'completed')

        # Make sure the cache directory exists before writing
        cache_dir = os.path.dirname(file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(file, "wb") as fw:
            pickle.dump(tag_counters, fw)
        print("target_read_tag file saved..")

    target_df['read_tag'] = tag_counters
    print("preprocessing completed!")

    return target_df

In [9]:
## target_df 'interest' preprocessing (store the 6 most frequent keywords of read_tag)
def target_interest(target_df, file):
    """Attach an ``interest`` column with each user's top keywords.

    When ``file`` exists the lists are loaded from that pickle. Otherwise each
    user's ``read_tag`` counter is sorted by descending frequency and the
    first six keys (fewer if the counter is smaller) are kept; the result is
    pickled to ``file``.

    Returns ``target_df`` (modified in place).
    """
    if os.path.isfile(file):
        with open(file, "rb") as fr:
            interests_per_user = pickle.load(fr)
        print("target_interest file loaded..")

    else:
        interests_per_user = []

        for done, user_id in enumerate(target_df['id'].values.tolist(), start=1):
            tag_counts = target_df[target_df['id'] == user_id]['read_tag'].values[0]

            # Stable sort: tags with equal counts keep their original order
            ranked = sorted(tag_counts.items(), key=lambda pair: pair[1], reverse=True)
            interests_per_user.append([tag for tag, _ in ranked[:6]])

            # Progress report
            print(done, '/', str(len(target_df['id'].values.tolist())), 'completed')

        with open(file, "wb") as fw:
            pickle.dump(interests_per_user, fw)
        print("target_interest file saved..")

    target_df['interest'] = interests_per_user
    print("preprocessing completed!")

    return target_df

## 3. 추천 관련 함수

In [10]:
# Recommend the articles with the highest recent view counts
def popularity_based_recommend(r_list, recommended, read_raw_df, from_dt, to_dt, file):
    """Fill the remaining recommendation slots with the most-read articles.

    The popularity ranking (read count per ``article_id`` starting with
    ``'@'``, for ``from_dt <= dt <= to_dt``, most read first) is loaded from
    ``file`` if it exists, otherwise computed from ``read_raw_df`` and pickled
    to ``file``.

    Args:
        r_list: article ids the user has already read.
        recommended: article ids already recommended to the user.
        read_raw_df: DataFrame with ``dt``, ``user_id`` and ``article_id``.
        from_dt: inclusive lower bound on ``dt``.
        to_dt: inclusive upper bound on ``dt``.
        file: path of the pickle cache for the ranking.

    Returns:
        Up to ``100 - len(recommended)`` article ids (never a negative count)
        that are in neither ``r_list`` nor ``recommended``.
    """
    already = r_list + recommended

    if os.path.isfile(file):
        # NOTE: pickle executes arbitrary code on load; only use cache files
        # that were produced locally by this function.
        with open(file, "rb") as fr:
            popularity = pickle.load(fr)

    else:
        in_period = (read_raw_df['dt'] >= from_dt) & (read_raw_df['dt'] <= to_dt)
        partial_read = read_raw_df[in_period]
        # na=False: a missing article id is not a candidate (and keeps the mask boolean)
        available = partial_read[partial_read['article_id'].str.startswith('@', na=False)]
        view = available.groupby('article_id').count()
        popularity = view.sort_values(['user_id'], ascending=[False])['user_id']

        # Save the ranking; make sure the cache directory exists first
        cache_dir = os.path.dirname(file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(file, "wb") as fw:
            pickle.dump(popularity, fw)
        print("popularity file saved!")

    candidate = popularity[~popularity.index.isin(already)]
    # Clamp at 0: a negative slice bound would return almost every candidate
    n_rec = max(0, 100 - len(recommended))
    popularity_based_recommend_list = candidate.iloc[:n_rec].index.tolist()

    return popularity_based_recommend_list

In [11]:
# Recommend by the share of followed writers' articles among what the target user read
def following_based_recommend(idx, target_df, metadata_df, r_list, recommended):
    """Recommend recent articles by the followed writers the user actually reads.

    The number of articles taken from each followed writer is
    ``int((100 - len(recommended)) * f_ratio * fr_ratio)``, where ``f_ratio``
    is the share of ``r_list`` written by followed writers and ``fr_ratio`` is
    that writer's share among those. An article counts as a writer's when its
    id starts with ``<writer id>_``.

    Args:
        idx: id of the target user.
        target_df: DataFrame with ``id`` and ``following_list`` columns.
        metadata_df: DataFrame with ``id``, ``user_id`` and ``reg_ts``.
        r_list: article ids the user has already read.
        recommended: article ids already recommended to the user.

    Returns:
        List of article ids, newest (largest ``reg_ts``) first per writer.
        Empty when the user has read nothing or nothing by followed writers.
    """
    # A user with no reads has no ratios to compute (and would divide by zero)
    if not r_list:
        return []

    already = r_list + recommended
    # Clamp at 0 so an over-full `recommended` cannot give a negative n_rec
    remaining = max(0, 100 - len(recommended))
    following_based_recommend_list = []

    # f_list -> the target user's following_list, each id suffixed with '_'
    # (a new list is built, so the stored following_list is not modified)
    following = target_df[target_df['id'] == idx]['following_list'].values[0]
    f_list = [writer_id + '_' for writer_id in following]

    def count_reads(prefix):
        """Count read articles whose id starts with ``prefix`` (str or tuple)."""
        return sum(1 for article_id in r_list
                   if isinstance(article_id, str) and article_id.startswith(prefix))

    # fr_count -> number of read articles (r) written by followed writers (fr)
    fr_count = count_reads(tuple(f_list))
    if fr_count == 0:
        return following_based_recommend_list

    # f_ratio -> share of followed writers' articles among the read articles
    f_ratio = fr_count / len(r_list)

    # fr_dic -> {f_id: fr_ratio}, the writer's share among the followed reads
    fr_dic = {}
    for f_id in f_list:
        fr_ratio = count_reads(f_id) / fr_count
        if fr_ratio > 0:
            fr_dic[f_id] = fr_ratio

    # sorted_fr -> highest fr_ratio first (stable for equal ratios)
    sorted_fr = sorted(fr_dic.items(), key=lambda x: x[1], reverse=True)

    for f_id, fr_ratio in sorted_fr:
        # n_rec -> number of this writer's articles to recommend
        n_rec = int(remaining * f_ratio * fr_ratio)
        # f_id[:-1] strips the '_' suffix to recover the writer id
        fr_article = metadata_df[metadata_df['user_id'] == f_id[:-1]]
        # Skip read / already recommended articles, newest first, n_rec at most
        unseen = fr_article[~fr_article['id'].isin(already)]
        fr_candidate = unseen.sort_values(['reg_ts'], ascending=[False]).head(n_rec)['id'].tolist()
        following_based_recommend_list = following_based_recommend_list + fr_candidate

    return following_based_recommend_list

In [12]:
# Recommend by the share of magazine articles among what the target user read
def magazine_based_recommend(idx, target_df, metadata_df, r_list, recommended):
    """Recommend recent articles from the magazines the user reads most.

    The number of articles taken from each magazine is
    ``int((100 - len(recommended)) * m_ratio * share)``, where ``m_ratio`` is
    the sum of the user's ``read_magazine`` counts divided by ``len(r_list)``
    and ``share`` is that magazine's fraction of those counts.

    Args:
        idx: id of the target user.
        target_df: DataFrame with ``id`` and ``read_magazine`` columns.
        metadata_df: DataFrame with ``id``, ``magazine_id`` and ``reg_ts``.
        r_list: article ids the user has already read.
        recommended: article ids already recommended to the user.

    Returns:
        List of article ids, newest (largest ``reg_ts``) first per magazine.
        Empty when the user has read nothing or has no magazine counts.
    """
    # A user with no reads would make the ratio below divide by zero
    if not r_list:
        return []

    already = r_list + recommended
    # Clamp at 0 so an over-full `recommended` cannot give a negative n_rec
    remaining = max(0, 100 - len(recommended))
    magazine_based_recommend_list = []

    # mr_dic -> {m_id: frequency}
    mr_dic = target_df[target_df['id'] == idx]['read_magazine'].values[0]
    mr_total = sum(mr_dic.values())
    if mr_total == 0:
        return magazine_based_recommend_list

    # m_ratio -> magazine reads relative to the number of read articles
    m_ratio = mr_total / len(r_list)

    # sorted_mr -> most frequent magazine first (stable for equal counts)
    sorted_mr = sorted(mr_dic.items(), key=lambda x: x[1], reverse=True)

    for m_id, frequency in sorted_mr:
        # n_rec -> number of this magazine's articles to recommend
        n_rec = int(remaining * m_ratio * (frequency / mr_total))
        mr_article = metadata_df[metadata_df['magazine_id'] == m_id]
        # Skip read / already recommended articles, newest first, n_rec at most
        unseen = mr_article[~mr_article['id'].isin(already)]
        mr_candidate = unseen.sort_values(['reg_ts'], ascending=[False]).head(n_rec)['id'].tolist()
        magazine_based_recommend_list = magazine_based_recommend_list + mr_candidate

    return magazine_based_recommend_list

In [13]:
# Recommend by the tags that occur often in what the target user read
def tag_based_recommend(idx, target_df, metadata_df, r_list, recommended):
    """Recommend recent articles sharing at least two tags with the user's interests.

    Args:
        idx: id of the target user.
        target_df: DataFrame with ``id`` and ``interest`` columns.
        metadata_df: DataFrame with ``id``, ``keyword_list`` and ``reg_ts``.
        r_list: article ids the user has already read.
        recommended: article ids already recommended to the user.

    Returns:
        Up to ``100 - len(recommended)`` article ids (never a negative count),
        newest (largest ``reg_ts``) first, excluding read and already
        recommended articles.
    """
    already = r_list + recommended
    # Clamp at 0 so an over-full `recommended` cannot give a negative slice
    n_rec = max(0, 100 - len(recommended))

    # user_interest -> the target user's interest keywords
    user_interest = set(target_df[target_df['id'] == idx]['interest'].values[0])

    # Fewer than two interests can never overlap in two tags: skip the scan
    if len(user_interest) < 2 or n_rec == 0:
        return []

    # interest_article_id -> ids of articles sharing >= 2 tags with the user.
    # Rows are read from the metadata_df argument itself (not the module-level
    # `metadata`), so a filtered frame is scanned over its own rows.
    interest_article_id = [
        article_id
        for article_id, keywords in zip(metadata_df['id'].values,
                                        metadata_df['keyword_list'].values)
        if len(user_interest.intersection(keywords)) >= 2
    ]

    # Skip read / already recommended articles, newest first, n_rec at most
    t_article = metadata_df[metadata_df['id'].isin(interest_article_id)]
    unseen = t_article[~t_article['id'].isin(already)]
    tag_based_recommend_list = unseen.sort_values(['reg_ts'], ascending=[False]).head(n_rec)['id'].tolist()

    return tag_based_recommend_list

In [14]:
def recommender(target_list, target_df, metadata_df, read_raw_df):
    """Build up to 100 recommendations per target user and write them to a file.

    For every user the following-, magazine- and tag-based recommenders are
    applied in that order on the articles with ``view > 0``; if fewer than 100
    articles were collected, the popularity-based recommender fills the rest.
    One line per user (``user_id`` followed by the article ids, separated by
    spaces) is written to ``recommend.txt``.

    Args:
        target_list: list of target user ids.
        target_df: DataFrame with an ``id`` column, a ``read`` column and the
            columns read by the individual recommenders.
        metadata_df: article metadata including a ``view`` column.
        read_raw_df: read log used by the popularity-based recommender.

    Returns:
        List of rows, each ``[user_id, article_id, ...]``.
    """
    recommend_list = []
    total = len(target_list)

    # metadata_0 -> metadata without the articles whose view is 0
    metadata_0 = metadata_df[metadata_df['view'] > 0]

    for done, idx in enumerate(target_list, start=1):
        recommended = []

        # r_list -> copy of the articles the target user has already read
        r_list = target_df[target_df['id'] == idx]['read'].values[0][:]

        # following_based recommendation
        recommended = recommended + following_based_recommend(idx, target_df, metadata_0, r_list, recommended)
        f_based = len(recommended)

        # magazine_based recommendation
        recommended = recommended + magazine_based_recommend(idx, target_df, metadata_0, r_list, recommended)
        m_based = len(recommended) - f_based

        # tag_based recommendation
        recommended = recommended + tag_based_recommend(idx, target_df, metadata_0, r_list, recommended)
        t_based = len(recommended) - m_based - f_based

        # Fewer than 100 so far: fill up with the popularity_based recommendation.
        # Arguments follow the callee's signature:
        # (r_list, recommended, read_raw_df, from_dt, to_dt, file)
        if len(recommended) < 100:
            recommended = recommended + popularity_based_recommend(
                r_list, recommended, read_raw_df, 20190222, 20190228,
                './pickle/popularity_190222_190228')
        p_based = len(recommended) - t_based - m_based - f_based

        # Put the user_id at the front of the recommendation row
        recommended.insert(0, idx)
        recommend_list.append(recommended)

        # Progress report
        print(done, '/', str(total), 'completed,', 'f:'+str(f_based)+',', 'm:'+str(m_based)+',', 't:'+str(t_based)+',', 'p:'+str(p_based)+',', 'total:'+str(len(recommended)-1))

    # Save the recommendations; `with` closes the file even if a write fails
    with open('recommend.txt', 'w') as f:
        for row in recommend_list:
            f.write(' '.join(row))
            f.write('\n')
    print('recommend.txt file saved..')
    print('completed!')

    return recommend_list

## 4. 메인

### Step 1. metadata & read_raw 전처리

In [15]:
# Step 1: normalise read_raw['dt'], then add the 'view' column to metadata.
# Both names are rebound to the returned frames.
read_raw = read_raw_dt_preprocessing(read_raw)
metadata = metadata_view_preprocessing(metadata, read_raw)

preprocessing completed!
preprocessing completed!


### Step 2. target user 전처리

In [16]:
# When recommending for the dev users.
# Each step adds the column the next one reads ('read' -> 'read_magazine';
# 'read_tag' -> 'interest'), so the order of these calls matters.
dev = target_df_generator(dev_users, users)
dev = target_read_article(dev, read_raw, './pickle/dev_read_article')
dev = target_read_magazine(dev, metadata, './pickle/dev_read_magazine')
dev = target_read_tag(dev, metadata, read_raw, 20190222, 20190228, './pickle/dev_read_tag')
dev = target_interest(dev, './pickle/dev_interest')

preprocessing completed!
target_read_article file loaded..
preprocessing completed!
target_read_magazine file loaded..
preprocessing completed!
target_read_tag file loaded..
preprocessing completed!
target_interest file loaded..
preprocessing completed!


In [None]:
# # When recommending for the test users (uncomment to run)
# test = target_df_generator(test_users, users)
# test = target_read_article(test, read_raw, file='./pickle/test_read_article')
# test = target_read_magazine(test, metadata, './pickle/test_read_magazine')
# test = target_read_tag(test, metadata, read_raw, 20190222, 20190228, './pickle/test_read_tag')
# test = target_interest(test, './pickle/test_interest')

### Step 3. target user 추천

In [None]:
# When recommending for the dev users (also writes recommend.txt)
recommend = recommender(dev_users, dev, metadata, read_raw)

In [None]:
# # When recommending for the test users (uncomment to run)
# recommend = recommender(test_users, test, metadata, read_raw)