## Kakao arena 2nd Competition
# "브런치 사용자를 위한 글 추천 대회"
### brunch 데이터를 활용해 사용자의 취향에 맞는 글을 예측하는 대회
* 공식 홈페이지: https://arena.kakao.com/c/2
* 베이스 코드: https://github.com/kakao-arena/brunch-article-recommendation

### BrunchRec v9.0.3
* designed by **datartist**
* 깃헙 주소: https://github.com/jihoo-kim/BrunchRec  

## 1. 라이브러리 및 원본데이터

In [None]:
## 라이브러리 불러오기
import numpy as np
import pandas as pd
import os
import time
import glob
import pickle
import datetime
from tqdm import tqdm
from itertools import chain
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
## 원본데이터 불러오기
def data_loading(directory, mode):
    """Load the target-user list and the raw users / metadata / read logs.

    Args:
        directory: Root data directory (e.g. ``'./res/'``); a trailing slash
            is optional. It is expected to contain ``users.json``,
            ``metadata.json``, ``predict/dev.users`` or ``predict/test.users``
            and a ``read/`` folder with the read-log files.
        mode: ``'dev'`` reads ``predict/dev.users``; any other value reads
            ``predict/test.users``.

    Returns:
        A tuple ``(target_users_list, users, metadata, read_raw)`` where
        ``target_users_list`` is a list of user-id strings, ``users`` and
        ``metadata`` are the DataFrames parsed from the JSON-lines files and
        ``read_raw`` has one row per (user, article) read with the string
        columns ``dt``, ``hr``, ``user_id`` and ``article_id``.
    """
    print('data loading..')

    # Target users: one id per line.
    users_file = 'dev.users' if mode == 'dev' else 'test.users'
    with open(os.path.join(directory, 'predict', users_file)) as f:
        target_users_list = f.read().splitlines()

    # os.path.join keeps the paths valid with or without a trailing slash.
    users = pd.read_json(os.path.join(directory, 'users.json'), lines=True)
    metadata = pd.read_json(os.path.join(directory, 'metadata.json'), lines=True)

    # Read logs: the first 8 characters of the file name are used as the date
    # and the next 2 as the hour; each line is "<user_id> <article_id> ...".
    # Files are visited in sorted order so the row order is reproducible.
    exclude_file_lst = ['read.tar']
    read_df_lst = []
    for path in sorted(glob.glob(os.path.join(directory, 'read', '*'))):
        file_name = os.path.basename(path)
        if file_name in exclude_file_lst:
            print(file_name)
            continue
        df_temp = pd.read_csv(path, header=None, names=['raw'])
        tokens = df_temp['raw'].str.split(' ')
        df_temp['dt'] = file_name[:8]
        df_temp['hr'] = file_name[8:10]
        df_temp['user_id'] = tokens.str[0]
        df_temp['article_id'] = tokens.str[1:].str.join(' ').str.strip()
        read_df_lst.append(df_temp)
    read = pd.concat(read_df_lst, ignore_index=True)

    # Explode the space-separated article list into one row per article.
    # Plain arrays are used so the result gets a fresh, unique index.
    article_lists = read['article_id'].str.split(' ')
    read_cnt_by_user = article_lists.map(len).to_numpy()
    read_raw = pd.DataFrame({
        'dt': np.repeat(read['dt'].to_numpy(), read_cnt_by_user),
        'hr': np.repeat(read['hr'].to_numpy(), read_cnt_by_user),
        'user_id': np.repeat(read['user_id'].to_numpy(), read_cnt_by_user),
        'article_id': list(chain.from_iterable(article_lists)),
    })

    print('data loaded!', '\n')

    return target_users_list, users, metadata, read_raw

## 2. 전처리 관련 함수

In [None]:
## 숫자형식의 날짜를 입력하면 unix 시간으로 변환
def get_unix_time(reg_ts):
    """Convert a YYYYMMDD date into a Unix timestamp in milliseconds.

    Only the first eight characters of ``str(reg_ts)`` are used. The date is
    taken as midnight in the machine's local timezone (``time.mktime``), so
    the returned value depends on the timezone the code runs in.
    """
    digits = str(reg_ts)
    date_text = '-'.join([digits[:4], digits[4:6], digits[6:8]])
    midnight = datetime.datetime.strptime(date_text, '%Y-%m-%d')
    seconds = time.mktime(midnight.timetuple())

    return seconds * 1000

In [None]:
## read_raw 'dt' 전처리 (날짜 컬럼의 자료형 변환 string -> int)
def read_raw_preprocessing(read_raw_df):
    """Make sure the ``dt`` column of the read log holds integers.

    The column is converted in place (and the same frame is returned) unless
    it already has an integer dtype. The check is done on the column dtype
    rather than on the first value, so an empty frame no longer raises.

    Args:
        read_raw_df: Read-log DataFrame with a ``dt`` column of YYYYMMDD
            values stored as strings or integers.

    Returns:
        ``read_raw_df`` with an integer ``dt`` column.
    """
    if pd.api.types.is_integer_dtype(read_raw_df['dt']):
        print('already preprocessed!', '\n')
    else:
        # Assign the raw array so the result does not depend on the index.
        read_raw_df['dt'] = read_raw_df['dt'].astype('int64').to_numpy()
        print('read_raw preprocessing completed!', '\n')

    return read_raw_df

In [None]:
## metadata 'view' 전처리 (일정 기간 동안의 조회수 recent_view와 전체 기간 동안의 조회수 view 저장)
def metadata_preprocessing(metadata_df, read_raw_df, from_dt, to_dt):
    """Attach per-article read counts to the metadata.

    Adds two columns: ``view`` (reads over the whole log) and ``recent_view``
    (reads whose ``dt`` lies in ``[from_dt, to_dt]``, both ends inclusive).
    Articles that never appear in the log get 0. If ``recent_view`` is
    already present the frame is returned untouched.
    """
    if 'recent_view' in metadata_df.columns:
        print('already preprocessed!', '\n')
        return metadata_df

    in_window = (read_raw_df['dt'] >= from_dt) & (read_raw_df['dt'] <= to_dt)
    sources = (('view', read_raw_df), ('recent_view', read_raw_df[in_window]))

    for column, reads in sources:
        # Number of read rows per article id.
        counts = reads.groupby('article_id')['user_id'].count()
        counts_df = pd.DataFrame({'id': counts.index, column: counts.values})
        metadata_df = pd.merge(metadata_df, counts_df, how='left', on='id')
        metadata_df[column] = metadata_df[column].fillna(0)

    print('metadata preprocessing completed!', '\n')
    return metadata_df

In [None]:
## target_df 생성 (users에 없는 user_id 추가)
def target_df_generator(target_users_list, users_df):
    """Build the target-user frame, adding placeholder rows for unknown users.

    Rows of ``users_df`` whose ``id`` is in ``target_users_list`` come first
    (in ``users_df`` order). Every target user missing from ``users_df`` then
    gets one row with empty ``following_list`` and ``keyword_list``, in the
    order of first appearance in ``target_users_list``.

    ``pd.concat`` is used because ``DataFrame.append`` no longer exists in
    pandas 2.x. The returned frame has a fresh 0..n-1 index.

    Args:
        target_users_list: List of target user ids.
        users_df: DataFrame with at least the columns ``id``,
            ``following_list`` and ``keyword_list``.

    Returns:
        A new DataFrame with one row per distinct target user.
    """
    known = users_df[users_df['id'].isin(target_users_list)]

    # A set keeps the membership test O(1) and de-duplicates repeated ids.
    seen = set(known['id'])
    missing = []
    for target_user in target_users_list:
        if target_user not in seen:
            seen.add(target_user)
            missing.append(target_user)

    if not missing:
        return known.reset_index(drop=True)

    # Each placeholder row gets its own list objects.
    placeholders = pd.DataFrame({
        'following_list': [[] for _ in missing],
        'id': missing,
        'keyword_list': [[] for _ in missing],
    })

    return pd.concat([known, placeholders], ignore_index=True, sort=False)

In [None]:
## target_df 'read' 전처리 (전체 기간 동안 target user가 본 article을 저장)
def target_read_article(target_df, read_raw_df, file):
    """Add a ``read`` column: every article each target user has read.

    The per-user lists keep duplicates and follow the row order of
    ``read_raw_df``; only article ids starting with ``'@'`` are kept. The
    result is cached as a pickle at ``file`` and reused when that file
    exists (the cached list is assumed to match the row order of
    ``target_df``).

    The log is filtered once and grouped in a single pass instead of being
    scanned once per target user.

    Args:
        target_df: DataFrame with an ``id`` column; modified in place.
        read_raw_df: Read log with ``user_id`` and ``article_id`` columns.
        file: Path of the pickle cache.

    Returns:
        ``target_df`` with the ``read`` column set.
    """
    if os.path.isfile(file):
        # NOTE: pickle.load executes arbitrary code; only use trusted cache files.
        with open(file, "rb") as fr:
            read_articles = pickle.load(fr)

    else:
        print('read_article generating.. ')
        print('It takes long time.. (Please, download \'read_article\' pickle file)')

        target_ids = target_df['id'].values.tolist()

        # Keep only rows of target users, then only '@'-prefixed article ids.
        reads = read_raw_df[read_raw_df['user_id'].isin(target_ids)]
        reads = reads[reads['article_id'].str.startswith('@')]

        reads_by_user = {user_id: [] for user_id in target_ids}
        for user_id, article_id in zip(reads['user_id'].tolist(), reads['article_id'].tolist()):
            reads_by_user[user_id].append(article_id)

        # Copy so that repeated ids in target_df do not share one list object.
        read_articles = [list(reads_by_user[user_id]) for user_id in target_ids]

        cache_dir = os.path.dirname(file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(file, "wb") as fw:
            pickle.dump(read_articles, fw)

    target_df['read'] = read_articles

    return target_df

In [None]:
## target_df 'recent' 전처리 (일정 기간 동안 target user가 본 article을 저장)
def target_recent_article(target_df, read_raw_df, from_dt, to_dt, file):
    """Add a ``recent`` column: articles each target user read in a date window.

    Only log rows with ``from_dt <= dt <= to_dt`` and an article id starting
    with ``'@'`` are used. The per-user lists keep duplicates and follow the
    row order of ``read_raw_df``. The result is cached as a pickle at
    ``file`` and reused when that file exists (the cached list is assumed to
    match the row order of ``target_df``).

    The log is filtered once and grouped in a single pass instead of being
    scanned once per target user.

    Args:
        target_df: DataFrame with an ``id`` column; modified in place.
        read_raw_df: Read log with ``dt``, ``user_id`` and ``article_id``.
        from_dt: First date of the window (inclusive), comparable to ``dt``.
        to_dt: Last date of the window (inclusive).
        file: Path of the pickle cache.

    Returns:
        ``target_df`` with the ``recent`` column set.
    """
    if os.path.isfile(file):
        # NOTE: pickle.load executes arbitrary code; only use trusted cache files.
        with open(file, "rb") as fr:
            recent_articles = pickle.load(fr)

    else:
        print('recent_article generating.. ')
        print('It takes long time.. (Please, download \'recent_article\' pickle file)')

        target_ids = target_df['id'].values.tolist()

        # Date window first, then target users, then '@'-prefixed article ids.
        reads = read_raw_df[(read_raw_df['dt'] >= from_dt) & (read_raw_df['dt'] <= to_dt)]
        reads = reads[reads['user_id'].isin(target_ids)]
        reads = reads[reads['article_id'].str.startswith('@')]

        reads_by_user = {user_id: [] for user_id in target_ids}
        for user_id, article_id in zip(reads['user_id'].tolist(), reads['article_id'].tolist()):
            reads_by_user[user_id].append(article_id)

        # Copy so that repeated ids in target_df do not share one list object.
        recent_articles = [list(reads_by_user[user_id]) for user_id in target_ids]

        cache_dir = os.path.dirname(file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(file, "wb") as fw:
            pickle.dump(recent_articles, fw)

    target_df['recent'] = recent_articles

    return target_df

In [None]:
# target_df 'following' 전처리 (target user가 본 following의 빈도수 저장)
def target_read_following(target_df, mode, file):
    """Add ``<mode>_following``: how often each user read their followed authors.

    For every target user the articles in column ``mode`` are matched against
    each followed author by the prefix ``author + '_'``. Authors with at
    least one matching article are stored as ``{author: count}``. The
    per-user dicts are cached as a pickle at ``file``.
    """
    column = mode + '_following'

    if os.path.isfile(file):
        with open(file, "rb") as fr:
            cached = pickle.load(fr)
        target_df[column] = cached
        return target_df

    print('read_following generating.. ')
    print('It takes long time.. (Please, download \'read_following\' pickle file)')

    user_ids = target_df['id'].values.tolist()
    per_user_counts = []

    for done, user_id in enumerate(user_ids, start=1):
        row = target_df[target_df['id'] == user_id]
        articles = row[mode].values[0][:]
        author_counts = {}

        if len(articles) > 0:
            # The trailing '_' separates the author id from the article number.
            for author in row['following_list'].values[0]:
                prefix = author + '_'
                hits = sum(1 for article in articles if article.startswith(prefix))
                if hits > 0:
                    author_counts[author] = hits

        per_user_counts.append(author_counts)

        # Progress output.
        print(done, '/', str(len(user_ids)), 'completed')

    with open(file, "wb") as fw:
        pickle.dump(per_user_counts, fw)
    target_df[column] = per_user_counts

    return target_df

In [None]:
# target_df 'magazine' 전처리 (target user가 본 magazine의 빈도수 저장)
def target_read_magazine(target_df, metadata_df, mode, file):
    """Add ``<mode>_magazine``: magazine frequencies of each user's articles.

    For every target user the metadata rows whose ``id`` appears in column
    ``mode`` are selected and their ``magazine_id`` values counted. Each
    metadata row counts once however often the user read it. Magazine id 0
    is dropped. The per-user Counters are cached as a pickle at ``file``.
    """
    column = mode + '_magazine'

    if os.path.isfile(file):
        with open(file, "rb") as fr:
            cached = pickle.load(fr)
        target_df[column] = cached
        return target_df

    print('read_magazine generating.. ')
    print('It takes long time.. (Please, download \'read_magazine\' pickle file)')

    user_ids = target_df['id'].values.tolist()
    per_user_counts = []

    for done, user_id in enumerate(user_ids, start=1):
        articles = target_df[target_df['id'] == user_id][mode].values[0][:]
        was_read = metadata_df['id'].isin(articles)

        magazine_counts = Counter(metadata_df.loc[was_read, 'magazine_id'].tolist())
        # Drop the id 0 entry, if any.
        magazine_counts.pop(0, None)
        per_user_counts.append(magazine_counts)

        # Progress output.
        print(done, '/', str(len(user_ids)), 'completed')

    with open(file, "wb") as fw:
        pickle.dump(per_user_counts, fw)
    target_df[column] = per_user_counts

    return target_df

In [None]:
## target_df 'tag' 전처리 (target user가 본 article의 tag 빈도수 저장)
def target_read_tag(target_df, metadata_df, mode, file):
    """Add ``<mode>_tag``: tag frequencies over each user's articles.

    For every target user the metadata rows whose ``id`` appears in column
    ``mode`` are selected and the entries of their ``keyword_list`` values
    are counted. Each metadata row contributes once however often the user
    read it. The per-user Counters are cached as a pickle at ``file``.
    """
    column = mode + '_tag'

    if os.path.isfile(file):
        with open(file, "rb") as fr:
            cached = pickle.load(fr)
        target_df[column] = cached
        return target_df

    print('read_tag generating.. ')
    print('It takes long time.. (Please, download \'read_tag\' pickle file)')

    user_ids = target_df['id'].values.tolist()
    per_user_counts = []

    for done, user_id in enumerate(user_ids, start=1):
        articles = target_df[target_df['id'] == user_id][mode].values[0][:]
        was_read = metadata_df['id'].isin(articles)

        # Flatten the keyword lists of the matching articles and count them.
        keyword_lists = metadata_df.loc[was_read, 'keyword_list'].tolist()
        per_user_counts.append(Counter(chain.from_iterable(keyword_lists)))

        # Progress output.
        print(done, '/', str(len(user_ids)), 'completed')

    with open(file, "wb") as fw:
        pickle.dump(per_user_counts, fw)
    target_df[column] = per_user_counts

    return target_df

In [None]:
## target_df 'interest' 전처리 (tag에서 빈도수가 높은 상위 top_N개 관심키워드 저장)
def target_read_interest(target_df, top_N, mode, file):
    """Add ``<mode>_interest``: the ``top_N`` most frequent tags per user.

    Reads the tag counts from column ``<mode>_tag``, orders them by count
    (highest first; ties keep the mapping's own order) and stores the first
    ``top_N`` tag names. The per-user lists are cached as a pickle at
    ``file``.
    """
    column = mode + '_interest'

    if os.path.isfile(file):
        with open(file, "rb") as fr:
            cached = pickle.load(fr)
        target_df[column] = cached
        return target_df

    print('read_interest generating.. ')
    print('It takes long time.. (Please, download \'read_interest\' pickle file)')

    user_ids = target_df['id'].values.tolist()
    per_user_interest = []

    for done, user_id in enumerate(user_ids, start=1):
        tag_counts = target_df[target_df['id'] == user_id][mode + '_tag'].values[0]
        ranked = sorted(tag_counts.items(), key=lambda pair: pair[1], reverse=True)
        per_user_interest.append([tag for tag, _ in ranked[:top_N]])

        # Progress output.
        print(done, '/', str(len(user_ids)), 'completed')

    with open(file, "wb") as fw:
        pickle.dump(per_user_interest, fw)
    target_df[column] = per_user_interest

    return target_df

In [None]:
## target_df 'behavior' 전처리 (target user의 글 소비 성향 저장)
def target_read_behavior(target_df, metadata_df, mode, file):
    """Append four consumption-ratio columns for each target user.

    For the article list in column ``mode`` the following ratios (rounded to
    two decimals, 0 for users with no articles) are computed:

    * ``<mode>_f_ratio``: sum of ``<mode>_following`` counts / list length
    * ``<mode>_m_ratio``: sum of ``<mode>_magazine`` counts / list length
    * ``<mode>_p_ratio``: share of entries that are "popular" articles
    * ``<mode>_r_ratio``: share of entries that are "recently registered"

    With ``mode == 'recent'`` popular means ``recent_view`` above its 80%
    quantile and recent means ``reg_ts`` in [2019-02-15, 2019-03-01); for any
    other mode ``view`` and [2018-09-15, 2019-03-01) are used. The ratio list
    is cached as a pickle at ``file`` and reused when that file exists.

    The popular/recent id collections are turned into sets once, so the
    per-user membership tests do not rebuild a lookup table each time.

    Args:
        target_df: Target-user frame; its index is reset in place to
            0..n-1 before the ratio columns are concatenated.
        metadata_df: Article metadata with ``id``, ``reg_ts``, ``view`` and
            ``recent_view`` (only read when the cache is missing).
        mode: Name of the article-list column, ``'read'`` or ``'recent'``.
        file: Path of the pickle cache.

    Returns:
        A new DataFrame: ``target_df`` plus the four ratio columns.
    """
    if os.path.isfile(file):
        # NOTE: pickle.load executes arbitrary code; only use trusted cache files.
        with open(file, "rb") as fr:
            ratio_list = pickle.load(fr)

    else:
        ratio_list = []

        if mode == 'recent':
            view_column, reg_from = 'recent_view', 20190215
        else:
            view_column, reg_from = 'view', 20180915

        views = metadata_df[view_column]
        reg_ts = metadata_df['reg_ts']
        pop_id = set(metadata_df[views > views.quantile(0.80)]['id'])
        reg_id = set(metadata_df[(reg_ts >= get_unix_time(reg_from)) & (reg_ts < get_unix_time(20190301))]['id'])

        user_ids = target_df['id'].values.tolist()

        for iteration, idx in enumerate(user_ids, start=1):
            row = target_df[target_df['id'] == idx]
            read_list = row[mode].values[0]
            n_read = len(read_list)

            f_ratio, m_ratio, p_ratio, r_ratio = 0, 0, 0, 0

            if n_read >= 1:
                fr_dic = row[mode + '_following'].values[0]
                mr_dic = row[mode + '_magazine'].values[0]
                # Duplicated reads count every time they occur.
                n_pop = sum(1 for article in read_list if article in pop_id)
                n_reg = sum(1 for article in read_list if article in reg_id)

                f_ratio = round(sum(fr_dic.values()) / n_read, 2)
                m_ratio = round(sum(mr_dic.values()) / n_read, 2)
                p_ratio = round(n_pop / n_read, 2)
                r_ratio = round(n_reg / n_read, 2)

            ratio_list.append([f_ratio, m_ratio, p_ratio, r_ratio])

            # Progress output.
            print(iteration, '/', str(len(user_ids)), 'completed')

        cache_dir = os.path.dirname(file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(file, "wb") as fw:
            pickle.dump(ratio_list, fw)

    ratio_df = pd.DataFrame(ratio_list, columns=[mode+'_f_ratio', mode+'_m_ratio', mode+'_p_ratio', mode+'_r_ratio'])
    # Align positionally with ratio_df before concatenating side by side.
    target_df.index = list(range(len(target_df)))
    target_df = pd.concat([target_df, ratio_df], axis=1)

    return target_df

In [None]:
# target user에 대한 전처리
def target_users_preprocessing(target_users_list, users_df, metadata_df, read_raw_df, from_dt, to_dt, mode):
    """Run every per-user feature step and return the finished target frame.

    Each step is computed twice: once for the full history (``'read'``) and
    once for the ``[from_dt, to_dt]`` window (``'recent'``). Every step caches
    its result under ``./pickle/``; the window caches carry the window start
    (``str(from_dt)[2:]``) as a suffix. ``mode`` (e.g. ``'dev'`` / ``'test'``)
    prefixes every cache file name.
    """
    prefix = './pickle/' + mode + '_'
    window_tag = '_' + str(from_dt)[2:]

    def read_cache(name):
        return prefix + 'read_' + name

    def recent_cache(name):
        return prefix + 'recent_' + name + window_tag

    frame = target_df_generator(target_users_list, users_df)

    # Articles read over the whole history and inside the window.
    frame = target_read_article(frame, read_raw_df, read_cache('article'))
    frame = target_recent_article(frame, read_raw_df, from_dt, to_dt, recent_cache('article'))

    # Frequency of followed authors among the read articles.
    frame = target_read_following(frame, 'read', read_cache('following'))
    frame = target_read_following(frame, 'recent', recent_cache('following'))

    # Frequency of magazines among the read articles.
    frame = target_read_magazine(frame, metadata_df, 'read', read_cache('magazine'))
    frame = target_read_magazine(frame, metadata_df, 'recent', recent_cache('magazine'))

    # Frequency of tags among the read articles.
    frame = target_read_tag(frame, metadata_df, 'read', read_cache('tag'))
    frame = target_read_tag(frame, metadata_df, 'recent', recent_cache('tag'))

    # The 6 most frequent tags become the user's interests.
    frame = target_read_interest(frame, 6, 'read', read_cache('interest'))
    frame = target_read_interest(frame, 6, 'recent', recent_cache('interest'))

    # Consumption ratios (following / magazine / popular / recent).
    frame = target_read_behavior(frame, metadata_df, 'read', read_cache('behavior'))
    frame = target_read_behavior(frame, metadata_df, 'recent', recent_cache('behavior'))

    print('target_df preprocessing completed!', '\n')

    return frame

In [None]:
def data_preprocessing(directory, mode, from_dt=20190215, to_dt=20190228):
    """Load the raw data and run every preprocessing step.

    Args:
        directory: Data directory passed to ``data_loading``.
        mode: ``'dev'`` or ``'test'``; selects the target-user file and
            prefixes the pickle cache names.
        from_dt: First day (YYYYMMDD integer, inclusive) of the "recent"
            window. Defaults to the value that used to be hard-coded.
        to_dt: Last day (inclusive) of the "recent" window.

    Returns:
        A tuple ``(target_users_list, target_df, metadata, read_raw)``.
    """
    startTime = time.time()

    # The loader already branches on ``mode``; no need to branch here too.
    target_users_list, users, metadata, read_raw = data_loading(directory, mode)

    print('read_raw preprocessing..')
    read_raw = read_raw_preprocessing(read_raw)

    print('metadata preprocessing..')
    metadata = metadata_preprocessing(metadata, read_raw, from_dt, to_dt)

    print('target_df preprocessing..')
    target_df = target_users_preprocessing(target_users_list, users, metadata, read_raw, from_dt, to_dt, mode)

    print('preprocessing completed!')
    endTime = time.time() - startTime
    print(int(endTime), 'seconds', '=', int(endTime/60), 'minutes', '\n')

    return target_users_list, target_df, metadata, read_raw

In [None]:
# item-based CF를 위한 item-user matrix 생성
def item_user_df_generator(metadata_df, read_raw_df, from_dt, to_dt):
    """Build the binary item-user matrix used by item-based CF.

    Users are kept when their number of distinct articles read inside
    ``[from_dt, to_dt]`` is above the mean of that number; articles are kept
    when ``recent_view`` is above its 95% quantile. The matrix itself is
    filled from the whole read log (not only the window): 1 where the user
    read the article, 0 otherwise. Rows are article ids, columns user ids.
    """
    # Distinct (user, article) pairs inside the window.
    in_window = (read_raw_df['dt'] >= from_dt) & (read_raw_df['dt'] <= to_dt)
    window_pairs = read_raw_df.loc[in_window, ['user_id', 'article_id']].drop_duplicates()
    articles_per_user = window_pairs.groupby('user_id')['article_id'].count()

    # Users reading more than the average (the "not cold start" users).
    active_users = articles_per_user[articles_per_user > articles_per_user.mean()].index.tolist()

    # Articles in the top 5% by recent views (the "not long tail" items).
    view_cutoff = metadata_df['recent_view'].quantile(0.95)
    hot_articles = metadata_df.loc[metadata_df['recent_view'] > view_cutoff, 'id'].tolist()

    keep = read_raw_df['user_id'].isin(active_users) & read_raw_df['article_id'].isin(hot_articles)
    pairs = read_raw_df.loc[keep, ['user_id', 'article_id']].drop_duplicates()
    pairs = pairs.assign(read=1)

    matrix = pairs.pivot(index='article_id', columns='user_id', values='read')

    return matrix.fillna(0)

## 3. 추천 관련 함수

In [None]:
# item-based CF 추천
# cf_dic objects already loaded or generated in this process, keyed by pickle
# path. collaborative_filtering is called once per target user, so this avoids
# unpickling the same file on every call.
_CF_DIC_CACHE = {}


def _build_cf_dic(metadata_df, read_raw_df, from_dt, to_dt):
    """Compute ``{user_id: [article_id, ...]}`` with up to 100 CF candidates per user."""
    # Step 1: item-user matrix.
    iu_df = item_user_df_generator(metadata_df, read_raw_df, from_dt, to_dt)
    iu_values = iu_df.values

    # Step 2: item-item cosine similarity.
    cosine_array = cosine_similarity(iu_df, iu_df)

    # Step 3: score every (item, user) cell from the item's nearest neighbours.
    predicted_array = np.zeros(shape=(len(iu_df.index), len(iu_df.columns)))

    for i in range(len(cosine_array)):
        # Take the 101 most similar items and drop the first one, which is
        # presumably the item itself (self-similarity is the maximum).
        top_100 = cosine_array[i].argsort()[-101:][::-1]
        top_100 = np.delete(top_100, 0)

        weighted_sum = np.array([0])
        for top_idx in top_100:
            weighted_sum = weighted_sum + (cosine_array[i][top_idx] * iu_values[top_idx])
        predicted_array[i] = weighted_sum / len(top_100)

    # Cells the user has already read get a large negative offset so they
    # sort to the bottom.
    iu_predicted = iu_values * (-99999) + predicted_array

    # Step 4: keep the 100 best-scored articles per user, best first.
    cf_dic = {}
    for i, user_id in enumerate(iu_df.columns):
        cf_dic[user_id] = iu_df.index[iu_predicted.T[i].argsort()[-100:][::-1]].tolist()

    return cf_dic


def collaborative_filtering(idx, metadata_df, read_raw_df, r_list, recommended, adjusted, from_dt, to_dt, file):
    """Recommend articles for one user with item-based collaborative filtering.

    The candidate table is loaded from the pickle at ``file`` (or generated
    and saved there when the file is missing) and then kept in memory for
    later calls with the same ``file``.

    Args:
        idx: Target user id.
        metadata_df: Article metadata (used only when generating the table).
        read_raw_df: Read log (used only when generating the table).
        r_list: Articles the user has already read.
        recommended: Articles already recommended to the user.
        adjusted: Fraction of the remaining ``100 - len(recommended)`` slots
            this recommender may fill.
        from_dt: Window start passed to ``item_user_df_generator``.
        to_dt: Window end passed to ``item_user_df_generator``.
        file: Path of the pickle cache.

    Returns:
        A list of article ids (empty when the user is not in the table).
    """
    cf_dic = _CF_DIC_CACHE.get(file)

    if cf_dic is None:
        if os.path.isfile(file):
            # NOTE: pickle.load executes arbitrary code; only use trusted cache files.
            with open(file, "rb") as fr:
                cf_dic = pickle.load(fr)

        else:
            print('cf_dic generating.. ')
            print('It takes long time.. (Please, download \'cf_dic\' pickle file)')

            cf_dic = _build_cf_dic(metadata_df, read_raw_df, from_dt, to_dt)

            cache_dir = os.path.dirname(file)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            with open(file, "wb") as fw:
                pickle.dump(cf_dic, fw)

            print('cf_dic generated!', '\n')

        _CF_DIC_CACHE[file] = cf_dic

    if idx not in cf_dic:
        return []

    already = set(r_list + recommended)
    n_rec = int((100-len(recommended)) * adjusted)

    return [article for article in cf_dic[idx] if article not in already][:n_rec]

In [None]:
# 일정 기간 동안 조회수가 높은 인기 글을 추천
def popularity_based_recommend(idx, target_df, metadata_df, r_list, recommended, adjusted):
    """Recommend the most viewed recent articles the user has not seen yet.

    Articles in ``r_list`` or ``recommended`` are skipped; the rest of
    ``metadata_df`` is ordered by ``recent_view`` (highest first) and the
    first ``int((100 - len(recommended)) * adjusted)`` ids are returned.
    ``idx`` and ``target_df`` are accepted but not used.
    """
    seen = r_list + recommended
    quota = int((100-len(recommended)) * adjusted)

    unseen = metadata_df[~metadata_df['id'].isin(seen)]
    ranked = unseen.sort_values(['recent_view'], ascending=[False])

    return ranked[:quota]['id'].tolist()

In [None]:
# 전체 기간 동안 조회수가 높은 인기 글을 추천
def popularity_based_recommend2(target_df, metadata_df, rec_list, all_already):
    """Fill a user's list up to 100 with the most viewed articles overall.

    ``rec_list`` is ``[user_id, article_id, ...]``. Articles the user has
    read (column ``read`` of ``target_df``), articles already in
    ``rec_list`` and everything in ``all_already`` are skipped; the rest of
    ``metadata_df`` is ordered by ``view`` (highest first) and the ids needed
    to reach 100 recommendations are returned.
    """
    user_id, current = rec_list[0], rec_list[1:]
    read_articles = target_df[target_df['id'] == user_id]['read'].values[0][:]

    excluded = read_articles + current + all_already
    missing = 100 - len(current)

    candidates = metadata_df[~metadata_df['id'].isin(excluded)]
    ranked = candidates.sort_values(['view'], ascending=[False])

    return ranked[:missing]['id'].tolist()

In [None]:
# target user가 최근 또는 전체 기간 동안 읽은 글 중에서 구독작가 글의 비율을 고려하여 추천
def following_based_recommend(idx, target_df, metadata_df, r_list, recommended, mode, adjusted, sort_option):
    """Recommend articles by followed authors, weighted by past reading.

    The remaining ``100 - len(recommended)`` slots, scaled by ``adjusted``,
    are split between the followed authors in proportion to how often the
    user read each of them (column ``<mode>_following``). For each author the
    unseen articles with the highest ``sort_option`` value are taken.

    Args:
        idx: Target user id.
        target_df: Target-user frame with ``id`` and ``<mode>_following``.
        metadata_df: Article metadata with ``id``, ``user_id`` (the author)
            and the ``sort_option`` column.
        r_list: Articles the user has already read.
        recommended: Articles already recommended to the user.
        mode: ``'recent'`` or ``'read'``.
        adjusted: Fraction of the remaining slots this recommender may fill.
        sort_option: Metadata column used for descending ordering.

    Returns:
        A list of article ids, most-read authors first.

    Raises:
        ValueError: If ``mode`` is neither ``'recent'`` nor ``'read'``
            (this used to surface as an UnboundLocalError).
    """
    if mode not in ('recent', 'read'):
        raise ValueError("mode must be 'recent' or 'read', got %r" % (mode,))

    already = r_list + recommended
    following_based_recommend_list = []

    # fr_dic -> {author_id: read frequency}
    fr_dic = target_df[target_df['id'] == idx][mode + '_following'].values[0]
    total_frequency = sum(fr_dic.values())

    # Most frequently read authors first.
    for author_id, frequency in sorted(fr_dic.items(), key=lambda x: x[1], reverse=True):
        if frequency > 0:
            # Slots for this author, proportional to the read frequency.
            n_rec = int( (100-len(recommended)) * adjusted * (frequency/total_frequency) )
            fr_article = metadata_df[metadata_df['user_id'] == author_id]
            unseen = fr_article[~fr_article['id'].isin(already)]
            fr_candidate = unseen.sort_values([sort_option], ascending=[False])[:n_rec]['id'].tolist()
            following_based_recommend_list.extend(fr_candidate)

    return following_based_recommend_list

In [None]:
# target user가 구독하는 작가의 글을 추천 (읽지 않아서 구독작가의 글을 추천하지 않는 경우에 대비)
def following_based_recommend2(idx, target_df, metadata_df, r_list, recommended, adjusted, sort_option):
    """Recommend articles from every author the user follows.

    Unlike ``following_based_recommend`` this ignores how often the authors
    were read: any article whose id starts with ``author + '_'`` for a
    followed author qualifies. Unseen candidates are ordered by
    ``sort_option`` (highest first) and the first
    ``int((100 - len(recommended)) * adjusted)`` ids are returned.
    """
    seen = r_list + recommended
    quota = int((100-len(recommended)) * adjusted)

    # Build the id prefixes without touching the stored following_list.
    following = target_df[target_df['id'] == idx]['following_list'].values[0]
    prefixes = tuple(author + '_' for author in following)

    by_followed = metadata_df[metadata_df['id'].str.startswith(prefixes)]
    unseen = by_followed[~by_followed['id'].isin(seen)]
    ranked = unseen.sort_values([sort_option], ascending=[False])

    return ranked[:quota]['id'].tolist()

In [None]:
# target user가 최근 또는 전체 기간 동안 읽은 글 중에서 매거진 글의 비율을 고려하여 추천
def magazine_based_recommend(idx, target_df, metadata_df, r_list, recommended, mode, adjusted, sort_option):
    """Recommend articles from magazines the user has been reading.

    The remaining ``100 - len(recommended)`` slots, scaled by ``adjusted``,
    are split between the magazines in proportion to their frequency in
    column ``<mode>_magazine``. For each magazine the unseen articles with
    the highest ``sort_option`` value are taken.

    Args:
        idx: Target user id.
        target_df: Target-user frame with ``id`` and ``<mode>_magazine``.
        metadata_df: Article metadata with ``id``, ``magazine_id`` and the
            ``sort_option`` column.
        r_list: Articles the user has already read.
        recommended: Articles already recommended to the user.
        mode: ``'recent'`` or ``'read'``.
        adjusted: Fraction of the remaining slots this recommender may fill.
        sort_option: Metadata column used for descending ordering.

    Returns:
        A list of article ids, most-read magazines first.

    Raises:
        ValueError: If ``mode`` is neither ``'recent'`` nor ``'read'``
            (this used to surface as an UnboundLocalError).
    """
    if mode not in ('recent', 'read'):
        raise ValueError("mode must be 'recent' or 'read', got %r" % (mode,))

    already = r_list + recommended
    magazine_based_recommend_list = []

    # mr_dic -> {magazine_id: frequency}
    mr_dic = target_df[target_df['id'] == idx][mode + '_magazine'].values[0]
    total_frequency = sum(mr_dic.values())

    # Most frequently read magazines first.
    for magazine_id, frequency in sorted(mr_dic.items(), key=lambda x: x[1], reverse=True):
        # Slots for this magazine, proportional to its frequency.
        n_rec = int( (100-len(recommended)) * adjusted * (frequency/total_frequency) )
        mr_article = metadata_df[metadata_df['magazine_id'] == magazine_id]
        unseen = mr_article[~mr_article['id'].isin(already)]
        mr_candidate = unseen.sort_values([sort_option], ascending=[False])[:n_rec]['id'].tolist()
        magazine_based_recommend_list.extend(mr_candidate)

    return magazine_based_recommend_list

In [None]:
# target user가 최근 또는 전체 기간 동안 읽은 글들에서 자주 나오는 태그를 고려하여 추천
def tag_based_recommend(idx, target_df, metadata_df, r_list, recommended, mode, common_num, adjusted, sort_option):
    """Recommend articles whose tags overlap with the user's interests.

    An article qualifies when its ``keyword_list`` shares at least
    ``common_num`` distinct tags with the user's ``<mode>_interest`` list.
    Unseen candidates are ordered by ``sort_option`` (highest first) and the
    first ``int((100 - len(recommended)) * adjusted)`` ids are returned.

    Args:
        idx: Target user id.
        target_df: Target-user frame with ``id`` and ``<mode>_interest``.
        metadata_df: Article metadata with ``id``, ``keyword_list`` and the
            ``sort_option`` column.
        r_list: Articles the user has already read.
        recommended: Articles already recommended to the user.
        mode: ``'recent'`` or ``'read'``.
        common_num: Minimum number of shared tags.
        adjusted: Fraction of the remaining slots this recommender may fill.
        sort_option: Metadata column used for descending ordering.

    Returns:
        A list of article ids.

    Raises:
        ValueError: If ``mode`` is neither ``'recent'`` nor ``'read'``
            (this used to surface as an UnboundLocalError).
    """
    if mode not in ('recent', 'read'):
        raise ValueError("mode must be 'recent' or 'read', got %r" % (mode,))

    already = r_list + recommended

    # The interest set is built once instead of once per metadata row.
    user_interest = set(target_df[target_df['id'] == idx][mode + '_interest'].values[0])

    # Ids of the articles sharing at least common_num tags with the user.
    interest_article_id = [
        article_id
        for article_id, keywords in zip(metadata_df['id'].values, metadata_df['keyword_list'].values)
        if len(user_interest.intersection(keywords)) >= common_num
    ]

    n_rec = int((100-len(recommended)) * adjusted)
    t_article = metadata_df[metadata_df['id'].isin(interest_article_id)]
    unseen = t_article[~t_article['id'].isin(already)]
    tag_based_recommend_list = unseen.sort_values([sort_option], ascending=[False])[:n_rec]['id'].tolist()

    return tag_based_recommend_list

In [None]:
def hybrid_recommend(idx, target_df, metadata_all, metadata_reg, metadata_pop, metadata_hot, read_raw_df, r_list, recommended, mode):
    """Chain all recommenders for one user and return the combined list.

    The recommenders run in a fixed order (following x4, popularity, CF,
    magazine x2, tag). Each one sees everything picked so far, so its quota
    is computed from the slots still free. A one-line summary of how many
    articles each stage contributed is printed; ``p2`` in that line is the
    number of slots left for the later popularity fill-up.
    """
    def count(items):
        return str(len(items)).rjust(2)

    # Work on a copy so the caller's list is left untouched.
    picks = list(recommended)

    f1 = following_based_recommend(idx, target_df, metadata_hot, r_list, picks, mode, 1, 'recent_view')
    picks.extend(f1)

    f2 = following_based_recommend(idx, target_df, metadata_reg, r_list, picks, mode, 0.4, 'reg_ts')
    picks.extend(f2)

    f3 = following_based_recommend2(idx, target_df, metadata_hot, r_list, picks, 1, 'recent_view')
    picks.extend(f3)

    f4 = following_based_recommend2(idx, target_df, metadata_reg, r_list, picks, 0.4, 'reg_ts')
    picks.extend(f4)

    p1 = popularity_based_recommend(idx, target_df, metadata_pop, r_list, picks, 0.1)
    picks.extend(p1)

    cf = collaborative_filtering(idx, metadata_all, read_raw_df, r_list, picks, 0.05, 20190215, 20190228, './pickle/cf_dic_190215')
    picks.extend(cf)

    m1 = magazine_based_recommend(idx, target_df, metadata_hot, r_list, picks, mode, 1, 'recent_view')
    picks.extend(m1)

    m2 = magazine_based_recommend(idx, target_df, metadata_all, r_list, picks, mode, 0.8, 'reg_ts')
    picks.extend(m2)

    t = tag_based_recommend(idx, target_df, metadata_hot, r_list, picks, mode, 2, 0.5, 'recent_view')
    picks.extend(t)

    remaining = str(100-len(picks)).rjust(2)
    print('mode:', mode, '\t'*2,
          'f1:' + count(f1) + ',', 'f2:' + count(f2) + ',', 'f3:' + count(f3) + ',', 'f4:' + count(f4), '\t',
          'p1:' + count(p1) + ',', 'cf:' + count(cf) + ',', 'm1:' + count(m1) + ',', 'm2:' + count(m2), '\t',
          't:' + count(t) + ',', 'p2:' + remaining)

    return picks

In [None]:
def recommender(target_list, target_df, metadata_df, read_raw_df, output_file):
    """Produce recommendations for every target user and write them to a file.

    Each user first goes through ``hybrid_recommend``, in ``'recent'`` mode
    when the user's ``recent`` list is longer than the 20th percentile of all
    ``recent`` lengths and in ``'read'`` mode otherwise. Lists that are still
    short of 100 articles are then filled by ``popularity_based_recommend2``,
    which skips every article already recommended to anyone.

    Args:
        target_list: User ids, in output order.
        target_df: Preprocessed target-user frame.
        metadata_df: Article metadata with ``reg_ts``, ``view`` and
            ``recent_view``.
        read_raw_df: Read log (forwarded to the CF recommender).
        output_file: Path of the result file; one line per user,
            ``user_id article_id article_id ...``.

    Returns:
        A list of ``[user_id, article_id, ...]`` lists.
    """
    # Measure the run time.
    startTime = time.time()

    # Final recommendation lists.
    recommend_list = []

    # Every article recommended to at least one user; kept as a set so it is
    # not rebuilt from a list for each user.
    all_already = set()

    reg_ts = metadata_df['reg_ts']
    is_popular = metadata_df['recent_view'] > metadata_df['recent_view'].quantile(0.80)
    before_end = reg_ts < get_unix_time(20190314)

    # metadata_all -> articles registered before 2019-03-14
    metadata_all = metadata_df[before_end]
    # metadata_reg -> articles registered in [2018-09-15, 2019-03-14)
    metadata_reg = metadata_df[(reg_ts >= get_unix_time(20180915)) & before_end]
    # metadata_pop -> articles whose recent_view is above the 80% quantile
    metadata_pop = metadata_df[is_popular]
    # metadata_hot -> popular articles registered on or after 2019-02-22
    metadata_hot = metadata_df[is_popular & (reg_ts >= get_unix_time(20190222))]

    # recent_min -> 20th percentile of the per-user 'recent' list lengths
    n_recent = [len(recent_articles) for recent_articles in target_df['recent'].values]
    recent_min = np.percentile(np.array(n_recent), 20)

    for iteration, idx in enumerate(target_list, start=1):
        user_row = target_df[target_df['id'] == idx]
        r_list = user_row['read'].values[0][:]
        recent = user_row['recent'].values[0][:]

        # Users with little recent activity fall back to their full history.
        mode = 'recent' if len(recent) > recent_min else 'read'

        print('read:'+str(len(r_list))+',', 'recent:'+str(len(recent)))
        recommended = hybrid_recommend(idx, target_df, metadata_all, metadata_reg, metadata_pop, metadata_hot,
                                       read_raw_df, r_list, [], mode)

        all_already.update(recommended)

        # The user id goes in front of the recommendation list.
        recommended.insert(0, idx)
        recommend_list.append(recommended)

        # Progress output.
        print('Total:'+str(len(recommended)-1), '\t'*13, str(iteration).rjust(4), '/', str(len(target_list)), 'completed', '\n')

    # Lists with fewer than 100 articles (length <= 100 including the user
    # id) are filled with overall-popular articles.
    for i in tqdm(range(len(recommend_list))):
        if len(recommend_list[i]) <= 100:
            p2 = popularity_based_recommend2(target_df, metadata_df, recommend_list[i], list(all_already))
            recommend_list[i] = recommend_list[i] + p2
            all_already.update(p2)

    # Save the result; the context manager closes the file even on errors.
    with open(output_file, 'w') as f:
        f.writelines(' '.join(row) + '\n' for row in recommend_list)
    print('recommend.txt file saved..')
    print('completed!')

    endTime = time.time() - startTime
    print(int(endTime), 'seconds', '=', int(endTime/60), 'minutes')

    return recommend_list

## 4. 메인

### 1) dev용

In [None]:
# # step 1. 전처리
# dev_users, dev, metadata, read_raw = data_preprocessing('./res/', 'dev')
# # step 2. 추천
# recommend = recommender(dev_users, dev, metadata, read_raw, './recommend.txt')

### 2) test용

In [None]:
# Step 1: preprocessing (load the raw data and build the features of the test users).
test_users, test, metadata, read_raw = data_preprocessing('./res/', 'test')
# Step 2: recommendation (the result is also written to ./recommend.txt).
recommend = recommender(test_users, test, metadata, read_raw, './recommend.txt')

## 5. 추천 결과 확인

In [None]:
# 추천 결과 확인
def recommend_result(recommend, idx_num, target_df, metadata_df):
    """Print one user's profile and return a preview of their recommendations.

    ``recommend[idx_num]`` is ``[user_id, article_id, ...]``. The printed
    summary shows the user's read/recent counts, interests and stored ratios
    next to the same four ratios measured on the recommended articles (each
    divided by a fixed 100). The return value is the first 30 metadata rows
    of the recommended articles (``title``, ``keyword_list``,
    ``recent_view``, ``display_url``), in metadata order.
    """
    entry = recommend[idx_num]
    user_id, article_id = entry[0], entry[1:]

    metadata_rec = metadata_df[metadata_df['id'].isin(article_id)]
    user_row = target_df[target_df['id'] == user_id]

    # Article-id prefixes of the followed authors.
    prefixes = tuple(author + '_' for author in user_row['following_list'].values[0])

    def share(mask):
        return round(len(metadata_rec[mask]) / 100, 2)

    popular_cutoff = metadata_df['recent_view'].quantile(0.80)
    rec_fr = share(metadata_rec['id'].str.startswith(prefixes))
    rec_mr = share(metadata_rec['magazine_id'] != 0)
    rec_pr = share(metadata_rec['recent_view'] > popular_cutoff)
    rec_rr = share(metadata_rec['reg_ts'] >= get_unix_time(20190222))

    labels = ['fr', 'mr', 'pr', 'rr']
    read_cols = ['read_f_ratio', 'read_m_ratio', 'read_p_ratio', 'read_r_ratio']
    recent_cols = ['recent_f_ratio', 'recent_m_ratio', 'recent_p_ratio', 'recent_r_ratio']

    print('1. user_id:', '\t'*2, user_row['id'].values[0])
    print('2. read_num:', '\t'*2, len(user_row['read'].values[0]))
    print('3. recent_num:', '\t'*2, len(user_row['recent'].values[0]))
    print('4. read_interest:', '\t', user_row['read_interest'].values[0])
    print('5. recent_interest:', '\t', user_row['recent_interest'].values[0])
    print('6. read_ratio:', '\t'*2, labels, '=', user_row[read_cols].values[0].tolist())
    print('7. recent_ratio:', '\t', labels, '=', user_row[recent_cols].values[0].tolist())
    print('8. recommend_ratio', '\t', labels, '=', [rec_fr, rec_mr, rec_pr, rec_rr])

    preview = metadata_rec[['title', 'keyword_list', 'recent_view', 'display_url']]

    return preview[:30]

In [None]:
# Print the selected target user's consumption profile and show up to 30 of the recommended articles.
idx_num = 0   # position in `recommend`: 0-4999 for test (0-2999 for dev)
recommend_result(recommend, idx_num, test, metadata)