## Kakao arena 2nd Competition
# "브런치 사용자를 위한 글 추천 대회"
### brunch 데이터를 활용해 사용자의 취향에 맞는 글을 예측하는 대회
* 공식 홈페이지: https://arena.kakao.com/c/2
* 베이스 코드: https://github.com/kakao-arena/brunch-article-recommendation

### BrunchRec 
* designed by **datartist**
* 깃헙 주소: https://github.com/jihoo-kim/BrunchRec  

## 1. 라이브러리 및 원본데이터

In [1]:
## 라이브러리
import numpy as np
import pandas as pd
import os
import time
import glob
import pickle
import warnings
from itertools import chain
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

# Folder prefix used when loading the JSON data files below.
directory = './res/'
# Suppress every warning for the rest of the run.
warnings.filterwarnings(action='ignore')

In [2]:
## 원본데이터

# The three JSON-lines tables are loaded from `directory`.  os.path.join is
# used for every path so the result is well-formed whether or not `directory`
# ends with a separator (plain concatenation produced './res//users.json').

# users  // DataFrame (310758, 3)
users = pd.read_json(os.path.join(directory, 'users.json'), lines=True)

# magazine  // DataFrame (27967, 2)
magazine = pd.read_json(os.path.join(directory, 'magazine.json'), lines=True)

# metadata  // DataFrame (643104, 9)
metadata = pd.read_json(os.path.join(directory, 'metadata.json'), lines=True)

# Target user id lists: one user id per line.  The files are opened with
# `with` so the handle is released even if reading fails, and the paths are
# derived from `directory` like the other inputs.

# dev.users  // List (3000)
with open(os.path.join(directory, 'predict', 'dev.users')) as f:
    dev_users = f.read().splitlines()

# test.users  // List (5000)
with open(os.path.join(directory, 'predict', 'test.users')) as f:
    test_users = f.read().splitlines()

# read  // DataFrame (3507097, 5)
# Every file under ./res/read/ is parsed into one frame; the first 8
# characters of the file name become 'dt' and the next 2 become 'hr'.
read_file_lst = glob.glob('./res/read/*')
exclude_file_lst = ['read.tar']
read_df_lst = []
for f in read_file_lst:
    file_name = os.path.basename(f)
    if file_name in exclude_file_lst:
        # Excluded files are only reported, never parsed.
        print(file_name)
        continue

    df_temp = pd.read_csv(f, header=None, names=['raw'])
    # Each raw line is "<user_id> <article_id> <article_id> ...".
    tokens = df_temp['raw'].str.split(' ')
    df_temp = df_temp.assign(
        dt=file_name[:8],
        hr=file_name[8:10],
        user_id=tokens.str[0],
        article_id=tokens.str[1:].str.join(' ').str.strip(),
    )
    read_df_lst.append(df_temp)
read = pd.concat(read_df_lst)

# read_raw  // DataFrame (22110706, 4)
def chainer(s):
    """Flatten a Series of space-separated strings into one list of tokens.

    Each element of `s` is split on a single space and the pieces are
    returned in order as a plain Python list.
    """
    return [token for tokens in s.str.split(' ') for token in tokens]

# Number of space-separated article ids held by each row of `read`.
read_cnt_by_user = read['article_id'].str.split(' ').map(len)

# Expand `read` to one row per article: dt / hr / user_id are repeated once
# per article of their row and article_id is the flattened id list.
read_raw_columns = {
    column: np.repeat(read[column], read_cnt_by_user)
    for column in ('dt', 'hr', 'user_id')
}
read_raw_columns['article_id'] = chainer(read['article_id'])
read_raw = pd.DataFrame(read_raw_columns)

## 2. 전처리 관련 함수

In [3]:
## read_raw 'dt' preprocessing (convert the date column from string to int)
def read_raw_dt_preprocessing(read_raw_df):
    """Convert the 'dt' column of `read_raw_df` from strings to integers.

    The frame is modified in place and also returned.  If the first 'dt'
    value is not a string the column is treated as already converted and
    left untouched.  An empty frame is handled without raising (its empty
    'dt' column is simply cast to int64).

    Args:
        read_raw_df: DataFrame with a 'dt' column.

    Returns:
        The same DataFrame object, with 'dt' as integers.
    """
    dt_col = read_raw_df['dt']

    # Only inspect the first value when there is one; indexing values[0]
    # on an empty column would raise IndexError.
    if len(dt_col) > 0 and not isinstance(dt_col.values[0], str):
        print('already preprocessed!')
        return read_raw_df

    read_raw_df['dt'] = dt_col.astype('int64')
    print('preprocessing completed!')
    return read_raw_df

In [4]:
## metadata 'view' preprocessing (store each article's view count over the whole period)
def metadata_view_preprocessing(metadata_df, read_raw_df):
    """Return `metadata_df` with a 'view' column counting reads per article.

    Reads are counted per 'article_id' in `read_raw_df` and left-joined onto
    `metadata_df` by 'id'; articles with no reads get 0.  If a 'view' column
    is already present the frame is returned unchanged.
    """
    if 'view' in metadata_df.columns:
        print('already preprocessed!')
        return metadata_df

    view_counts = read_raw_df.groupby('article_id')['user_id'].count()
    view_df = pd.DataFrame({'id': view_counts.index, 'view': view_counts.values})

    merged = metadata_df.merge(view_df, how='left', on='id')
    merged['view'] = merged['view'].fillna(0)
    print("preprocessing completed!")
    return merged

In [5]:
## Build target_df (adds the user_ids that are missing from users)
def target_df_generator(target_users_list, users_df):
    """Return the rows of `users_df` for the target users, plus placeholders.

    Target users that do not appear in `users_df['id']` are appended (once
    each, in the order they occur in `target_users_list`) with empty
    'following_list' and 'keyword_list'.

    Args:
        target_users_list: iterable of user ids to recommend for.
        users_df: DataFrame with at least an 'id' column.

    Returns:
        DataFrame holding the known target users (in `users_df` order)
        followed by the placeholder rows.
    """
    target_df = users_df[users_df['id'].isin(target_users_list)]

    # Track seen ids in a set so each membership test is O(1) instead of
    # rebuilding a list of all ids for every target user.
    seen_ids = set(target_df['id'])
    missing_ids = []
    for target_user in target_users_list:
        if target_user not in seen_ids:
            seen_ids.add(target_user)
            missing_ids.append(target_user)

    if missing_ids:
        # DataFrame.append no longer exists in pandas 2.x; pd.concat gives
        # the same result.  Each placeholder row keeps index label 0, which
        # is what appending one-row frames produced.
        new_df = pd.DataFrame(
            {
                'following_list': [[] for _ in missing_ids],
                'id': missing_ids,
                'keyword_list': [[] for _ in missing_ids],
            },
            index=[0] * len(missing_ids),
        )
        target_df = pd.concat([target_df, new_df])
    print("preprocessing completed!")

    return target_df

In [6]:
## target_df 'read' preprocessing (store the articles each target user read over the whole period)
def target_read_article(target_df, read_raw_df, file):
    """Add a 'read' column listing every article id each target user read.

    If `file` exists, the per-user lists are loaded from that pickle;
    otherwise they are computed from `read_raw_df` (duplicates kept) and
    pickled to `file`.  `target_df` is modified in place and returned.
    """
    if os.path.isfile(file):
        with open(file, "rb") as fr:
            cached_reads = pickle.load(fr)
        print("target_read_article file loaded..")

        target_df['read'] = cached_reads
        print("preprocessing completed!")
        return target_df

    user_ids = target_df['id'].values.tolist()
    reads_per_user = []

    # Article ids read by each target user over the whole period (duplicates allowed)
    for done, user_id in enumerate(user_ids, start=1):
        user_mask = read_raw_df['user_id'] == user_id
        reads_per_user.append(read_raw_df.loc[user_mask, 'article_id'].values.tolist())

        # Progress report
        print(done, '/', str(len(user_ids)), 'completed')

    with open(file, "wb") as fw:
        pickle.dump(reads_per_user, fw)
    print("target_read_article file saved..")

    target_df['read'] = reads_per_user
    print("preprocessing completed!")

    return target_df

In [7]:
# target_df 'following' preprocessing (how often each target user read each followed author)
def _count_underscore_prefixes(article_ids):
    """Count, for every prefix ending in '_', how many ids start with it."""
    prefix_counts = Counter()
    for article_id in article_ids:
        pos = article_id.find('_')
        while pos != -1:
            prefix_counts[article_id[:pos + 1]] += 1
            pos = article_id.find('_', pos + 1)
    return prefix_counts


def target_read_following(target_df, file):
    """Add a 'read_following' column of {followed author id: read count}.

    For every row, each id in 'following_list' is mapped to the number of
    entries in 'read' that start with "<author id>_".  If `file` exists the
    dicts are loaded from that pickle; otherwise they are computed and
    pickled to `file`.  `target_df` is modified in place and returned.

    The count is done in plain Python rather than through a pandas `.str`
    accessor, so users with an empty 'read' list are handled regardless of
    the dtype pandas infers for an empty Series.
    """
    if os.path.isfile(file):
        with open(file, "rb") as fr:
            cached_following = pickle.load(fr)
        print("target_read_following file loaded..")

        target_df['read_following'] = cached_following
        print("preprocessing completed!")
        return target_df

    total = str(len(target_df['id'].values.tolist()))
    following_per_user = []

    rows = zip(target_df['read'], target_df['following_list'])
    for done, (read_list, following_list) in enumerate(rows, start=1):
        # An id starts with "<author>_" exactly when "<author>_" is one of
        # its prefixes that end at an underscore, so one pass over the read
        # list answers the question for every followed author.
        prefix_counts = _count_underscore_prefixes(read_list)

        following_frequency = {}
        for author_id in following_list:
            following_frequency[author_id] = prefix_counts[author_id + '_']
        following_per_user.append(following_frequency)

        # Progress report
        print(done, '/', total, 'completed')

    with open(file, "wb") as fw:
        pickle.dump(following_per_user, fw)
    print("target_read_following file saved..")

    target_df['read_following'] = following_per_user
    print("preprocessing completed!")

    return target_df

In [8]:
# target_df 'magazine' preprocessing (how often each target user read each magazine)
def target_read_magazine(target_df, metadata_df, file):
    """Add a 'read_magazine' column of Counter({magazine_id: count}).

    For each target user the metadata rows whose 'id' is in the user's
    'read' list are collected and their 'magazine_id' values counted; the
    key 0 is dropped.  If `file` exists the Counters are loaded from that
    pickle; otherwise they are computed and pickled to `file`.
    `target_df` is modified in place and returned.
    """
    if os.path.isfile(file):
        with open(file, "rb") as fr:
            cached_magazines = pickle.load(fr)
        print("target_read_magazine file loaded..")

        target_df['read_magazine'] = cached_magazines
        print("preprocessing completed!")
        return target_df

    user_ids = target_df['id'].values.tolist()
    magazines_per_user = []

    for done, user_id in enumerate(user_ids, start=1):
        # magazine_id of every metadata row the target user has read
        reads = target_df.loc[target_df['id'] == user_id, 'read'].values[0][:]
        was_read = metadata_df['id'].isin(reads)
        magazine_ids = metadata_df.loc[was_read, 'magazine_id'].tolist()

        # Frequency per magazine_id (0, i.e. "not a magazine", is excluded)
        magazine_frequency = Counter(magazine_ids)
        magazine_frequency.pop(0, None)
        magazines_per_user.append(magazine_frequency)

        # Progress report
        print(done, '/', str(len(user_ids)), 'completed')

    with open(file, "wb") as fw:
        pickle.dump(magazines_per_user, fw)
    print("target_read_magazine file saved..")

    target_df['read_magazine'] = magazines_per_user
    print("preprocessing completed!")

    return target_df

In [9]:
## target_df 'tag' preprocessing (tag frequencies of the articles each target user read in a date window)
def target_read_tag(target_df, metadata_df, read_raw_df, from_dt, to_dt, file):
    """Add a 'read_tag' column of Counter({tag: count}) per target user.

    Only reads with from_dt <= 'dt' <= to_dt are considered.  Every read of
    an article found in `metadata_df` contributes that article's
    'keyword_list' once (repeated reads count repeatedly); articles missing
    from the metadata are ignored.  If `file` exists the Counters are loaded
    from that pickle; otherwise they are computed and pickled to `file`.
    `target_df` is modified in place and returned.

    The metadata is turned into an id -> keyword_list table once, instead of
    scanning the whole metadata frame for every single read.
    """
    if os.path.isfile(file):
        with open(file, "rb") as fr:
            cached_tags = pickle.load(fr)
        print("target_read_tag file loaded..")

        target_df['read_tag'] = cached_tags
        print("preprocessing completed!")
        return target_df

    user_ids = target_df['id'].values.tolist()

    # Reads of the target users inside the window (from_dt ~ to_dt)
    in_window = (read_raw_df['dt'] >= from_dt) & (read_raw_df['dt'] <= to_dt)
    partial_read = read_raw_df[in_window & read_raw_df['user_id'].isin(user_ids)]

    # user_id -> article ids read in the window, in row order
    reads_by_user = {}
    for user_id, article_id in zip(partial_read['user_id'], partial_read['article_id']):
        reads_by_user.setdefault(user_id, []).append(article_id)

    # article_id -> keyword_list; setdefault keeps the first metadata row
    # for an id, matching a `.values[0]` lookup.
    needed_rows = metadata_df[metadata_df['id'].isin(set(partial_read['article_id']))]
    tags_by_article = {}
    for article_id, tags in zip(needed_rows['id'], needed_rows['keyword_list']):
        tags_by_article.setdefault(article_id, tags)

    tags_per_user = []
    for done, user_id in enumerate(user_ids, start=1):
        # Updating in read order keeps the Counter's insertion order the
        # same as counting one concatenated tag list.
        frequency = Counter()
        for article_id in reads_by_user.get(user_id, ()):
            frequency.update(tags_by_article.get(article_id, ()))
        tags_per_user.append(frequency)

        # Progress report
        print(done, '/', str(len(user_ids)), 'completed')

    with open(file, "wb") as fw:
        pickle.dump(tags_per_user, fw)
    print("target_read_tag file saved..")

    target_df['read_tag'] = tags_per_user
    print("preprocessing completed!")

    return target_df

In [10]:
## target_df 'interest' preprocessing (store the top_N most frequent keywords of read_tag)
def target_interest(target_df, top_N, file):
    """Add an 'interest' column with each user's `top_N` most frequent tags.

    Tags come from the user's 'read_tag' mapping, ordered by descending
    count (ties keep the mapping's own order).  If `file` exists the lists
    are loaded from that pickle; otherwise they are computed and pickled to
    `file`.  `target_df` is modified in place and returned.
    """
    if os.path.isfile(file):
        with open(file, "rb") as fr:
            cached_interest = pickle.load(fr)
        print("target_interest file loaded..")

        target_df['interest'] = cached_interest
        print("preprocessing completed!")
        return target_df

    user_ids = target_df['id'].values.tolist()
    interest_per_user = []

    # Keep the top_N keywords with the highest frequency in read_tag
    for done, user_id in enumerate(user_ids, start=1):
        tag_counts = target_df.loc[target_df['id'] == user_id, 'read_tag'].values[0]
        ranked = sorted(tag_counts.items(), key=lambda pair: pair[1], reverse=True)
        interest_per_user.append([tag for tag, _ in ranked[:top_N]])

        # Progress report
        print(done, '/', str(len(user_ids)), 'completed')

    with open(file, "wb") as fw:
        pickle.dump(interest_per_user, fw)
    print("target_interest file saved..")

    target_df['interest'] = interest_per_user
    print("preprocessing completed!")

    return target_df

In [11]:
def target_behavior(target_df):
    """Add 'f_ratio' and 'm_ratio' columns describing each user's reading mix.

    For every row:
      * f_ratio = sum of the 'read_following' counts / len('read')
      * m_ratio = sum of the 'read_magazine' counts / len('read')
    both rounded to 2 decimals.  A user with an empty 'read' list gets 0.0
    for both ratios instead of raising ZeroDivisionError.

    `target_df` is modified in place and returned.
    """
    f_ratio_list = []
    m_ratio_list = []

    rows = zip(target_df['read'], target_df['read_following'], target_df['read_magazine'])
    for read_list, fr_dic, mr_dic in rows:
        n_read = len(read_list)

        if n_read == 0:
            # Nothing was read, so no share of it can be attributed.
            f_ratio = 0.0
            m_ratio = 0.0
        else:
            f_ratio = sum(fr_dic.values()) / n_read
            m_ratio = sum(mr_dic.values()) / n_read

        f_ratio_list.append(round(f_ratio, 2))
        m_ratio_list.append(round(m_ratio, 2))

    target_df['f_ratio'] = f_ratio_list
    target_df['m_ratio'] = m_ratio_list
    print("preprocessing completed!")

    return target_df

## 3. 추천 관련 함수

In [12]:
# Build the item-user matrix used by item-based CF
def item_user_df_generator(metadata_df, read_raw_df):
    """Return a 0/1 item-by-user read matrix restricted to active users and popular items.

    Users are kept when their number of distinct read articles is above the
    mean over all users; articles are kept when their 'view' is above the
    0.99 quantile of `metadata_df['view']`.  Rows are article ids, columns
    are user ids, and a cell is 1 when that user read that article (0
    otherwise).
    """
    # Distinct (user, article) pairs -> number of distinct articles per user
    distinct_pairs = read_raw_df[['user_id', 'article_id']].drop_duplicates()
    reads_per_user = distinct_pairs.groupby('user_id')['article_id'].count()

    # Users who read more distinct articles than the average user
    is_active = reads_per_user > reads_per_user.mean()
    not_cold_start_users = reads_per_user[is_active].index.tolist()

    # Articles whose view count is above the 99th percentile
    view_cutoff = metadata_df['view'].quantile(0.99)
    not_long_tail_items = metadata_df.loc[metadata_df['view'] > view_cutoff, 'id'].tolist()

    # Item-user matrix over the selected users and items
    keep = (read_raw_df['user_id'].isin(not_cold_start_users)
            & read_raw_df['article_id'].isin(not_long_tail_items))
    iu = read_raw_df.loc[keep, ['user_id', 'article_id']].drop_duplicates().assign(read=1)

    return iu.pivot(index='article_id', columns='user_id', values='read').fillna(0)

In [13]:
# Item-based CF recommendation
def collaborative_filtering(iu_df, file):
    """Return {user_id: up to 100 article ids} from item-based CF.

    If `file` exists the dict is loaded from that pickle.  Otherwise it is
    computed from the item-user matrix `iu_df` (rows = items, columns =
    users) and pickled to `file`.
    """
    if os.path.isfile(file):
        with open(file, "rb") as fr:
            return pickle.load(fr)

    item_user = iu_df.values

    # Step 1: item-item cosine similarity from the item-user matrix
    similarity = cosine_similarity(iu_df, iu_df)

    # Step 2: predict each item's row as the similarity-weighted mean of its
    # most similar items
    predicted_array = np.zeros(shape=(len(iu_df.index), len(iu_df.columns)))

    for item_pos, item_sims in enumerate(similarity):
        # 11 highest similarities, best first; the best one is dropped
        # (presumably the item itself -- not verified here).
        neighbours = np.delete(item_sims.argsort()[-11:][::-1], 0)

        weighted_sum = np.array([0])
        for neighbour in neighbours:
            weighted_sum = weighted_sum + (item_sims[neighbour] * item_user[neighbour])
        predicted_array[item_pos] = weighted_sum / len(neighbours)

    # Cells the user already read get a large negative offset so they sort last.
    iu_predicted = item_user * (-99999) + predicted_array

    # Step 3: per user, keep the 100 items with the highest predicted score
    scores_by_user = iu_predicted.T
    cf_dic = {}
    for user_pos in range(len(scores_by_user)):
        best_first = scores_by_user[user_pos].argsort()[-100:][::-1]
        cf_dic[iu_df.columns[user_pos]] = iu_df.index[best_first].tolist()

    with open(file, "wb") as fw:
        pickle.dump(cf_dic, fw)

    return cf_dic

In [14]:
# Recommend followed authors' articles in proportion to how much of the user's reading they make up
def following_based_recommend(idx, target_df, metadata_df, r_list, recommended, f_ratio):
    """Return article ids by followed authors the user actually reads.

    Each followed author with a positive read count gets a quota of
    int((100 - len(recommended)) * f_ratio * author_share), where
    author_share is that author's count divided by the sum of all counts in
    the user's 'read_following'.  The quota is filled with the author's
    articles that are in neither `r_list` nor `recommended`, newest
    ('reg_ts') first.  Authors are processed by descending count.
    """
    already = r_list + recommended
    slots_left = 100 - len(recommended)

    # fr_dic -> {followed author id: read count}
    fr_dic = target_df.loc[target_df['id'] == idx, 'read_following'].values[0]

    picks = []
    for author_id, count in sorted(fr_dic.items(), key=lambda pair: pair[1], reverse=True):
        if not count > 0:
            continue

        # The more the author was read, the larger the quota
        n_rec = int(slots_left * f_ratio * (count / sum(fr_dic.values())))

        by_author = metadata_df[metadata_df['user_id'] == author_id]
        unseen = by_author[~by_author['id'].isin(already)]
        newest_first = unseen.sort_values(['reg_ts'], ascending=[False])
        picks.extend(newest_first[:n_rec]['id'].tolist())

    return picks

In [15]:
# Recommend magazine articles in proportion to how often the user read each magazine
def magazine_based_recommend(idx, target_df, metadata_df, r_list, recommended, m_ratio):
    """Return article ids from the magazines the user reads.

    Each magazine in the user's 'read_magazine' gets a quota of
    int((100 - len(recommended)) * magazine_share), where magazine_share is
    its count divided by the sum of all counts.  The quota is filled with
    that magazine's articles that are in neither `r_list` nor
    `recommended`, newest ('reg_ts') first.  Magazines are processed by
    descending count.

    NOTE(review): `m_ratio` is accepted but not used in the quota, unlike
    `f_ratio` in following_based_recommend -- confirm this is intentional.
    """
    already = r_list + recommended
    slots_left = 100 - len(recommended)

    # mr_dic -> {magazine id: read count}
    mr_dic = target_df.loc[target_df['id'] == idx, 'read_magazine'].values[0]

    picks = []
    for magazine_id, count in sorted(mr_dic.items(), key=lambda pair: pair[1], reverse=True):
        # The more the magazine was read, the larger the quota
        n_rec = int(slots_left * (count / sum(mr_dic.values())))

        in_magazine = metadata_df[metadata_df['magazine_id'] == magazine_id]
        unseen = in_magazine[~in_magazine['id'].isin(already)]
        newest_first = unseen.sort_values(['reg_ts'], ascending=[False])
        picks.extend(newest_first[:n_rec]['id'].tolist())

    return picks

In [16]:
# Recommend articles whose tags overlap the tags the user reads most often
def tag_based_recommend(idx, target_df, metadata_df, r_list, recommended):
    """Return the newest articles sharing at least 2 tags with the user's interests.

    Args:
        idx: target user id; its row in `target_df` supplies 'interest'.
        target_df: DataFrame with 'id' and 'interest' columns.
        metadata_df: DataFrame with 'id', 'keyword_list' and 'reg_ts'.
        r_list: article ids the user already read.
        recommended: article ids already recommended to the user.

    Returns:
        Up to 100 - len(recommended) article ids (never a negative count),
        newest ('reg_ts') first, excluding `r_list` and `recommended`.

    Article ids are taken from `metadata_df` itself, row by row, so the
    function works for any frame passed in (filtered or re-indexed) and no
    longer depends on a module-level `metadata` variable.
    """
    already = r_list + recommended

    # user_interest -> the target user's interest keywords
    user_interest = set(target_df[target_df['id'] == idx]['interest'].values[0])

    n_rec = max(0, 100 - len(recommended))
    # Fewer than two interests can never overlap an article in two tags.
    if n_rec == 0 or len(user_interest) < 2:
        return []

    # interest_article_id -> articles sharing 2 or more tags with the interests
    interest_article_id = [
        article_id
        for article_id, keywords in zip(metadata_df['id'], metadata_df['keyword_list'])
        if len(user_interest.intersection(keywords)) >= 2
    ]

    # Drop already read / already recommended articles, newest first, n_rec at most
    t_article = metadata_df[metadata_df['id'].isin(interest_article_id)]
    unseen = t_article[~t_article['id'].isin(already)]
    tag_based_recommend_list = unseen.sort_values(['reg_ts'], ascending=[False])[:n_rec]['id'].tolist()

    return tag_based_recommend_list

In [17]:
# Recommend the most viewed articles of a date window
def popularity_based_recommend(read_raw_df, r_list, recommended, from_dt, to_dt, file,
                               metadata_df=None):
    """Return the most-read articles of a date window the user has not seen.

    The popularity ranking is loaded from the pickle `file` when it exists.
    Otherwise it is built from the reads with from_dt <= 'dt' <= to_dt,
    keeping only article ids that start with '@' and that appear in the
    metadata 'id' column, ordered by descending read count, and then pickled
    to `file`.  The cache is keyed by `file` alone, so callers must use a
    different file for a different window.

    Args:
        read_raw_df: DataFrame with 'dt', 'user_id' and 'article_id'.
        r_list: article ids the user already read.
        recommended: article ids already recommended to the user.
        from_dt, to_dt: inclusive bounds on 'dt'.
        file: path of the popularity pickle.
        metadata_df: DataFrame with an 'id' column used to validate article
            ids.  Defaults to the module-level `metadata` frame, which is
            what this function always used before the parameter existed.

    Returns:
        Up to 100 - len(recommended) article ids (never a negative count).
    """
    already = r_list + recommended

    if os.path.isfile(file):
        with open(file, "rb") as fr:
            popularity = pickle.load(fr)

    else:
        if metadata_df is None:
            metadata_df = metadata

        in_window = (read_raw_df['dt'] >= from_dt) & (read_raw_df['dt'] <= to_dt)
        partial_read = read_raw_df[in_window]
        available = partial_read[partial_read['article_id'].str.startswith('@')]
        view = available.groupby('article_id').count()
        popularity = view.sort_values(['user_id'], ascending=[False])['user_id']
        popularity = popularity[popularity.keys().isin(metadata_df['id'].tolist())]

        # Save the popularity ranking to file
        with open(file, "wb") as fw:
            pickle.dump(popularity, fw)
        print("popularity file saved!")

    n_rec = max(0, 100 - len(recommended))
    candidate = popularity[~popularity.keys().isin(already)]
    popularity_based_recommend_list = candidate[:n_rec].keys().tolist()

    return popularity_based_recommend_list

In [18]:
def recommender(target_list, target_df, metadata_df, read_raw_df, min_view, pop_from, pop_to, pop_file, output_file,
                cf_file='./pickle/cf_dic'):
    """Build the recommendation list of every target user and write it to `output_file`.

    Per user the recommendations are filled in this order: up to 10
    collaborative-filtering articles, followed-author articles, magazine
    articles, tag-based articles, and -- only if fewer than 100 were found --
    popular articles.

    Args:
        target_list: user ids to recommend for; each must have a row in `target_df`.
        target_df: DataFrame with 'id', 'read', 'f_ratio', 'm_ratio' and the
            columns read by the individual recommenders.
        metadata_df: article metadata including a 'view' column.
        read_raw_df: one row per (user, article) read.
        min_view: following/magazine recommendations only use articles with
            'view' greater than this.
        pop_from, pop_to, pop_file: date window and cache file passed to
            popularity_based_recommend.
        output_file: path of the text file to write; one line per user,
            "<user_id> <article_id> <article_id> ...".
        cf_file: pickle used to cache the collaborative-filtering result
            (defaults to the path that used to be hard-coded).

    Returns:
        List of rows, each `[user_id, article_id, ...]`.
    """
    startTime = time.time()

    recommend_list = []

    # metadata_min -> metadata of the articles with more views than min_view
    metadata_min = metadata_df[metadata_df['view'] > min_view]

    iu_df = item_user_df_generator(metadata_df, read_raw_df)
    cf_dic = collaborative_filtering(iu_df, cf_file)

    for iteration, idx in enumerate(target_list, start=1):
        recommended = []

        user_row = target_df[target_df['id'] == idx]
        r_list = user_row['read'].values[0][:]
        f_ratio = user_row['f_ratio'].values[0]
        m_ratio = user_row['m_ratio'].values[0]

        # Collaborative filtering (10 articles)
        cf = []
        if idx in cf_dic:
            cf = cf_dic[idx][:10]
            recommended = recommended + cf

        # Followed-author based
        following_recs = following_based_recommend(idx, target_df, metadata_min, r_list, recommended, f_ratio)
        recommended = recommended + following_recs

        # Magazine based
        magazine_recs = magazine_based_recommend(idx, target_df, metadata_min, r_list, recommended, m_ratio)
        recommended = recommended + magazine_recs

        # Tag based
        tag_recs = tag_based_recommend(idx, target_df, metadata_df, r_list, recommended)
        recommended = recommended + tag_recs

        # Popularity based, only when there are still fewer than 100
        popular_recs = []
        if len(recommended) < 100:
            popular_recs = popularity_based_recommend(read_raw_df, r_list, recommended, pop_from, pop_to, pop_file)
            recommended = recommended + popular_recs

        # The user_id goes first on the output row
        recommend_list.append([idx] + recommended)

        # Progress report
        print(str(iteration).rjust(4), '/', str(len(target_list)), 'completed', '\t',
              'r_list:' + str(len(r_list)).rjust(5), '\t',
              'f_ratio:' + str(int(f_ratio * 100)).rjust(3) + '%,',
              'm_ratio:' + str(int(m_ratio * 100)).rjust(3) + '%', '\t',
              'cf:' + str(len(cf)).rjust(2) + ',',
              'f:' + str(len(following_recs)).rjust(3) + ',',
              'm:' + str(len(magazine_recs)).rjust(3) + ',',
              't:' + str(len(tag_recs)).rjust(3) + ',',
              'p:' + str(len(popular_recs)).rjust(3), '\t',
              'total:' + str(len(recommended)))

    # Write the recommendation rows; `with` closes the file even if a write fails.
    with open(output_file, 'w') as out:
        for row in recommend_list:
            out.write(' '.join(row) + '\n')
    print('recommend.txt file saved..')
    print('completed!')

    endTime = time.time() - startTime
    print(int(endTime), 'seconds', '=', int(endTime/60), 'minutes')

    return recommend_list

## 4. 메인

### Step 1. metadata & read_raw 전처리

In [19]:
# Convert read_raw['dt'] to int, then add the per-article 'view' count to metadata.
read_raw = read_raw_dt_preprocessing(read_raw)
metadata = metadata_view_preprocessing(metadata, read_raw)

preprocessing completed!
preprocessing completed!


### Step 2. target user 전처리

In [20]:
# When recommending for the dev users: build the dev frame and add the
# read / read_following / read_magazine / read_tag / interest / ratio columns.
# Each pickle path is the cache file of the corresponding step.
dev = target_df_generator(dev_users, users)
dev = target_read_article(dev, read_raw, './pickle/dev_read_article')
dev = target_read_following(dev, './pickle/dev_read_following')
dev = target_read_magazine(dev, metadata, './pickle/dev_read_magazine')
dev = target_read_tag(dev, metadata, read_raw, 20190222, 20190228, './pickle/dev_read_tag')
dev = target_interest(dev, 6, './pickle/dev_interest_6')
dev = target_behavior(dev)

preprocessing completed!
target_read_article file loaded..
preprocessing completed!
target_read_following file loaded..
preprocessing completed!
target_read_magazine file loaded..
preprocessing completed!
target_read_tag file loaded..
preprocessing completed!
target_interest file loaded..
preprocessing completed!
preprocessing completed!


In [None]:
# # When recommending for the test users
# test = target_df_generator(test_users, users)
# test = target_read_article(test, read_raw, file='./pickle/test_read_article')
# test = target_read_following(test, './pickle/test_read_following')
# test = target_read_magazine(test, metadata, './pickle/test_read_magazine')
# test = target_read_tag(test, metadata, read_raw, 20190222, 20190228, './pickle/test_read_tag')
# test = target_interest(test, 6, './pickle/test_interest_6')
# test = target_behavior(test)

### Step 3. target user 추천

In [None]:
# When recommending for the dev users (min_view=1, popularity window 20190222-20190228)
recommend = recommender(dev_users, dev, metadata, read_raw, 1, 20190222, 20190228, './pickle/popularity_190222_190228', './recommend.txt')

In [None]:
# # When recommending for the test users
# recommend = recommender(test_users, test, metadata, read_raw, 0, 20190222, 20190228, './pickle/popularity_190222_190228', './recommend.txt')