## Kakao arena 2nd Competition
# "브런치 사용자를 위한 글 추천 대회"
### brunch 데이터를 활용해 사용자의 취향에 맞는 글을 예측하는 대회
* 공식 홈페이지: https://arena.kakao.com/c/2
* 베이스 코드: https://github.com/kakao-arena/brunch-article-recommendation

### BrunchRec 
* designed by **datartist**
* 깃헙 주소: https://github.com/jihoo-kim/BrunchRec  

## 1. 라이브러리 및 원본데이터

In [None]:
## 라이브러리
import numpy as np
import pandas as pd
import os
import time
import glob
import pickle
import datetime
import warnings
from itertools import chain
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

# Root folder of the input data; paths below are built by string concatenation, hence the trailing slash.
directory = './res/'
# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

In [None]:
## 원본데이터

# users  // DataFrame (310758, 3)
users = pd.read_json(directory + 'users.json', lines=True)

# magazine  // DataFrame (27967, 2)
magazine = pd.read_json(directory + 'magazine.json', lines=True)

# metadata  // DataFrame (643104, 9)
metadata = pd.read_json(directory + 'metadata.json', lines=True)

# dev.users  // List (3000)
# `with` guarantees the handle is closed even if reading fails.
with open(directory + 'predict/dev.users') as users_file:
    dev_users = users_file.read().splitlines()

# test.users  // List (5000)
with open(directory + 'predict/test.users') as users_file:
    test_users = users_file.read().splitlines()

# read  // DataFrame (3507097, 5)
# Each file under read/ holds one line per session: "<user_id> <article_id> <article_id> ...".
# The first 8 characters of the file name are used as the date and the next 2 as the hour.
read_file_lst = glob.glob(directory + 'read/*')
exclude_file_lst = ['read.tar']
read_df_lst = []
for read_path in read_file_lst:
    file_name = os.path.basename(read_path)
    if file_name in exclude_file_lst:
        # Skipped on purpose; printed so the skip is visible in the log.
        print(file_name)
        continue

    df_temp = pd.read_csv(read_path, header=None, names=['raw'])
    df_temp['dt'] = file_name[:8]
    df_temp['hr'] = file_name[8:10]
    # Split once and reuse: token 0 is the reader, the rest are the article ids.
    tokens = df_temp['raw'].str.split(' ')
    df_temp['user_id'] = tokens.str[0]
    df_temp['article_id'] = tokens.str[1:].str.join(' ').str.strip()
    read_df_lst.append(df_temp)
read = pd.concat(read_df_lst)

# read_raw  // DataFrame (22110706, 4)
def chainer(s):
    """Split every string in Series *s* on single spaces and return all tokens as one flat list."""
    flattened = []
    for tokens in s.str.split(' '):
        flattened.extend(tokens)
    return flattened

# Number of space-separated article ids listed on each row of `read`.
read_cnt_by_user = read['article_id'].str.split(' ').map(len)

# Flatten `read` to one row per article id: every row's dt / hr / user_id is repeated
# once per article id it lists, and chainer() supplies the article ids in the same order.
read_raw = pd.DataFrame({'dt': np.repeat(read['dt'], read_cnt_by_user),
                         'hr': np.repeat(read['hr'], read_cnt_by_user),
                         'user_id': np.repeat(read['user_id'], read_cnt_by_user),
                         'article_id': chainer(read['article_id'])})

## 2. 전처리 관련 함수

In [None]:
## read_raw 'dt' 전처리 (날짜 컬럼의 자료형 변환 string -> int)
def read_raw_dt_preprocessing(read_raw_df):
    """Convert the 'dt' column of *read_raw_df* from strings to integers, in place.

    The conversion is skipped when the first value is not a string (the column
    was already converted) or when the frame has no rows.

    Args:
        read_raw_df: DataFrame with a 'dt' column holding dates such as '20190215'.

    Returns:
        The same DataFrame object, with 'dt' stored as int64 after conversion.
    """
    dt_column = read_raw_df['dt']

    # Only look at the first value when one exists; an empty frame has nothing to convert.
    needs_conversion = len(dt_column) > 0 and isinstance(dt_column.values[0], str)

    if needs_conversion:
        read_raw_df['dt'] = dt_column.astype('int64')
        print('preprocessing completed!')
    else:
        print('already preprocessed!')

    return read_raw_df

In [None]:
## metadata 'view' 전처리 (일정 기간 동안의 조회수 view 저장)
def metadata_view_preprocessing(metadata_df, read_raw_df, from_dt, to_dt):
    """Attach a 'view' column counting reads of each article between from_dt and to_dt (inclusive).

    Returns *metadata_df* untouched when it already has a 'view' column; otherwise
    returns a new frame left-joined with the counts, with unread articles set to 0.
    """
    if 'view' in metadata_df.keys():
        print('already preprocessed!')
        return metadata_df

    # Rows of the read log that fall inside the requested date window.
    in_window = (read_raw_df['dt'] >= from_dt) & (read_raw_df['dt'] <= to_dt)
    window_reads = read_raw_df[in_window]

    # One count per article id (non-null user_id entries).
    view_counts = window_reads.groupby('article_id')['user_id'].count()
    view_table = pd.DataFrame({'id': view_counts.index, 'view': view_counts.values})

    with_views = pd.merge(metadata_df, view_table, how='left', on='id')
    with_views['view'] = with_views['view'].fillna(0)
    print("preprocessing completed!")

    return with_views

In [None]:
## target_df 생성 (users에 없는 user_id 추가)
def target_df_generator(target_users_list, users_df):
    """Build the per-target-user frame, adding empty rows for users missing from *users_df*.

    Args:
        target_users_list: user ids to generate recommendations for.
        users_df: DataFrame with 'following_list', 'id' and 'keyword_list' columns.

    Returns:
        DataFrame with the rows of *users_df* whose id is a target user, followed by
        one row per unknown target user (in order of first appearance) whose
        'following_list' and 'keyword_list' are empty lists.
    """
    target_df = users_df[users_df['id'].isin(target_users_list)]

    # Set membership keeps this linear; ids are added as they are seen so that a
    # repeated unknown id still produces exactly one row.
    seen_ids = set(target_df['id'])
    missing_ids = []
    for target_user in target_users_list:
        if target_user not in seen_ids:
            seen_ids.add(target_user)
            missing_ids.append(target_user)

    if missing_ids:
        new_df = pd.DataFrame({'following_list': [[] for _ in missing_ids],
                               'id': missing_ids,
                               'keyword_list': [[] for _ in missing_ids]})
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        target_df = pd.concat([target_df, new_df])
    print("preprocessing completed!")

    return target_df

In [None]:
## target_df 'read' 전처리 (전체 기간 동안 target user가 본 article을 저장)
def target_read_article(target_df, read_raw_df, file):
    """Add a 'read' column listing every article each target user has read.

    The per-user lists are cached in *file* (pickle). When the file exists it is
    loaded instead of being recomputed.

    Args:
        target_df: DataFrame with an 'id' column (one row per target user).
        read_raw_df: read log with 'user_id' and 'article_id' columns.
        file: path of the pickle cache.

    Returns:
        *target_df* with the new 'read' column. Each entry is a list of article
        ids starting with '@', in read-log order, duplicates kept.
    """
    if os.path.isfile(file):
        # NOTE: pickle executes arbitrary code on load; only use cache files created by this notebook.
        with open(file, "rb") as fr:
            read_lists = pickle.load(fr)
        print("target_read_article file loaded..")

    else:
        target_ids = target_df['id'].values.tolist()

        # Filter the read log once for all target users instead of rescanning it per user.
        rows = read_raw_df[read_raw_df['user_id'].isin(target_ids)]
        rows = rows[rows['article_id'].str.startswith('@', na=False)]

        articles_by_user = {}
        for user_id, article_id in zip(rows['user_id'], rows['article_id']):
            articles_by_user.setdefault(user_id, []).append(article_id)

        # list(...) gives every row its own list, even if an id appears twice in target_df.
        read_lists = [list(articles_by_user.get(user_id, [])) for user_id in target_ids]

        with open(file, "wb") as fw:
            pickle.dump(read_lists, fw)
        print("target_read_article file saved..")

    target_df['read'] = read_lists
    print("preprocessing completed!")

    return target_df

In [None]:
## target_df 'recent' 전처리 (일정 기간 동안 target user가 본 article을 저장)
def target_recent_article(target_df, read_raw_df, from_dt, to_dt, file):
    """Add a 'recent' column listing the articles each target user read in a date window.

    The per-user lists are cached in *file* (pickle). When the file exists it is
    loaded instead of being recomputed.

    Args:
        target_df: DataFrame with an 'id' column (one row per target user).
        read_raw_df: read log with 'dt', 'user_id' and 'article_id' columns.
        from_dt: first date of the window (inclusive), compared against 'dt'.
        to_dt: last date of the window (inclusive), compared against 'dt'.
        file: path of the pickle cache.

    Returns:
        *target_df* with the new 'recent' column. Each entry is a list of article
        ids starting with '@', in read-log order, duplicates kept.
    """
    if os.path.isfile(file):
        # NOTE: pickle executes arbitrary code on load; only use cache files created by this notebook.
        with open(file, "rb") as fr:
            recent_lists = pickle.load(fr)
        print("target_recent_article file loaded..")

    else:
        target_ids = target_df['id'].values.tolist()

        # Restrict to the window and to target users in one pass over the read log.
        in_window = (read_raw_df['dt'] >= from_dt) & (read_raw_df['dt'] <= to_dt)
        rows = read_raw_df[in_window & read_raw_df['user_id'].isin(target_ids)]
        rows = rows[rows['article_id'].str.startswith('@', na=False)]

        articles_by_user = {}
        for user_id, article_id in zip(rows['user_id'], rows['article_id']):
            articles_by_user.setdefault(user_id, []).append(article_id)

        # list(...) gives every row its own list, even if an id appears twice in target_df.
        recent_lists = [list(articles_by_user.get(user_id, [])) for user_id in target_ids]

        with open(file, "wb") as fw:
            pickle.dump(recent_lists, fw)
        print("target_recent_article file saved..")

    target_df['recent'] = recent_lists
    print("preprocessing completed!")

    return target_df

In [None]:
# target_df 'following' 전처리 (일정 기간 동안 target user가 본 following의 빈도수 저장)
def target_recent_following(target_df, file):
    """Add a 'recent_following' column: how often each followed author appears in 'recent'.

    The per-user dicts are cached in *file* (pickle). When the file exists it is
    loaded instead of being recomputed.

    Args:
        target_df: DataFrame with 'id', 'recent' (list of article ids) and
            'following_list' (list of author ids) columns.
        file: path of the pickle cache.

    Returns:
        *target_df* with the new column. Each entry maps a followed author id to
        the number of recent article ids starting with '<author id>_'; authors
        with no match are omitted.
    """
    if os.path.isfile(file):
        # NOTE: pickle executes arbitrary code on load; only use cache files created by this notebook.
        with open(file, "rb") as fr:
            following_counts = pickle.load(fr)
        print("target_recent_following file loaded..")

    else:
        following_counts = []
        total = str(len(target_df))

        # Walk the two columns row by row instead of re-filtering target_df per user.
        rows = zip(target_df['recent'], target_df['following_list'])
        for iteration, (recent_list, following_list) in enumerate(rows, start=1):
            following_frequency = {}
            for f_id in following_list:
                # The trailing '_' is appended so the match ends at the author/article separator.
                prefix = f_id + '_'
                # Plain-Python counting also works for an empty recent list, where
                # an empty pandas Series may not expose the .str accessor.
                frequency = sum(1 for article_id in recent_list if article_id.startswith(prefix))
                if frequency > 0:
                    following_frequency[f_id] = frequency
            following_counts.append(following_frequency)

            # Progress indicator.
            print(iteration, '/', total, 'completed')

        with open(file, "wb") as fw:
            pickle.dump(following_counts, fw)
        print("target_recent_following file saved..")

    target_df['recent_following'] = following_counts
    print("preprocessing completed!")

    return target_df

In [None]:
# target_df 'magazine' 전처리 (일정 기간 동안 target user가 본 magazine의 빈도수 저장)
def target_recent_magazine(target_df, metadata_df, file):
    """Add a 'recent_magazine' column: magazine_id frequencies over each user's 'recent' articles.

    Each entry is a Counter of the 'magazine_id' values of the metadata rows whose
    'id' is in the user's 'recent' list, with key 0 removed. The list of Counters
    is cached in *file* (pickle) and loaded from there when the file exists.
    """
    if not os.path.isfile(file):
        user_ids = target_df['id'].values.tolist()
        magazine_counters = []

        for done, user_id in enumerate(user_ids, start=1):
            recent_articles = target_df[target_df['id'] == user_id]['recent'].values[0][:]
            matched_rows = metadata_df[metadata_df['id'].isin(recent_articles)]

            # Count magazine ids; 0 is dropped from the result.
            counter = Counter(matched_rows['magazine_id'].tolist())
            counter.pop(0, None)
            magazine_counters.append(counter)

            # Progress indicator.
            print(done, '/', str(len(user_ids)), 'completed')

        with open(file, "wb") as fw:
            pickle.dump(magazine_counters, fw)
        print("target_recent_magazine file saved..")
    else:
        with open(file, "rb") as fr:
            magazine_counters = pickle.load(fr)
        print("target_recent_magazine file loaded..")

    target_df['recent_magazine'] = magazine_counters
    print("preprocessing completed!")

    return target_df

In [None]:
## target_df 'tag' 전처리 (일정 기간 동안 target user가 본 article의 tag 빈도수 저장)
def target_recent_tag(target_df, metadata_df, file):
    """Add a 'recent_tag' column: keyword frequencies over each user's 'recent' articles.

    The per-user Counters are cached in *file* (pickle). When the file exists it
    is loaded instead of being recomputed.

    Args:
        target_df: DataFrame with 'id' and 'recent' (list of article ids) columns.
        metadata_df: DataFrame with 'id' and 'keyword_list' (list of tags) columns.
        file: path of the pickle cache.

    Returns:
        *target_df* with the new column. Each entry is a Counter over the tags of
        the metadata rows whose 'id' is in the user's 'recent' list.
    """
    if os.path.isfile(file):
        # NOTE: pickle executes arbitrary code on load; only use cache files created by this notebook.
        with open(file, "rb") as fr:
            tag_counters = pickle.load(fr)
        print("target_recent_tag file loaded..")

    else:
        tag_counters = []
        total = str(len(target_df))

        for iteration, recent_list in enumerate(target_df['recent'], start=1):
            # Tag lists of the articles this user read in the window.
            keyword_lists = metadata_df[metadata_df['id'].isin(recent_list)]['keyword_list'].tolist()

            # chain.from_iterable flattens in linear time; sum(lists, []) copies the
            # accumulated list on every addition.
            tag_counters.append(Counter(chain.from_iterable(keyword_lists)))

            # Progress indicator.
            print(iteration, '/', total, 'completed')

        with open(file, "wb") as fw:
            pickle.dump(tag_counters, fw)
        print("target_recent_tag file saved..")

    target_df['recent_tag'] = tag_counters
    print("preprocessing completed!")

    return target_df

In [None]:
## target_df 'interest' 전처리 (recent_tag에서 빈도수가 높은 상위 top_N개 관심키워드 저장)
def target_recent_interest(target_df, top_N, file):
    """Add a 'recent_interest' column holding each user's top_N most frequent 'recent_tag' keys.

    Tags are ordered by descending count (ties keep the mapping's own order). The
    list of per-user tag lists is cached in *file* (pickle) and loaded from there
    when the file exists.
    """
    if not os.path.isfile(file):
        user_ids = target_df['id'].values.tolist()
        interest_lists = []

        for done, user_id in enumerate(user_ids, start=1):
            tag_counts = target_df[target_df['id'] == user_id]['recent_tag'].values[0]
            ranked = sorted(tag_counts.items(), key=lambda pair: pair[1], reverse=True)

            # Keep only the tag names of the first top_N (tag, count) pairs.
            interest_lists.append([tag for tag, _ in ranked[:top_N]])

            # Progress indicator.
            print(done, '/', str(len(user_ids)), 'completed')

        with open(file, "wb") as fw:
            pickle.dump(interest_lists, fw)
        print("target_recent_interest file saved..")
    else:
        with open(file, "rb") as fr:
            interest_lists = pickle.load(fr)
        print("target_recent_interest file loaded..")

    target_df['recent_interest'] = interest_lists
    print("preprocessing completed!")

    return target_df

In [None]:
## target_df 'behavior' 전처리 (recent_following에서 f_ratio, recent_magazine에서 m_ratio 저장)
def target_recent_behavior(target_df):
    """Add 'f_ratio' and 'm_ratio': the share of recent reads from followed authors / magazines.

    Args:
        target_df: DataFrame with 'recent' (list), 'recent_following' (mapping of
            counts) and 'recent_magazine' (mapping of counts) columns.

    Returns:
        *target_df* with both ratio columns, each rounded to 2 decimals. A user
        with no recent reads gets 0.0 for both ratios.
    """
    f_ratio_list = []
    m_ratio_list = []

    rows = zip(target_df['recent'], target_df['recent_following'], target_df['recent_magazine'])
    for recent_list, fr_dic, mr_dic in rows:
        n_recent = len(recent_list)

        if n_recent:
            f_ratio = sum(fr_dic.values()) / n_recent
            m_ratio = sum(mr_dic.values()) / n_recent
        else:
            # No recent reads: avoid dividing by zero and report "no signal".
            f_ratio = 0.0
            m_ratio = 0.0

        f_ratio_list.append(round(f_ratio, 2))
        m_ratio_list.append(round(m_ratio, 2))

    target_df['f_ratio'] = f_ratio_list
    target_df['m_ratio'] = m_ratio_list
    print("preprocessing completed!")

    return target_df

## 3. 추천 관련 함수

In [None]:
# item-based CF를 위한 item-user matrix 생성
def item_user_df_generator(metadata_df, read_raw_df, from_dt, to_dt):
    """Build the article x user 0/1 matrix used by item-based collaborative filtering.

    Users are kept when their number of distinct articles read between from_dt and
    to_dt (inclusive) is above the mean. Articles are kept when their 'view' is
    above the 0.95 quantile. The matrix itself is filled from all rows of
    *read_raw_df* for those users and articles.
    """
    # Distinct (user, article) pairs inside the date window, counted per user.
    in_window = (read_raw_df['dt'] >= from_dt) & (read_raw_df['dt'] <= to_dt)
    window_pairs = read_raw_df[in_window][['user_id', 'article_id']].drop_duplicates()
    reads_per_user = window_pairs.groupby('user_id').count()['article_id']

    # Users who read more distinct articles than average.
    active_users = reads_per_user[reads_per_user > reads_per_user.mean()].index.tolist()

    # Articles whose view count is above the 0.95 quantile.
    view_threshold = metadata_df['view'].quantile(0.95)
    popular_items = metadata_df[metadata_df['view'] > view_threshold]['id'].tolist()

    # 1 where the user read the article, 0 elsewhere.
    keep = read_raw_df['user_id'].isin(active_users) & read_raw_df['article_id'].isin(popular_items)
    pairs = read_raw_df[keep][['user_id', 'article_id']].drop_duplicates()
    pairs['read'] = 1
    matrix = pairs.pivot(index='article_id', columns='user_id', values='read')

    return matrix.fillna(0)

In [None]:
# item-based CF 추천
def collaborative_filtering(idx, metadata_df, read_raw_df, r_list, recommended, from_dt, to_dt, file):
    """Return up to 10 item-based CF recommendations for user *idx*.

    A dict mapping each user to their 100 best-scored articles is cached in *file*
    (pickle). It is built on the first call and loaded afterwards. Articles in
    *r_list* or *recommended* are removed from the result; users absent from the
    dict get an empty list.
    """
    if not os.path.isfile(file):
        # Step 1: item-user matrix.
        item_user = item_user_df_generator(metadata_df, read_raw_df, from_dt, to_dt)

        # Step 2: cosine similarity between items.
        similarity = cosine_similarity(item_user, item_user)

        # Step 3: score each item row from its most similar items.
        scores = np.zeros(shape=(len(item_user.index), len(item_user.columns)))

        for item_pos in range(len(similarity)):
            # Take the 101 highest similarities and drop the top one, leaving up to 100 neighbours.
            neighbours = similarity[item_pos].argsort()[-101:][::-1]
            neighbours = np.delete(neighbours, 0)

            accumulated = np.array([0])
            for neighbour in neighbours:
                accumulated = accumulated + (similarity[item_pos][neighbour] * item_user.values[neighbour])
            scores[item_pos] = accumulated / len(neighbours)

        # Articles a user already read get a large negative offset so they rank last.
        masked_scores = item_user.values * (-99999) + scores

        # Step 4: keep the 100 best-scored articles per user.
        cf_dic = {}
        for user_pos in range(len(masked_scores.T)):
            best_items = masked_scores.T[user_pos].argsort()[-100:][::-1]
            cf_dic[item_user.columns[user_pos]] = item_user.index[best_items].tolist()

        with open(file, "wb") as fw:
            pickle.dump(cf_dic, fw)
    else:
        with open(file, "rb") as fr:
            cf_dic = pickle.load(fr)

    if idx not in cf_dic.keys():
        return []

    already = r_list + recommended
    n_rec = 10  # 100-len(recommended)
    unseen = [article_id for article_id in cf_dic[idx] if article_id not in already]

    return unseen[:n_rec]

In [None]:
# target user가 최근에 읽은 글 중에서 구독작가 글의 비율을 고려하여 추천
def following_based_recommend(idx, target_df, metadata_df, r_list, recommended, f_ratio):
    """Recommend the newest unseen articles of the followed authors user *idx* read recently.

    Each author in the user's 'recent_following' mapping receives a quota of
    int((100 - len(recommended)) * f_ratio * author_share), where author_share is
    the author's count divided by the sum of all counts. Authors are processed in
    descending count order.
    """
    already = r_list + recommended

    # {author id: count} for this user, most frequent first.
    fr_dic = target_df[target_df['id'] == idx]['recent_following'].values[0]
    ranked_authors = sorted(fr_dic.items(), key=lambda pair: pair[1], reverse=True)

    picks = []
    for author_id, frequency in ranked_authors:
        if not frequency > 0:
            continue

        # Quota for this author (more frequent authors get more slots).
        n_rec = int((100 - len(recommended)) * f_ratio * (frequency / sum(fr_dic.values())))

        # Newest first, skipping anything already read or already recommended.
        by_author = metadata_df[metadata_df['user_id'] == author_id]
        unseen = by_author[~by_author['id'].isin(already)]
        newest_first = unseen.sort_values(['reg_ts'], ascending=[False])
        picks.extend(newest_first[:n_rec]['id'].tolist())

    return picks

In [None]:
# target user가 최근에 읽은 글 중에서 매거진 글의 비율을 고려하여 추천
def magazine_based_recommend(idx, target_df, metadata_df, r_list, recommended):
    """Recommend the newest unseen articles of the magazines user *idx* read recently.

    Each magazine in the user's 'recent_magazine' mapping receives a quota of
    int((100 - len(recommended)) * magazine_share), where magazine_share is the
    magazine's count divided by the sum of all counts. Magazines are processed in
    descending count order.
    """
    already = r_list + recommended

    # {magazine id: count} for this user, most frequent first.
    mr_dic = target_df[target_df['id'] == idx]['recent_magazine'].values[0]
    ranked_magazines = sorted(mr_dic.items(), key=lambda pair: pair[1], reverse=True)

    picks = []
    for magazine_id, frequency in ranked_magazines:
        # Quota for this magazine (more frequent magazines get more slots).
        n_rec = int((100 - len(recommended)) * (frequency / sum(mr_dic.values())))

        # Newest first, skipping anything already read or already recommended.
        in_magazine = metadata_df[metadata_df['magazine_id'] == magazine_id]
        unseen = in_magazine[~in_magazine['id'].isin(already)]
        newest_first = unseen.sort_values(['reg_ts'], ascending=[False])
        picks.extend(newest_first[:n_rec]['id'].tolist())

    return picks

In [None]:
# target user가 최근에 읽은 글들에서 자주 나오는 태그를 고려하여 추천
def tag_based_recommend(idx, target_df, metadata_df, r_list, recommended):
    """Recommend the newest unseen articles sharing at least 2 tags with the user's interests.

    Args:
        idx: target user id.
        target_df: DataFrame with 'id' and 'recent_interest' (list of tags) columns.
        metadata_df: DataFrame with 'id', 'keyword_list' and 'reg_ts' columns.
        r_list: article ids the user has already read.
        recommended: article ids already recommended to the user.

    Returns:
        Up to 100 - len(recommended) article ids, newest 'reg_ts' first. Empty
        when no slot is left or the user has fewer than 2 interest tags.
    """
    already = r_list + recommended

    # Clamp at 0 so a negative value is never used as a slice bound.
    n_rec = max(0, 100 - len(recommended))

    # Built once: the original rebuilt this set for every metadata row.
    user_interest = set(target_df[target_df['id'] == idx]['recent_interest'].values[0])

    # An overlap of 2 tags is impossible with fewer than 2 interests, so skip the scan.
    if n_rec == 0 or len(user_interest) < 2:
        return []

    # Articles whose tags overlap the user's interests in at least 2 tags.
    article_ids = metadata_df['id'].values
    keyword_lists = metadata_df['keyword_list'].values
    interest_article_id = [
        article_id
        for article_id, keywords in zip(article_ids, keyword_lists)
        if len(user_interest.intersection(keywords)) >= 2
    ]

    # Newest first, skipping anything already read or already recommended.
    t_article = metadata_df[metadata_df['id'].isin(interest_article_id)]
    unseen = t_article[~t_article['id'].isin(already)]
    tag_based_recommend_list = unseen.sort_values(['reg_ts'], ascending=[False])[:n_rec]['id'].tolist()

    return tag_based_recommend_list

In [None]:
# 일정 기간 동안 조회수가 높은 인기 글을 추천
def popularity_based_recommend(idx, target_df, metadata_df, read_raw_df, r_list, recommended):
    """Fill the remaining slots with popular articles, preferring the user's interests.

    Candidates are unseen articles whose 'view' is above the mean of *metadata_df*.
    Candidates sharing at least one tag with the user's interests come first
    (newest 'reg_ts' first). Any slots still free are filled with the remaining
    candidates in descending 'view' order.

    Args:
        idx: target user id.
        target_df: DataFrame with 'id' and 'recent_interest' (list of tags) columns.
        metadata_df: DataFrame with 'id', 'keyword_list', 'reg_ts' and 'view' columns.
        read_raw_df: unused; kept so existing callers keep working.
        r_list: article ids the user has already read.
        recommended: article ids already recommended to the user.

    Returns:
        Up to 100 - len(recommended) article ids.
    """
    already = r_list + recommended

    # Clamp at 0 so a negative value is never used as a slice bound.
    n_rec = max(0, 100 - len(recommended))

    # Unseen articles with an above-average view count.
    is_unseen = ~metadata_df['id'].isin(already)
    is_popular = metadata_df['view'] > metadata_df['view'].mean()
    candidate = metadata_df[is_unseen & is_popular]

    # Built once: the original rebuilt this set for every candidate row.
    user_interest = set(target_df[target_df['id'] == idx]['recent_interest'].values[0])

    # Candidates sharing at least one tag with the user's interests.
    interest_popularity_id = [
        article_id
        for article_id, keywords in zip(candidate['id'].values, candidate['keyword_list'].values)
        if user_interest.intersection(keywords)
    ]

    # Interest matches first, newest first.
    p_article = candidate[candidate['id'].isin(interest_popularity_id)].sort_values(['reg_ts'], ascending=[False])
    p_list = p_article[:n_rec]['id'].tolist()

    if len(p_list) < n_rec:
        # Fill the rest by popularity. The original took the rows in metadata order,
        # which did not match the stated "highest popularity first" intent.
        remaining = candidate[~candidate['id'].isin(p_list)]
        remaining = remaining.sort_values('view', ascending=False, kind='mergesort')
        p_list = p_list + remaining[:(n_rec - len(p_list))]['id'].tolist()

    return p_list

In [None]:
def get_unix_time(reg_ts):
    """Return the local-time midnight of a YYYYMMDD date as a Unix timestamp in milliseconds."""
    digits = str(reg_ts)
    # Rebuild the date as 'YYYY-MM-DD 00:00:00' from the first 8 characters.
    string_time = '-'.join([digits[:4], digits[4:6], digits[6:8]]) + ' 00:00:00'
    midnight = datetime.datetime.strptime(string_time, '%Y-%m-%d %H:%M:%S')
    seconds = time.mktime(midnight.timetuple())

    return seconds * 1000

In [None]:
def recommender(target_list, target_df, metadata_df, read_raw_df, output_file,
                recent_from_dt=20181214, cutoff_dt=20190314):
    """Build up to 100 recommendations per target user and write them to *output_file*.

    For each user the following/magazine/tag based recommenders are run in that
    order, each seeing what the previous ones picked. If fewer than 100 articles
    were found, the popularity recommender fills the rest.

    Args:
        target_list: user ids to recommend for; each must have a row in *target_df*.
        target_df: per-user frame with 'id', 'read', 'recent', 'f_ratio' and
            'm_ratio' columns plus the columns the individual recommenders read.
        metadata_df: article metadata with a 'reg_ts' column (compared against
            get_unix_time values) and the columns the individual recommenders read.
        read_raw_df: read log, passed through to popularity_based_recommend.
        output_file: path of the text file to write.
        recent_from_dt: YYYYMMDD start (inclusive) of the publication window used
            for tag-based recommendation. Defaults to the previously hard-coded value.
        cutoff_dt: YYYYMMDD date; articles registered on or after it are never
            recommended. Defaults to the previously hard-coded value.

    Returns:
        A list with one list per user: the user id followed by the recommended
        article ids. The same rows are written to *output_file*, space-separated,
        one user per line.
    """
    # Measure the total running time.
    startTime = time.time()

    recommend_list = []

    # metadata_max -> everything registered before the cutoff.
    # metadata_min -> the subset registered on or after recent_from_dt.
    metadata_max = metadata_df[metadata_df['reg_ts'] < get_unix_time(cutoff_dt)]
    metadata_min = metadata_max[metadata_max['reg_ts'] >= get_unix_time(recent_from_dt)]

    total = str(len(target_list))

    for iteration, idx in enumerate(target_list, start=1):
        recommended = []

        # Look the user's row up once and read every field from it.
        user_row = target_df[target_df['id'] == idx]
        r_list = user_row['read'].values[0][:]
        recent = user_row['recent'].values[0][:]
        f_ratio = user_row['f_ratio'].values[0]
        m_ratio = user_row['m_ratio'].values[0]

        # following_based recommendation
        f_rec = following_based_recommend(idx, target_df, metadata_max, r_list, recommended, f_ratio)
        recommended = recommended + f_rec

        # magazine_based recommendation
        m_rec = magazine_based_recommend(idx, target_df, metadata_max, r_list, recommended)
        recommended = recommended + m_rec

        # tag_based recommendation
        t_rec = tag_based_recommend(idx, target_df, metadata_min, r_list, recommended)
        recommended = recommended + t_rec

        # popularity_based recommendation, only when fewer than 100 were found
        p_rec = []
        if len(recommended) < 100:
            p_rec = popularity_based_recommend(idx, target_df, metadata_max, read_raw_df, r_list, recommended)
            recommended = recommended + p_rec

        # The output format expects the user id as the first token of the line.
        recommended.insert(0, idx)
        recommend_list.append(recommended)

        # Progress indicator.
        print(str(iteration).rjust(4), '/', total, 'completed', '\t',
              'read:' + str(len(r_list)).rjust(5) + ',', 'recent:' + str(len(recent)).rjust(5), '\t',
              'fr:' + str(int(f_ratio * 100)).rjust(3) + '%,', 'mr:' + str(int(m_ratio * 100)).rjust(3) + '%', '\t',
              'f:' + str(len(f_rec)).rjust(3) + ',', 'm:' + str(len(m_rec)).rjust(3) + ',',
              't:' + str(len(t_rec)).rjust(3) + ',', 'p:' + str(len(p_rec)).rjust(3), '\t',
              'total:' + str(len(recommended) - 1))

    # Write the recommendations; `with` closes the file even if a write fails.
    with open(output_file, 'w') as fw:
        for row in recommend_list:
            fw.write(' '.join(row))
            fw.write('\n')
    print('recommend.txt file saved..')
    print('completed!')

    endTime = time.time() - startTime
    print(int(endTime), 'seconds', '=', int(endTime/60), 'minutes')

    return recommend_list

## 4. 메인

### Step 1. metadata & read_raw 전처리

In [None]:
# Convert read_raw['dt'] to integers so it can be compared with integer dates.
read_raw = read_raw_dt_preprocessing(read_raw)
# Attach per-article view counts for 20190215..20190228 (inclusive) to metadata.
metadata = metadata_view_preprocessing(metadata, read_raw, 20190215, 20190228)

### Step 2. target user 전처리

In [None]:
# # dev에 대해 추천할 경우
# dev = target_df_generator(dev_users, users)
# dev = target_read_article(dev, read_raw, './pickle/dev_read_article')
# dev = target_recent_article(dev, read_raw, 20190215, 20190228, './pickle/dev_recent_article_190215')
# dev = target_recent_following(dev, './pickle/dev_recent_following_190215')
# dev = target_recent_magazine(dev, metadata, './pickle/dev_recent_magazine_190215')
# dev = target_recent_tag(dev, metadata, './pickle/dev_recent_tag_190215')
# dev = target_recent_interest(dev, 6, './pickle/dev_recent_interest_190215')
# dev = target_recent_behavior(dev)

In [None]:
# Recommend for the test users.
# Each step adds one or two columns to `test`; all steps except the first and
# last cache their result in the given pickle file.
test = target_df_generator(test_users, users)
test = target_read_article(test, read_raw, './pickle/test_read_article')
test = target_recent_article(test, read_raw, 20190215, 20190228, './pickle/test_recent_article_190215')
test = target_recent_following(test, './pickle/test_recent_following_190215')
test = target_recent_magazine(test, metadata, './pickle/test_recent_magazine_190215')
test = target_recent_tag(test, metadata, './pickle/test_recent_tag_190215')
test = target_recent_interest(test, 6, './pickle/test_recent_interest_190215')
test = target_recent_behavior(test)

### Step 3. target user 추천

In [None]:
# # dev에 대해 추천할 경우
# recommend = recommender(dev_users, dev, metadata, read_raw, './recommend.txt')

In [None]:
# Recommend for the test users and write the result to ./recommend.txt.
recommend = recommender(test_users, test, metadata, read_raw, './recommend.txt')