In [2]:
#baseline
#分析：要求根据用户历史浏览点击新闻文章的数据信息预测用户未来的点击行为， 即用户的最后一次点击的新闻文章
#和之前的分类预测不同点：
#1）预测最后一次点击的新闻文章，也就是我们给用户【推荐】的新闻文章，之前的一般是预测一个数，或预测数据是哪一类；
#2）数据，之前的分类是数据本身有特征，预测它的结果标签，而推荐相关的数据是基于了真实的业务场景，拿到的是用户的点击日志；
#所以我们的目标是，把该预测问题转成一个有监督学习的问题（特征+标签），然后才能进行ML,DL等建模预测。
#那么，如何转？能利用的特征有哪些？有哪些模型可尝试？对数万级别的文章推荐，有哪些策略？
#问题变成了一个点击率预测的问题(用户, 文章) --> 点击的概率(软分类)，建模的大致方向 - 逻辑回归
#大致的解决思路：先转成一个分类问题，分类的标签就是用户是否会点击某篇文章，分类问题的特征中会有用户和文章，
#要训练一个分类模型， 对某用户最后一次点击某篇文章的概率进行预测
#其它问题：如何转成监督学习问题？ 训练集和测试集怎么制作？ 我们又能利用哪些特征？ 我们又可以尝试哪些模型？ 面对36万篇文章， 20多万用户的推荐， 我们又有哪些策略来缩减问题的规模？如何进行最后的预测？

In [1]:
#Baseline - 协同过滤(给用户推荐它买过的其它商品等)
#参考 http://datawhale.club/t/topic/196
# import packages
import time, math, os
from tqdm import tqdm
import gc
import pickle
import random
from datetime import datetime
from operator import itemgetter
import numpy as np
import pandas as pd
import warnings
import collections
from collections import defaultdict
warnings.filterwarnings('ignore')

In [2]:
# Directory prefixes used throughout the notebook. Several cells build file
# names by plain string concatenation, so both values keep a trailing slash.
# data_path: where the click-log CSVs (train_click_log.csv, testA_click_log.csv) are read from.
data_path = './data/'
# save_path: where the pickled similarity matrix and the submission CSV are written.
save_path = './dataRs/'

In [3]:
# 节约内存的一个标配函数
def reduce_mem(df):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their value range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column (object, bool, int8, unsigned, ...) is left
    untouched, as are columns whose min or max is null.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    *inclusive* range contains both the column minimum and maximum.
    Float columns are cast to float16 or float32 when the column range lies
    strictly inside that type's finite range, otherwise float64.

    NOTE: the float downcast is chosen by range only, so it can lose precision
    (float16 keeps roughly three significant decimal digits).

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object, after the casts. A one-line summary of the
        memory saving and elapsed time is printed.
    """
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    # Avoid a ZeroDivisionError if the frame reported no memory at all.
    reduction = 100*(start_mem-end_mem)/start_mem if start_mem else 0.0
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           reduction,
                                                                                                           (time.time()-starttime)/60))
    return df

In [4]:
#读取采样或全量数据
def get_all_click_sample(data_path,sample_nums = 10000):
    """Load the training click log restricted to a random sample of users.

    Reads ``train_click_log.csv`` from ``data_path``, draws ``sample_nums``
    distinct user ids without replacement, keeps every click of those users
    and drops rows duplicated on (user_id, click_article_id, click_timestamp).

    The draw uses NumPy's global random state and is not seeded here, so the
    sample differs between runs unless the caller seeds ``np.random``.

    Args:
        data_path: directory containing ``train_click_log.csv`` (with or
            without a trailing path separator).
        sample_nums: number of users to sample. When it exceeds the number of
            distinct users in the log, all users are kept.

    Returns:
        DataFrame of the sampled users' clicks; the original row index is kept.
    """
    all_click = pd.read_csv(os.path.join(data_path, 'train_click_log.csv'))
    all_user_ids = all_click.user_id.unique()

    # Sampling is over *users*. np.random.choice(replace=False) raises
    # ValueError when asked for more items than exist, so cap the request.
    n_sampled = min(sample_nums, len(all_user_ids))
    sample_user_ids = np.random.choice(all_user_ids, size=n_sampled, replace=False)

    sampled_click = all_click[all_click['user_id'].isin(sample_user_ids)]
    return sampled_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
#返回是一个df表

    

In [5]:
# Smoke test: sample the clicks of 10 random users and preview the first rows.
get_all_click_sample(data_path,10).head()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
11823,195731,159847,1507037504749,4,3,2,1,21,1
11824,195731,156624,1507037534749,4,3,2,1,21,1
17411,193778,363984,1507042261383,4,1,17,1,16,5
17412,193778,220293,1507042291383,4,1,17,1,16,5
113945,162548,159762,1507132671830,4,1,17,1,12,2


In [6]:
#读全部数据：线上线下分读数据
def get_all_click_df(data_path='./data/',offline=True):
    """Load the full click log, optionally merged with the test-A log.

    Args:
        data_path: directory containing ``train_click_log.csv`` and, for
            ``offline=False``, ``testA_click_log.csv`` (with or without a
            trailing path separator).
        offline: when True only the training log is read; when False the
            test-A log is appended after the training rows.

    Returns:
        DataFrame of clicks with rows duplicated on
        (user_id, click_article_id, click_timestamp) removed. Each source
        file's own row index is preserved, so index labels may repeat when
        ``offline=False``.
    """
    all_click = pd.read_csv(os.path.join(data_path, 'train_click_log.csv'))
    if not offline:
        tst_click = pd.read_csv(os.path.join(data_path, 'testA_click_log.csv'))
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        all_click = pd.concat([all_click, tst_click])

    return all_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
        

In [7]:
# Preview the training-only click log (default offline=True).
# The commented lines below are a sample row kept for reference.
get_all_click_df().head()
# user_id  click_article_id  click_timestamp  click_environment  \
# 0   199999            160417    1507029570190                  4   
# click_deviceGroup  click_os  click_country  click_region  \
# 0                  1        17              1            13   
# click_referrer_type  
# 0                    1

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,199999,160417,1507029570190,4,1,17,1,13,1
1,199999,5408,1507029571478,4,1,17,1,13,1
2,199999,50823,1507029601478,4,1,17,1,13,1
3,199998,157770,1507029532200,4,1,17,1,25,5
4,199998,96613,1507029671831,4,1,17,1,25,5


In [8]:
#获取制作 用户-文章-点击时间字典
# 根据点击时间获取用户的点击文章序列   {user1: [(item1, time1), (item2, time2)..]...}
def get_user_item_time(click_df):
    """Build each user's click history ordered by click time.

    Args:
        click_df: click log with at least the columns ``user_id``,
            ``click_article_id`` and ``click_timestamp``.

    Returns:
        dict mapping user_id -> [(article_id, click_timestamp), ...] with the
        pairs in ascending timestamp order and the users in ascending id
        order. An empty log yields an empty dict.
    """
    click_df = click_df.sort_values('click_timestamp')

    # Iterating the groupby directly avoids the tuple column selection
    # (groupby(...)['a', 'b']) that pandas 2.x rejects, and works on an empty
    # frame where the apply/rename route produced no 'item_time_list' column.
    user_item_time_dict = {}
    for user_id, user_clicks in click_df.groupby('user_id'):
        user_item_time_dict[user_id] = list(
            zip(user_clicks['click_article_id'], user_clicks['click_timestamp'])
        )

    return user_item_time_dict
    

In [9]:
# Smoke test on the first five training rows; the expected mapping is kept
# in the comment below for reference.
get_user_item_time(get_all_click_df().head())
# {199998: [(157770, 1507029532200), (96613, 1507029671831)],
#  199999: [(160417, 1507029570190),
#   (5408, 1507029571478),
#   (50823, 1507029601478)]}

{199998: [(157770, 1507029532200), (96613, 1507029671831)],
 199999: [(160417, 1507029570190),
  (5408, 1507029571478),
  (50823, 1507029601478)]}

In [10]:
#获取点击最多的Topk个文章
def get_itme_topk_click(click_df,k):
    """Return the ids of the ``k`` most-clicked articles, most clicked first.

    The misspelled name is kept because the demo cell right after this
    definition calls it; ``get_item_topk_click`` further down has the same body.

    Args:
        click_df: click log with a ``click_article_id`` column.
        k: how many article ids to keep.

    Returns:
        pandas Index of article ids ordered by descending click count.
    """
    # value_counts() orders its index by descending frequency.
    click_counts = click_df['click_article_id'].value_counts()
    ranked_article_ids = click_counts.index
    return ranked_article_ids[:k]

In [12]:
# Smoke test: top-3 article ids among the first five training rows.
get_itme_topk_click(get_all_click_df().head(),3)
# Expected: Int64Index([50823, 96613, 157770], dtype='int64')

Int64Index([50823, 96613, 157770], dtype='int64')

In [36]:
#itemCF的物品(文章)相似度计算
#【物品相似度计算公式用了IUF,
#即认为活跃用户对物品相似度的贡献应该小于不活跃的用户，所以增加一个IUF（Inverse User Frequence）参数来修正物品相似度的计算公式】
#【对数+1倒数做分子惩罚活跃用户，分母用点击自己的与点击其它的相乘再开根。】
def itemcf_sim(df):
    """Compute the ItemCF article-article similarity matrix with an IUF penalty.

    For every user, each ordered pair of distinct articles (i, j) in that
    user's click history adds ``1 / log(1 + history_length)`` to the raw
    weight of (i, j), so heavy users contribute less per pair (Inverse User
    Frequency). The raw weight is then divided by
    ``sqrt(item_cnt[i] * item_cnt[j])``, where ``item_cnt`` is the number of
    click records of an article across all histories.

    The result is also pickled to ``save_path + 'itemcf_i2i_sim.pkl'``.

    Args:
        df: click log accepted by ``get_user_item_time``.

    Returns:
        dict ``{article_i: {article_j: similarity}}``. Every clicked article
        has an entry, possibly an empty one when it never co-occurs with
        another article.
    """
    user_item_time_dict = get_user_item_time(df)

    i2i_sim = {}
    item_cnt = defaultdict(int)
    for _, item_time_list in tqdm(user_item_time_dict.items()):
        # The IUF weight depends only on the history length, so compute it
        # once per user instead of once per article pair.
        user_weight = 1 / math.log(len(item_time_list) + 1)
        for i, _ in item_time_list:
            item_cnt[i] += 1
            related_items = i2i_sim.setdefault(i, {})
            for j, _ in item_time_list:
                if i == j:
                    continue
                related_items[j] = related_items.get(j, 0) + user_weight

    # Build a fresh nested dict. A shallow .copy() would share the inner
    # dicts with i2i_sim and normalise the raw weights in place.
    i2i_sim_ = {
        i: {j: wij / math.sqrt(item_cnt[i] * item_cnt[j]) for j, wij in related_items.items()}
        for i, related_items in i2i_sim.items()
    }

    # Persist the matrix; create the target directory if needed and make
    # sure the file handle is closed.
    sim_file_path = save_path + 'itemcf_i2i_sim.pkl'
    sim_dir = os.path.dirname(sim_file_path)
    if sim_dir:
        os.makedirs(sim_dir, exist_ok=True)
    with open(sim_file_path, 'wb') as sim_file:
        pickle.dump(i2i_sim_, sim_file)

    return i2i_sim_
      
    
    
    

In [37]:
# itemcf_sim(get_all_click_df().head())

100%|████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<?, ?it/s]


{157770: {96613: 0.9102392266268373},
 96613: {157770: 0.9102392266268373},
 160417: {5408: 0.7213475204444817, 50823: 0.7213475204444817},
 5408: {160417: 0.7213475204444817, 50823: 0.7213475204444817},
 50823: {160417: 0.7213475204444817, 5408: 0.7213475204444817}}

In [18]:
# from tqdm import tqdm
# import time
 
# for i in tqdm(range(100)):
#   time.sleep(0.1)
#   pass
#tqdm展示步骤进度条

In [19]:
# item_cnt = defaultdict(int)
# for i, i_click_tim in [(157770, 1507029532200), (96613, 1507029671831)]:
#     item_cnt[i] += 1
# print(item_cnt)
#defaultdict(<class 'int'>, {157770: 1, 96613: 1})  

defaultdict(<class 'int'>, {157770: 1, 96613: 1})


In [39]:
# Full data set: offline=False merges the training log with the test-A log.
all_click_df = get_all_click_df(offline=False)

In [40]:
# Build the article-article similarity matrix from the merged click log
# (itemcf_sim also pickles it under save_path).
i2i_sim = itemcf_sim(all_click_df)

100%|███████████████████████████████████████████████████████████████████████| 250000/250000 [00:23<00:00, 10737.57it/s]


In [54]:
#itemCF 的文章推荐
#基于商品（文章）的招回（排序推荐）i2i
#【思路：排序，选中，不够数的补热点新闻】
def item_based_recommend(user_id,user_item_tiem_dict,i2i_sim,sim_item_topk,recall_item_num,item_topk_click):
    """Recall articles for one user with ItemCF, padding with popular articles.

    For each article in the user's history the ``sim_item_topk`` most similar
    articles are taken, and their similarities are summed per candidate.
    Articles the user has already clicked are never added by this step. If
    fewer than ``recall_item_num`` candidates result, articles from
    ``item_topk_click`` are appended with negative scores (``-100 - position``)
    so they always rank below similarity-based candidates.

    Args:
        user_id: id of the user to recommend for.
        user_item_tiem_dict: ``{user_id: [(article_id, click_time), ...]}``
            (parameter spelling kept for interface compatibility).
        i2i_sim: ``{article_i: {article_j: similarity}}``.
        sim_item_topk: number of nearest neighbours used per history article.
        recall_item_num: number of articles to return.
        item_topk_click: iterable of popular article ids, most popular first.

    Returns:
        List of ``(article_id, score)`` sorted by descending score, at most
        ``recall_item_num`` long.

    Raises:
        KeyError: if ``user_id`` is not in ``user_item_tiem_dict``.
    """
    # Read the history from the argument, not from a notebook-level global.
    user_hist_items = user_item_tiem_dict[user_id]
    clicked_items = {item for item, _ in user_hist_items}

    item_rank = {}
    for i, _ in user_hist_items:
        # .get: an article absent from the similarity matrix simply
        # contributes no neighbours instead of raising KeyError.
        neighbours = sorted(i2i_sim.get(i, {}).items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]
        for j, wij in neighbours:
            if j in clicked_items:
                continue
            item_rank[j] = item_rank.get(j, 0) + wij

    # Pad with popular articles when ItemCF produced too few candidates.
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            # Membership is tested on the keys; testing against .items()
            # never matched and overwrote real scores with the negative filler.
            if item in item_rank:
                continue
            item_rank[item] = -i - 100
            if len(item_rank) == recall_item_num:
                break

    return sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]
    


In [47]:
# 获取近期热点(点击量)文章
def get_item_topk_click(click_df,k):
    """Return the ``k`` most frequently clicked article ids.

    Args:
        click_df: click log with a ``click_article_id`` column.
        k: number of article ids to return.

    Returns:
        pandas Index of article ids, most clicked first.
    """
    article_ids = click_df['click_article_id']
    # value_counts() sorts by descending frequency, so its index is the ranking.
    ranking = article_ids.value_counts().index
    topk_click = ranking[:k]
    return topk_click
    


In [55]:
#== Main recall loop ==
# Recall results per user: user_id -> [(article_id, score), ...].
# The dict default factory is not relied on in the cells shown here; every key
# is assigned explicitly in the loop below.
user_recall_items_dict = collections.defaultdict(dict)
# Click history per user, ordered by click time.
user_item_time_dict = get_user_item_time(all_click_df)
# Reload the similarity matrix written by itemcf_sim. pickle.load can execute
# arbitrary code, so only load files this notebook produced itself.
with open(save_path + 'itemcf_i2i_sim.pkl','rb') as sim_file:
    i2i_sim = pickle.load(sim_file)
# Number of nearest neighbours considered per history article.
sim_item_topk = 10
# Number of articles recalled per user.
recall_item_num = 10
# Popular articles used to pad users with too few candidates.
item_topk_click = get_item_topk_click(all_click_df,k=50)

for user in tqdm(all_click_df['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user,user_item_time_dict,i2i_sim,sim_item_topk,recall_item_num,item_topk_click)

100%|██████████████████████████████████████████████████████████████████████████| 250000/250000 [44:33<00:00, 93.52it/s]


In [46]:
# from collections import defaultdict
# s=[('yellow',1),('blue', 2), ('yellow', 3), ('blue', 4), ('red', 1)]
# d=defaultdict(list)
# print(d)
# for k, v in s:
#     d[k].append(v)
# a=sorted(d.items())
# print(a)

defaultdict(<class 'list'>, {})
[('blue', [2, 4]), ('red', [1]), ('yellow', [1, 3])]


In [56]:
#== Flatten the recall dict user_recall_items_dict into a DataFrame ==
# One [user, article, score] row per recalled article.
user_item_score_list = [
    [user, item, score]
    for user, items in tqdm(user_recall_items_dict.items())
    for item, score in items
]

recall_df = pd.DataFrame(
    user_item_score_list,
    columns=['user_id', 'click_article_id', 'pred_score'],
)

100%|███████████████████████████████████████████████████████████████████████| 250000/250000 [00:03<00:00, 67511.18it/s]


In [57]:
# Preview the flattened recall table (user_id, click_article_id, pred_score).
recall_df.head()

Unnamed: 0,user_id,click_article_id,pred_score
0,199999,276970,0.172377
1,199999,158536,0.106969
2,199999,286321,0.097774
3,199999,108855,0.092462
4,199999,162655,0.091407


In [58]:
# 生成提交文件
#选5篇写入结果提交文件
def submit(recall_df, topk=5, model_name=None, save_dir=None):
    """Write the top-``topk`` recalled articles per user as a submission CSV.

    Rows are ranked per user by descending ``pred_score`` (ties broken by row
    order after sorting), the best ``topk`` are kept and pivoted to one row
    per user with columns ``user_id, article_1, ..., article_<topk>``.

    The file is written to
    ``<save_dir>/<model_name>_<MM-DD>.csv`` using today's date.

    Args:
        recall_df: DataFrame with columns ``user_id``, ``click_article_id``
            and ``pred_score``. It is not modified.
        topk: number of articles per user to write.
        model_name: file-name prefix; ``None`` falls back to ``'submit'``
            (string concatenation with ``None`` used to raise TypeError).
        save_dir: output directory; ``None`` uses the notebook-level
            ``save_path``.

    Raises:
        AssertionError: if any user has fewer than ``topk`` ranked articles
            (or ``recall_df`` is empty).
    """
    if save_dir is None:
        save_dir = save_path
    if model_name is None:
        model_name = 'submit'

    ranked = recall_df.sort_values(by=['user_id', 'pred_score'])
    ranked['rank'] = ranked.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Every user must have at least `topk` ranked candidates.
    per_user_depth = ranked.groupby('user_id')['rank'].max()
    assert per_user_depth.min() >= topk, 'some users have fewer than topk recalled articles'

    top_rows = ranked[ranked['rank'] <= topk]
    wide = top_rows.pivot(index='user_id', columns='rank', values='click_article_id')
    # rank() yields floats (1.0, 2.0, ...); name the columns explicitly for
    # any topk instead of relying on a hard-coded 1..5 rename.
    wide.columns = ['article_{}'.format(int(rank)) for rank in wide.columns]
    wide = wide.reset_index()

    save_name = os.path.join(save_dir, model_name + '_' + datetime.today().strftime('%m-%d') + '.csv')
    wide.to_csv(save_name, index=False, header=True)

In [59]:
# Load the test-A click log and collect its distinct user ids.
tst_click = pd.read_csv(data_path + 'testA_click_log.csv')
tst_users = tst_click['user_id'].unique()

# Keep only the recall rows that belong to test-set users.
tst_recall = recall_df[recall_df['user_id'].isin(tst_users)]

# Write the top-5 articles per test user to the submission CSV.
submit(tst_recall, topk=5, model_name='itemcf_baseline')

In [None]:
#参考：http://datawhale.club/t/topic/196