# 新闻推荐赛题
数据包含:30万用户, 36万文章, 300万次点击.
其中20万用户的数据作为训练集, 5万作为测试集A, 5万作为测试集B
## 数据表
train_click_log.csv：训练集用户点击日志

testA_click_log.csv：测试集用户点击日志

articles.csv：新闻文章信息数据表

articles_emb.csv：新闻文章embedding向量表示

sample_submit.csv：提交样例文件

## 字段表
Field	Description

user_id	用户id

click_article_id	点击文章id

click_timestamp	点击时间戳

click_environment	点击环境

click_deviceGroup	点击设备组

click_os	点击操作系统

click_country	点击国家

click_region	点击地区

click_referrer_type	点击来源类型

article_id	文章id，与click_article_id相对应

category_id	文章类型id

created_at_ts	文章创建时间戳

words_count	文章字数

emb_1,emb_2,…,emb_249	文章embedding向量表示

## 结果提交
与sample_submit.csv一致,格式如下:
```
user_id,article_1,article_2,article_3,article_4,article_5
```
表示的是预测用户点击新闻文章的Top5

## 评分方式
$$
MRR = score(user) = \sum^5_{k=1} \frac{s(user,k)}{k}
$$
其中, s(user,k)=1 当且仅当预测的第k篇文章恰好是该用户最后一次点击的文章, 否则 s(user,k)=0.

In [1]:
# Input files for the news-recommendation task, relative to the working directory.
train_path = './data/train_click_log.csv'        # click log of the training users
test_path = './data/testA_click_log.csv'         # click log of the test-set-A users
article_path = './data/articles.csv'             # article information table
article_emb_path = './data/articles_emb.csv'     # article embedding vectors

In [2]:
import pandas as pd
import numpy as np
from time import time
# Show up to 100 rows/columns when a DataFrame is displayed. The fully
# qualified option names are used because the bare 'max_rows'/'max_columns'
# patterns can match more than one registered option in recent pandas
# versions, which makes set_option raise OptionError.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [3]:
# Load the two click logs and the article table.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
article = pd.read_csv(article_path)
# The test clicks are merged into the training clicks so that test users also
# have a click history to recall from. DataFrame.append was removed in
# pandas 2.0; pd.concat is the supported replacement and, like append, keeps
# the original row labels of both frames.
train = pd.concat([train, test])

In [4]:
# Show the column labels of the combined click log and of the test click log.
print(train.columns)
print(test.columns)

Index(['user_id', 'click_article_id', 'click_timestamp', 'click_environment',
       'click_deviceGroup', 'click_os', 'click_country', 'click_region',
       'click_referrer_type'],
      dtype='object')
Index(['user_id', 'click_article_id', 'click_timestamp', 'click_environment',
       'click_deviceGroup', 'click_os', 'click_country', 'click_region',
       'click_referrer_type'],
      dtype='object')


In [5]:
# Order the test clicks by user id, then preview the first 20 rows.
test = test.sort_values(by='user_id')
test.head(20)

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
138222,200000,195839,1507030363999,4,1,17,1,17,1
138223,200000,191971,1507030393999,4,1,17,1,17,1
378656,200000,194300,1507651461280,4,1,17,1,17,1
138221,200001,175040,1507029536442,4,3,2,1,18,7
138219,200002,297906,1507029946064,4,1,17,1,8,1
138220,200002,298310,1507029976064,4,1,17,1,8,1
138218,200002,300473,1507029771255,4,1,17,1,8,1
190153,200002,159762,1507134895237,4,1,17,1,8,2
251049,200002,70335,1507291678460,4,1,17,1,8,1
251050,200002,207714,1507292884188,4,1,17,1,8,1


In [6]:
# NOTE: the original author had not fully understood this helper yet and does
# not call it anywhere in this notebook.
# Standard memory-saving helper: downcast numeric columns to narrower dtypes.
def reduce_mem(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns are converted to the smallest of int8/int16/int32/int64
    whose range contains the column's minimum and maximum. Float columns are
    converted to float16 or float32 when their minimum and maximum lie within
    that type's finite range, otherwise to float64. Columns whose minimum or
    maximum is null, and columns of any other dtype, are left untouched.

    The float downcast only checks the value range, so it can lose precision.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.

    Returns:
        The same ``df`` object after downcasting. A one-line summary with the
        new memory usage, the relative reduction and the elapsed time is
        printed as a side effect.
    """
    # Imported locally: the module-level name ``time`` may be bound to the
    # ``time.time`` function (``from time import time``) instead of the
    # module, in which case ``time.time()`` would raise AttributeError.
    import time as _time

    started = _time.time()
    numerics = {'int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = str(df[col].dtype)
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue

        if col_type.startswith('int'):
            # Bounds are inclusive: a value equal to a type's limit still fits.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Neither narrower float type covers the range.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    # Avoid dividing by zero when the frame reported no memory usage at all.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(
        end_mem, reduction, (_time.time() - started) / 60))
    return df

## 第一步: 对数据集进行处理
**这一步的目的是得到每一个用户的新闻点击序列,这个序列使用一个字典来进行存储. e.g {user_id1:[(item_id1, timestamp), (item_id2, timestamp), ...], ...}**

- 先按照时间戳顺序进行排序,然后制造一个it列来存储每一行的(item_id, timestamp)组合
- 根据user_id进行分组
- 把每个分组中it列的值取出来,就形成了其新闻点击序列

In [7]:
# Sort every click chronologically so that each user's sequence comes out in
# click order once the rows are grouped by user.
user_item_time_df = train.sort_values(by='click_timestamp')
# Pair each click's article id with its timestamp. The two columns are
# addressed by name and zipped instead of using a row-wise apply with
# positional lookups: integer-position lookup on a label-indexed row is
# deprecated in pandas, and a row-wise apply is typically much slower.
user_item_time_df['it'] = list(zip(user_item_time_df['click_article_id'],
                                   user_item_time_df['click_timestamp']))
user_item_time_df.head(10)

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type,it
18,249990,162300,1506959050386,4,3,20,1,25,2,"(162300, 1506959050386)"
2,249998,160974,1506959056066,4,1,12,1,13,2,"(160974, 1506959056066)"
30,249985,160974,1506959062960,4,1,17,1,8,2,"(160974, 1506959062960)"
50,249979,162300,1506959063933,4,1,17,1,25,2,"(162300, 1506959063933)"
25,249988,160974,1506959064384,4,1,17,1,21,2,"(160974, 1506959064384)"
52,249978,158082,1506959064972,4,1,17,1,13,1,"(158082, 1506959064972)"
97,249956,158536,1506959069231,2,4,2,1,25,7,"(158536, 1506959069231)"
45,249980,160974,1506959069331,4,1,17,1,21,1,"(160974, 1506959069331)"
32,249984,300470,1506959071824,4,1,17,1,25,5,"(300470, 1506959071824)"
103,249954,160974,1506959080055,4,1,17,1,25,2,"(160974, 1506959080055)"


In [8]:
# Keep only the user id and the (article, timestamp) pairs, grouped by user.
# Note that the name is rebound from a DataFrame to a GroupBy object here.
user_item_time_df = user_item_time_df[['user_id', 'it']].groupby('user_id')
def get_it(x):
    """Return the values of the 'it' column of the group ``x``."""
    return x['it'].values
# One entry per user_id holding that user's 'it' values; the rows were sorted
# by click_timestamp before they were grouped.
utt = user_item_time_df.apply(get_it)

In [9]:
# Summary statistics for the columns of the combined click log.
train.describe()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,1630633.0,1630633.0,1630633.0,1630633.0,1630633.0,1630633.0,1630633.0,1630633.0,1630633.0
mean,155546.3,194725.2,1507524000000.0,3.947632,1.791299,13.21313,1.322667,18.1722,1.88133
std,67543.84,91475.43,359895300.0,0.3264832,1.03128,6.86681,1.645919,7.091755,1.178867
min,0.0,3.0,1506959000000.0,1.0,1.0,2.0,1.0,1.0,1.0
25%,105501.0,124177.0,1507218000000.0,4.0,1.0,2.0,1.0,13.0,1.0
50%,168192.0,202381.0,1507562000000.0,4.0,1.0,17.0,1.0,21.0,2.0
75%,212724.0,277107.0,1507767000000.0,4.0,3.0,17.0,1.0,25.0,2.0
max,249999.0,364046.0,1510603000000.0,4.0,5.0,20.0,11.0,28.0,7.0


## 第二步: 构建item-item相似矩阵
**这一步的目的是构建item间的相似性矩阵. 这也是协同过滤算法中的关键步骤**
- 使用i2i_sim来记录相似度矩阵,使用item_num来记录item在所有用户历史记录中出现的次数
- 使用带用户活跃度惩罚的共现相似度(ItemCF-IUF, 并非严格意义上的Jaccard距离)来计算相似度

<font color=red>注:</font>
- 为什么用字典来存储相似度矩阵和物品频次: 因为矩阵非常稀疏, 用numpy向量来存储非常浪费空间. 其次, item_id并不连续,这样导致使用numpy存储还需要建立索引映射,非常麻烦.
- 相似度公式(带用户活跃度惩罚的共现相似度, 并非严格意义上的Jaccard距离):
$$
w_{ij}=\frac{\sum_{u \in N(i)\cap N(j)}\frac{1}{\log(1+|N(u)|)}}{\sqrt{|N(i)||N(j)|}}
$$
其中N(i)是点击过物品i的用户集合, N(u)是用户u点击过的物品集合. 分子中的倒数惩罚了活跃用户(点击物品很多的用户)对物品相似度的贡献.

In [10]:
from tqdm import tqdm
import math
from collections import defaultdict
i2i_sim = {}    # item -> {other item -> similarity}
item_num = defaultdict(int)    # item -> number of times it appears in the click sequences
for user, it_list in tqdm(utt.items()):
    if len(it_list) == 0:
        # Nothing to count, and log(1) == 0 would make the weight below undefined.
        continue
    # Every co-occurrence inside one user's sequence carries the same weight,
    # which shrinks as the sequence gets longer; compute it once per user
    # instead of once per item pair.
    user_weight = 1 / math.log(len(it_list) + 1)
    for i, i_click_time in it_list:
        item_num[i] += 1
        related = i2i_sim.setdefault(i, {})    # create the row the first time the item is seen
        for j, j_click_time in it_list:
            if i == j:
                continue
            related[j] = related.get(j, 0) + user_weight
# Normalise every accumulated weight by sqrt(count(i) * count(j)).
for i, related_items in i2i_sim.items():
    for j, wij in related_items.items():
        # Only existing keys are reassigned, so iterating the dict stays valid.
        related_items[j] = wij / math.sqrt(item_num[i] * item_num[j])

250000it [00:31, 7989.14it/s]


## 第三步: 根据相似度矩阵给出每个用户的召回列表
**这一步主要是利用相似矩阵,去计算和用户历史交互相关的物品的分数**
- 先给出两个限定条件:sim_num和recall_num. sim_num是确定了只利用与用户历史交互过物品相似度前几个的物品来给出推荐. 而recall_num则是召回列表的长度.
- 把每个历史物品的相似物品的分数相加,得到总的召回列表及每个推荐物品的分数.
- 利用得分进行排序,给出召回列表.
- 若列表长度不够要求的长度, 则使用频次最高的几个物品进行推荐.

In [11]:
# Take the n most frequently clicked articles.
n = 10
sort_list = sorted(item_num.items(), key=lambda pair: pair[1], reverse=True)
popular = [article_id for article_id, _count in sort_list[:n]]
popular


[272143, 234698, 123909, 336221, 96210, 336223, 183176, 168623, 162655, 331116]

In [12]:
# Given a user's history and the similarity matrix, score candidate items for that user.
def get_user_like_list(user_id, sim, user_time_item_dict, sim_num, recall_num, popular_list):
    """Recall up to ``recall_num`` items for one user from item-item similarities.

    For every item in the user's history, the ``sim_num`` most similar items
    are looked up in ``sim``; items the user has already interacted with are
    skipped and the similarities of the remaining candidates are summed. If
    fewer than ``recall_num`` candidates were found, items from
    ``popular_list`` are appended, in order, with a score of -1 (so they rank
    after any candidate whose summed similarity is above -1). The popular
    fill does not filter out items from the user's history.

    Args:
        user_id: Key of the user in ``user_time_item_dict``.
        sim: Mapping item -> {other item -> similarity}.
        user_time_item_dict: Mapping user -> sequence of (item_id, timestamp)
            pairs.
        sim_num: Number of most similar items considered per history item.
        recall_num: Maximum length of the returned list.
        popular_list: Fallback items used when too few candidates are found.

    Returns:
        List of item ids ordered by descending score, at most ``recall_num``
        long.

    Raises:
        KeyError: If ``user_id`` is not present in ``user_time_item_dict``.
    """
    import heapq
    from operator import itemgetter

    by_score = itemgetter(1)
    user_hist = user_time_item_dict[user_id]
    # A set gives O(1) "already interacted" checks inside the candidate loop.
    seen = {item_id for item_id, _ in user_hist}
    item_rank = {}

    # Score every item that is similar to something in the history.
    for item_id, _ in user_hist:
        # A history item without a similarity row simply adds no candidates.
        neighbours = sim.get(item_id, {})
        # nlargest picks the top sim_num entries without sorting the whole row.
        for j, wij in heapq.nlargest(sim_num, neighbours.items(), key=by_score):
            if j in seen:    # already interacted with: not a candidate
                continue
            item_rank[j] = item_rank.get(j, 0) + wij

    # Pad with popular items when too few candidates were recalled.
    if len(item_rank) < recall_num:
        for item_id in popular_list:
            if item_id in item_rank:
                continue
            item_rank[item_id] = -1
            if len(item_rank) == recall_num:
                break

    ranked = sorted(item_rank.items(), key=by_score, reverse=True)[:recall_num]
    return [item_id for item_id, _ in ranked]

In [13]:
# Recall parameters: neighbours considered per history item, and list length.
sim_num = 10
recall_num = 5
# Try the recall on a single user.
recall_list = get_user_like_list(
    user_id=249998,
    sim=i2i_sim,
    user_time_item_dict=utt,
    sim_num=sim_num,
    recall_num=recall_num,
    popular_list=popular,
)
recall_list

[300470, 123909, 16129, 276970, 162655]

In [16]:
# user_id -> recalled article ids. The default factory is ``list`` so that it
# matches the values that are actually stored.
recall_dict = defaultdict(list)
# Run the recall for every user that appears in the test set.
for user_id in tqdm(test['user_id'].unique()):
    recall_dict[user_id] = get_user_like_list(
        user_id=user_id,
        sim=i2i_sim,
        user_time_item_dict=utt,
        sim_num=sim_num,
        recall_num=recall_num,
        popular_list=popular,
    )

# One row per user, one column per rank. Building the frame row-wise avoids
# creating a frame with one column per user and transposing it afterwards.
recall_df = pd.DataFrame.from_dict(recall_dict, orient='index')

100%|██████████| 50000/50000 [18:46<00:00, 44.39it/s]


AttributeError: 'collections.defaultdict' object has no attribute 'head'

## 提交
提交文件名以时间来命名

In [30]:
# Column layout used by the submission file.
submit_columns = ['user_id', 'article_1', 'article_2', 'article_3', 'article_4', 'article_5']
# The recall frame is built with the user id as its index and only the five
# article columns, so the index has to become a regular column before the six
# submission names can be assigned. The width check keeps this cell safe to
# re-run after the index has already been moved.
if recall_df.shape[1] == len(submit_columns) - 1:
    recall_df = recall_df.reset_index()
recall_df.columns = submit_columns
recall_df.head(10)


Unnamed: 0,user_id,article_1,article_2,article_3,article_4,article_5
0,200000,237870,194619,194935,314048,195773
1,200001,64329,272143,199198,324823,166581
2,200002,300128,300923,61375,293301,298035
3,200003,337143,272143,156619,235230,158536
4,200004,235870,235616,336223,261612,156964
5,200005,69932,160974,156964,160417,158536
6,200006,199197,284547,235230,183176,206934
7,200007,289003,157478,97530,218028,66672
8,200008,235870,300082,156560,64409,336223
9,200009,199198,64329,198659,166581,324823


In [31]:
import os
import time

# Name the submission after the current local time (one-second resolution)
# so that successive runs get distinct file names.
path_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
submit_path = '../submission/' + path_time + '.csv'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(submit_path), exist_ok=True)
recall_df.to_csv(submit_path, index=False)