In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Root folder of the raw competition data. The default is the original
# location; set the KDD2020_DATA_DIR environment variable to point the
# notebook at another folder without editing this cell.
data_dir = os.environ.get('KDD2020_DATA_DIR', 'C:/ZhangLI/Codes/DataSet/kdd2020')
# Folder that holds the offline train/test/answer folders built below.
user_data_dir = 'user_data'

# File-name prefixes; the loaders append '-<phase>.csv' to them.
train_file_prefix = 'underexpose_train_click'
test_file_prefix = 'underexpose_test_click'
infer_file_prefix = 'underexpose_test_qtime'

offline_train_path = os.path.join(user_data_dir, 'offline_underexpose_train')
offline_test_path = os.path.join(user_data_dir, 'offline_underexpose_test')
offline_answer_path = os.path.join(user_data_dir, 'offline_underexpose_answer')

# 'online' reads from data_dir; any other value selects the offline folders.
mode = 'online'
online_train_path = os.path.join(data_dir, 'underexpose_train')
online_test_path = os.path.join(data_dir, 'underexpose_test')

train_path = online_train_path if mode == 'online' else offline_train_path
test_path = online_test_path if mode == 'online' else offline_test_path


In [3]:
def get_phase_click(c):
    '''
    Load the click data of one phase.

    Reads the train clicks, the test clicks and the query-time file of phase
    ``c`` from the module-level ``train_path`` / ``test_path`` folders.

    :param c: target phase
    :return: all_click (train rows followed by test rows, row labels of the
        two files kept as read), click_q_time (infer data, i.e., user_id q_time)
    '''
    print('train_path={}, test_path={}, target_phase={}'.format(train_path, test_path, c))

    click_columns = ['user_id', 'item_id', 'time']
    click_train = pd.read_csv('{}/{}-{}.csv'.format(train_path, train_file_prefix, c), header=None,
                              names=click_columns)

    # The test clicks and the query-time file are both read from the same
    # per-phase sub-folder of test_path.
    phase_test_path = "{}/{}-{}".format(test_path, test_file_prefix, c)
    click_test = pd.read_csv('{}/{}-{}.csv'.format(phase_test_path, test_file_prefix, c), header=None,
                             names=click_columns)
    click_q_time = pd.read_csv('{}/{}-{}.csv'.format(phase_test_path, infer_file_prefix, c), header=None,
                               names=['user_id', 'time'])

    # DataFrame.append was removed in pandas 2.0; pd.concat builds the same
    # frame (rows stacked in order, index labels not reset).
    all_click = pd.concat([click_train, click_test])

    return all_click, click_q_time

In [4]:
def get_whole_click(now_phase):
    '''
    Load the train and test clicks of phases 0..now_phase as one frame.

    Each row is tagged with the phase it was read from; rows that repeat the
    same (user_id, item_id, time) are then dropped, keeping the first one.

    :param now_phase: last phase to include
    :return: whole click data with columns user_id, item_id, time, phase
    '''
    print('train_path={}, test_path={}'.format(train_path, test_path))
    click_columns = ['user_id', 'item_id', 'time']

    # Collect the per-phase frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and appending inside the loop re-copied every
    # previously loaded row on each iteration.
    phase_frames = []
    for c in range(now_phase + 1):
        click_train = pd.read_csv('{}/{}-{}.csv'.format(train_path, train_file_prefix, c), header=None,
                                  names=click_columns)
        phase_test_path = "{}/{}-{}".format(test_path, test_file_prefix, c)
        click_test = pd.read_csv('{}/{}-{}.csv'.format(phase_test_path, test_file_prefix, c), header=None,
                                 names=click_columns)
        all_click = pd.concat([click_train, click_test])
        all_click['phase'] = c
        phase_frames.append(all_click)

    if phase_frames:
        whole_click = pd.concat(phase_frames)
    else:
        # No phase requested (now_phase < 0): return an empty frame that still
        # has the expected columns so drop_duplicates below does not fail.
        whole_click = pd.DataFrame(columns=click_columns + ['phase'])

    print(whole_click.shape)
    whole_click = whole_click.drop_duplicates(['user_id', 'item_id', 'time'])
    print(whole_click.shape)
    return whole_click

In [5]:
all_click, click_q_time = get_phase_click(2)

train_path=C:/ZhangLI/Codes/DataSet/kdd2020\underexpose_train, test_path=C:/ZhangLI/Codes/DataSet/kdd2020\underexpose_test, target_phase=2


In [6]:
whole_click = get_whole_click(2)

train_path=C:/ZhangLI/Codes/DataSet/kdd2020\underexpose_train, test_path=C:/ZhangLI/Codes/DataSet/kdd2020\underexpose_test
(795911, 4)
(477251, 4)


In [12]:
# Cell outline:
# 1. Drop from whole_click every row of a user that is later than that user's query time.
# 2. Drop from whole_click the items that do not occur in the current phase.
def group_apply_func(group_df):
    """Trim one user's clicks to the rows at or before that user's query time.

    :param group_df: rows of a single group; the user id is read from the first row
    :return: the rows with ``time`` <= the user's entry in ``pred_user_time_dict``,
        or ``group_df`` unchanged when the user has no entry there
    """
    user = group_df['user_id'].iloc[0]
    if user not in pred_user_time_dict:
        return group_df
    cutoff = pred_user_time_dict[user]
    return group_df[group_df['time'] <= cutoff]
# Per user, remove the "time-travel" rows (clicks after the user's query time)
# from the full click log, then keep only items that occur in the current phase.
# NOTE: pred_user_time_dict and phase_item_ids are assigned in cell In [13]
# further down, so that cell has to be executed before this one.
phase_whole_click = (
    whole_click
    .groupby('user_id', group_keys=False)
    .apply(group_apply_func)
)
phase_whole_click = phase_whole_click[phase_whole_click['item_id'].isin(phase_item_ids)]

In [11]:
# debug code: replay the per-user filter of group_apply_func on the first group only
whole_click_group = whole_click.groupby('user_id', group_keys=False)
for key, value in whole_click_group:
    print(value.shape)  # rows before filtering
    u = value['user_id'].iloc[0]
    # Author's note: a rule could probably be added here -- it is not used
    # during training, although the items are in fact already available.
    if u in pred_user_time_dict:
        # Use this user's own query time. The previous hard-coded key 2 applied
        # user 2's cut-off to whichever user the group belonged to.
        u_time = pred_user_time_dict[u]
        value = value[value['time'] <= u_time]  # drop the clicks after the query time
    print(value.shape)  # rows after filtering
    break

(26, 4)
(26, 4)


In [13]:
# Items that were clicked in the current phase.
phase_item_ids = set(all_click['item_id'].unique())
# user_id -> query time. Stored as a dict because the object returned by zip
# can only be iterated once.
pred_user_time_dict = {
    user_id: q_time
    for user_id, q_time in zip(click_q_time['user_id'], click_q_time['time'])
}


In [14]:
# item_id -> number of click rows the item received in all_click
item_cnt_dict = (
    all_click
    .groupby('item_id')['user_id']
    .count()
    .to_dict()
)
# user_id -> number of click rows the user made in all_click
user_cnt_dict = (
    all_click
    .groupby('user_id')['item_id']
    .count()
    .to_dict()
)
user_cnt_dict

{1: 14,
 2: 21,
 3: 9,
 4: 3,
 6: 8,
 7: 3,
 9: 3,
 10: 29,
 11: 9,
 12: 4,
 13: 7,
 14: 3,
 16: 4,
 19: 9,
 22: 7,
 25: 6,
 26: 10,
 27: 13,
 29: 8,
 30: 19,
 34: 4,
 35: 10,
 36: 15,
 41: 22,
 42: 5,
 43: 4,
 44: 9,
 45: 70,
 47: 14,
 50: 7,
 51: 24,
 53: 4,
 55: 12,
 57: 12,
 59: 7,
 60: 4,
 61: 12,
 62: 12,
 63: 8,
 64: 8,
 65: 5,
 66: 17,
 68: 5,
 70: 28,
 71: 12,
 72: 12,
 73: 9,
 76: 22,
 77: 6,
 78: 5,
 81: 8,
 83: 5,
 85: 13,
 88: 11,
 89: 7,
 90: 17,
 91: 10,
 92: 20,
 95: 5,
 97: 24,
 98: 19,
 99: 9,
 100: 18,
 101: 4,
 102: 4,
 104: 3,
 106: 22,
 107: 16,
 110: 25,
 111: 16,
 112: 17,
 114: 16,
 117: 19,
 121: 4,
 122: 7,
 123: 14,
 124: 18,
 127: 37,
 129: 13,
 131: 4,
 132: 23,
 133: 18,
 134: 12,
 139: 11,
 140: 7,
 143: 28,
 144: 11,
 145: 22,
 150: 7,
 154: 4,
 155: 21,
 157: 8,
 158: 3,
 159: 7,
 163: 10,
 167: 6,
 168: 3,
 169: 6,
 174: 17,
 176: 19,
 177: 15,
 179: 7,
 180: 11,
 185: 33,
 186: 8,
 187: 10,
 188: 5,
 190: 9,
 191: 20,
 192: 18,
 193: 4,
 194: 29,
 19

In [73]:
phase_whole_click  # display the filtered click log (author's note "history_df  df": presumably other names used for this frame -- confirm)

Unnamed: 0,user_id,item_id,time,phase
19829,1,78142,0.983742,0
20480,1,89568,0.983763,0
84964,1,69359,0.983942,0
111177,1,18522,0.983887,0
159250,1,85492,0.983875,0
...,...,...,...,...
16203,35422,72136,0.984062,2
18733,35422,26290,0.984061,2
55200,35424,54561,0.984009,1
60888,35424,28462,0.984007,1


In [17]:
# debug code: build one list of (item_id, time) tuples per user and inspect it
# NOTE: make_item_time_tuple is defined in cell In [15] further down; that cell
# has to be executed before this one.
print('item-cf item-sim begin')
# get_user_item_time_dict
phase_whole_click # historical click records with part of the rows removed (bare expression, no effect here)
import copy
df = copy.deepcopy(phase_whole_click)
user_item_ = df.sort_values(by=['user_id', 'item_id'])
groupb = user_item_.groupby('user_id').apply(
        lambda group: make_item_time_tuple(group, 'user_id', 'item_id', 'time'))   # the lambda receives each group's rows, i.e. a DataFrame
# for key, value in groupb:
#     print(key, value)
#     print(list(zip(value['item_id'], value['time'])))
#     break
groupb

item-cf item-sim begin


user_id
1        [(4033, 0.9840438262024572), (13025, 0.9840606...
2        [(2494, 0.983984766262873), (4168, 0.983871694...
3        [(5266, 0.9838992954632594), (26297, 0.9840589...
4        [(6609, 0.9837908698075156), (8291, 0.98379061...
6        [(18551, 0.9840493988352508), (40062, 0.983993...
                               ...                        
35417    [(5408, 0.9840399613832004), (11372, 0.9840398...
35419    [(29184, 0.9839495072000888), (54039, 0.983995...
35421    [(28043, 0.983992772861173), (58341, 0.9839514...
35422    [(11885, 0.9840610961277478), (22446, 0.984061...
35424    [(4724, 0.9840090157009675), (28462, 0.9840071...
Length: 23668, dtype: object

In [15]:
# item-cf: prepare the rows that are turned into user_id -> [(item_id, time), ...] below
import copy
df = copy.deepcopy(phase_whole_click)  # independent copy; `df` is also read by the user-cf cell further down
# NOTE(review): rows are ordered by item_id within each user, not by time, so
# list positions used later follow item-id order -- confirm this is intended.
user_item_ = df.sort_values(by=['user_id', 'item_id'])
def make_item_time_tuple(group_df, user_col='user_id', item_col='item_id', time_col='time'):
    """Pair up the item and time columns of the given rows.

    :param group_df: rows to convert
    :param user_col: accepted for signature compatibility; not read by this function
    :param item_col: name of the item column
    :param time_col: name of the time column
    :return: list of (item, time) tuples, one per row, in row order
    """
    items = group_df[item_col]
    times = group_df[time_col]
    return [(item, clicked_at) for item, clicked_at in zip(items, times)]
# Collapse each user's rows into a single list of (item_id, time) tuples, then
# index those lists by user_id.
user_item_ = (
    user_item_
    .groupby('user_id')
    .apply(lambda group: make_item_time_tuple(group, 'user_id', 'item_id', 'time'))
    .reset_index()
    .rename(columns={0: 'item_id_time_list'})
)
user_item_time_dict = dict(zip(user_item_['user_id'], user_item_['item_id_time_list']))
# Sample of the resulting dict:
# {1: [(4033, 0.9840438262024572),
#      (13025, 0.9840606898359422),
#      (14665, 0.9840478512672396),
#      (14722, 0.9840379236152932),
#      (18522, 0.9838865445879744),
#      (28869, 0.9840298463575488), ...

In [13]:
user_item_time_dict  # display the whole user_id -> [(item_id, time), ...] mapping (large output)

{1: [(4033, 0.9840438262024572),
  (13025, 0.9840606898359422),
  (14665, 0.9840478512672396),
  (14722, 0.9840379236152932),
  (18522, 0.9838865445879744),
  (28869, 0.9840298463575488),
  (31443, 0.9838869477253404),
  (68164, 0.9840607907780058),
  (69359, 0.9839419314982224),
  (78142, 0.9837416195438412),
  (81337, 0.9838866278651768),
  (85482, 0.9840377949141624),
  (85492, 0.9838754813378136),
  (88234, 0.9840477610502704),
  (89568, 0.983763437539981),
  (89568, 0.9840479755521552)],
 2: [(2494, 0.983984766262873),
  (4168, 0.9838716947486562),
  (5461, 0.9839484498319736),
  (7745, 0.9837833717048604),
  (8481, 0.983830155827746),
  (18057, 0.984039389167878),
  (20507, 0.9840391172551944),
  (23824, 0.9838716802382346),
  (28099, 0.9837834694924844),
  (35688, 0.9839836382353132),
  (38861, 0.9840580047770529),
  (40083, 0.9839834098538944),
  (40696, 0.984039382858999),
  (53046, 0.9840387298900256),
  (56169, 0.9840033490658774),
  (58621, 0.9838837213646356),
  (83466, 0.

In [18]:
from collections import defaultdict
import math

# Item-CF co-occurrence pass:
# 1. take the items clicked by each user,
# 2. score every ordered pair of distinct items inside that user's list,
# 3. accumulate the scores over all users.
sim_item = {}               # item -> {related item -> accumulated score}
item_cnt = defaultdict(int)  # item -> number of list entries seen for it (popularity)
for user, item_time_list in user_item_time_dict.items():
    # Length penalty shared by every pair of this user's list.
    user_norm = math.log(1 + len(item_time_list))
    for loc_1, (item, item_time) in enumerate(item_time_list):
        item_cnt[item] += 1
        related = sim_item.setdefault(item, {})
        for loc_2, (relate_item, related_time) in enumerate(item_time_list):
            if item == relate_item:
                continue
            # Position weight: factor 1.0 when the related item sits later in
            # the list, 0.7 otherwise, multiplied by 0.8 per extra step apart.
            loc_alpha = 1.0 if loc_2 > loc_1 else 0.7
            loc_weight = loc_alpha * (0.8 ** (np.abs(loc_2 - loc_1) - 1))
            # Time weight: decays exponentially with the gap between the two click times.
            time_weight = np.exp(-15000 * np.abs(item_time - related_time))
            related[relate_item] = related.get(relate_item, 0) + loc_weight * time_weight / user_norm

In [19]:
# Normalise every accumulated pair score by the popularity of both items:
# sim_item_corr[i][j] = sim_item[i][j] / sqrt(item_cnt[i] * item_cnt[j]).
# Fresh inner dicts are built on purpose: dict.copy() is shallow, so writing
# into a copy of sim_item overwrote sim_item's own inner dicts, and running
# this cell a second time normalised the scores twice.
sim_item_corr = {
    i: {
        j: cij / math.sqrt(item_cnt[i] * item_cnt[j])
        for j, cij in related_items.items()
    }
    for i, related_items in sim_item.items()
}

In [78]:
len(sim_item)  # result is discarded: a cell only displays its last expression
len(sim_item_corr[3])  # number of related-item entries stored for item 3

41031

In [96]:
phase_whole_click  # bare expression, no effect: only the last expression of the cell is displayed
# bi-graph cf
# get_item_user_time_dict
item_user_df = phase_whole_click.sort_values(by=['item_id', 'time'])  # order rows by item, then by click time
item_user_df

Unnamed: 0,user_id,item_id,time,phase
212360,2329,3,0.983750,0
80483,18247,3,0.983761,0
146520,8022,3,0.983774,0
58649,5206,3,0.983864,0
80482,24081,3,0.983904,0
...,...,...,...,...
169729,1482,117448,0.984064,2
155150,7396,117471,0.983989,2
132943,27221,117471,0.983992,2
66348,9393,117471,0.983997,2


In [20]:
# debug code
# trying user-cf
# get_sim_user get_user_item_time_dict get_item_user_time_dict

# get_user_item_time_dict: user_id -> [(item_id, time), ...]
user_item_ = phase_whole_click.sort_values(by=['user_id', 'item_id'])  # still sorted by user first, then by item
user_item_ = user_item_.groupby('user_id').apply(lambda group: list(zip(group['item_id'], group['time']))).reset_index().rename(columns={0:'item_id_time_list'})
user_item_time_dict = dict(zip(user_item_['user_id'], user_item_['item_id_time_list']))

# get_item_user_time_dict: item_id -> [(user_id, time), ...]
# Read phase_whole_click directly instead of `df`, a copy left behind by an
# earlier cell: the cell no longer depends on that cell having been run and
# cannot pick up a stale copy.
item_user_df = phase_whole_click.sort_values(by=['item_id', 'time'])
item_user_df = item_user_df.groupby('item_id').apply(lambda group: list(zip(group['user_id'], group['time']))).reset_index().rename(columns={0:'user_id_time_list'})

item_user_time_dict = dict(zip(item_user_df['item_id'], item_user_df['user_id_time_list']))
item_user_time_dict[3]
# item_user_time_dict: the clicks each item received; user_item_time_dict: the clicks each user made


[(2329, 0.9837495447576023),
 (18247, 0.9837609411165728),
 (8022, 0.9837744175129391),
 (5206, 0.9838644944250884),
 (24081, 0.9839040037796344),
 (10370, 0.98392570001441),
 (5788, 0.9839522585022076),
 (13264, 0.9840001592966704),
 (30594, 0.9840058051124602),
 (2934, 0.9840068788836608),
 (26499, 0.9840447807358452)]

In [21]:
# User similarity; the final result is a nested dict: sim_user_corr[u][v] -> score.
sim_user = {}                # user -> {related user -> accumulated score}
user_cnt = defaultdict(int)  # user -> number of list entries seen for the user
for item, user_time_list in item_user_time_dict.items():
    # The more entries an item has, the less each co-click on it contributes.
    # Only the logarithm is hoisted; the division stays inside the pair loop so
    # an item with a single entry never divides at all, as before.
    item_norm = math.log(1 + len(user_time_list))
    for user, click_time in user_time_list:
        user_cnt[user] += 1
        related = sim_user.setdefault(user, {})
        for relate_user, relate_time in user_time_list:
            if user == relate_user:
                continue
            related[relate_user] = related.get(relate_user, 0) + 1.0 / item_norm

# Normalise by the activity of both users. Fresh inner dicts are built on
# purpose: dict.copy() is shallow, so writing into a copy of sim_user
# overwrote sim_user's own inner dicts, and re-running the normalisation
# divided the scores a second time.
sim_user_corr = {
    u: {
        v: cuv / math.sqrt(user_cnt[u] * user_cnt[v])
        for v, cuv in related_users.items()
    }
    for u, related_users in sim_user.items()
}
    

In [22]:
# 用户相似度计算完成，存储结果
sim_user_corr[1][27849]

0.026883034851913006

In [26]:
# DEBUG CODE
# Swing: build item_id -> [(user_id, time), ...] from rows sorted by item, then time
item_user_df = (
    phase_whole_click
    .sort_values(by=['item_id', 'time'])
    .groupby('item_id')
    .apply(lambda group: list(zip(group['user_id'], group['time'])))
    .reset_index()
    .rename(columns={0: 'user_id_time_list'})
)
item_user_time_dict = dict(zip(item_user_df['item_id'], item_user_df['user_id_time_list']))  # {item: [(user, time), ...]}

In [60]:
# Swing debug walk-through: prints the intermediate structures for the first two items only.
# Three default dicts are used.
# NOTE: user_item_time_dict, item_cnt and (below) sim_item are rebound here, which
# replaces the objects of the same names built by the item-CF cells above.
user_item_time_dict = defaultdict(list)
item_cnt = defaultdict(int)
u_u_cnt = defaultdict(list)
iii = 0  # number of items processed so far; the loop stops after two
for item, user_time_list in item_user_time_dict.items():
    print(item, user_time_list)

    for u, u_time in user_time_list:
        #print([u, u_time])
        item_cnt[item] += 1  # number of click entries of this item
        user_item_time_dict[u].append((item, u_time))  # {user: [(item, time), ...]}

        for relate_u, relate_u_time in user_time_list:
            #print([relate_u, relate_u_time])
            if relate_u == u:
                continue
            # Order the pair so (u, v) and (v, u) share one key; since the loops visit
            # both orders, each co-clicked item is appended twice per pair.
            key = (u, relate_u) if u <= relate_u else (relate_u, u)
            u_u_cnt[key].append((item, np.abs(u_time - relate_u_time)))  # (user, user) -> [(item, time gap)]: gap between the two users' clicks on the same item
            #print(key, (item, np.abs(u_time - relate_u_time)))


    # sim_item is recreated on every pass of the outer loop; u_u_cnt keeps accumulating.
    sim_item = {}
    for u_u, co_item_times in u_u_cnt.items():
        print(u_u, co_item_times)
        num_co_items = len(co_item_times)
        for i, i_time_diff in co_item_times:
            sim_item.setdefault(i, {})
            for j, j_item_diff in co_item_times:
                if j==i:
                    continue
                # Only prints the would-be score: setdefault stores 0. and the sum is not written back.
                print(sim_item[i].setdefault(j, 0.) + 1.0/(5.0+num_co_items))
    print(sim_item)  
    iii += 1
    if iii == 2:
        break
    # break

3 [(2329, 0.9837495447576023), (18247, 0.9837609411165728), (8022, 0.9837744175129391), (5206, 0.9838644944250884), (24081, 0.9839040037796344), (10370, 0.98392570001441), (5788, 0.9839522585022076), (13264, 0.9840001592966704), (30594, 0.9840058051124602), (2934, 0.9840068788836608), (26499, 0.9840447807358452)]
(2329, 18247) [(3, 1.1396358970405807e-05), (3, 1.1396358970405807e-05)]
(2329, 8022) [(3, 2.487275533680222e-05), (3, 2.487275533680222e-05)]
(2329, 5206) [(3, 0.00011494966748604174), (3, 0.00011494966748604174)]
(2329, 24081) [(3, 0.00015445902203203854), (3, 0.00015445902203203854)]
(2329, 10370) [(3, 0.0001761552568076974), (3, 0.0001761552568076974)]
(2329, 5788) [(3, 0.00020271374460523184), (3, 0.00020271374460523184)]
(2329, 13264) [(3, 0.0002506145390680503), (3, 0.0002506145390680503)]
(2329, 30594) [(3, 0.00025626035485781795), (3, 0.00025626035485781795)]
(2329, 2934) [(3, 0.0002573341260584394), (3, 0.0002573341260584394)]
(2329, 26499) [(3, 0.0002952359782428804

In [49]:
user_item_time_dict  # display the user -> [(item, time), ...] defaultdict filled by the Swing debug loop

defaultdict(list,
            {2329: [(3, 0.9837495447576023)],
             18247: [(3, 0.9837609411165728)],
             8022: [(3, 0.9837744175129391)],
             5206: [(3, 0.9838644944250884)],
             24081: [(3, 0.9839040037796344)],
             10370: [(3, 0.98392570001441)],
             5788: [(3, 0.9839522585022076)],
             13264: [(3, 0.9840001592966704)],
             30594: [(3, 0.9840058051124602)],
             2934: [(3, 0.9840068788836608)],
             26499: [(3, 0.9840447807358452)]})