In [1]:
import operator
import time
import random
import pandas as pd
import numpy as np

In [2]:
# Load the tagging records (userID, bookmarkID, tagID, timestamp) from the
# tab-separated dump and report how long the read took.
# perf_counter() is used instead of time() because it is monotonic and
# high-resolution, which matters for sub-second measurements like this one.
start = time.perf_counter()
table = pd.read_table('user_taggedbookmarks-timestamps.dat', sep='\t')
end = time.perf_counter()
print("execute time: ", '%.5f'%(end-start), 's\n')
table.info()

execute time:  0.13997 s

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437593 entries, 0 to 437592
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   userID      437593 non-null  int64
 1   bookmarkID  437593 non-null  int64
 2   tagID       437593 non-null  int64
 3   timestamp   437593 non-null  int64
dtypes: int64(4)
memory usage: 13.4 MB


In [3]:
# Preview the first rows of the loaded tagging table.
table.head()

Unnamed: 0,userID,bookmarkID,tagID,timestamp
0,8,1,1,1289255362000
1,8,2,1,1289255159000
2,8,7,1,1289238901000
3,8,7,6,1289238901000
4,8,7,7,1289238901000


In [4]:
# Number of distinct users, bookmarks and tags in the full table.
n1, n2, n3 = (
    table[column].nunique()
    for column in ('userID', 'bookmarkID', 'tagID')
)
print(n1, n2, n3)

1867 69223 40897


In [5]:
ratio = 0.2     # fraction of each user's records held out as test data
seed = 88
random.seed(seed)   # fixed seed so random.sample() below draws the same rows on every run

# perf_counter() is monotonic and high-resolution, unlike time(); only the
# differences between these readings are used.
start = time.perf_counter()

# One row per userID: how many records the user has and how many go to the test set.
stratify_count = table.groupby(by='userID')['userID'].count()
stratify_df = pd.DataFrame({'count': stratify_count})
# int() truncates, so a user with fewer than 1/ratio records contributes no test rows.
stratify_df['test_num'] = (stratify_df['count'] * ratio).apply(int)

# Row labels of every user's records, collected in a single groupby pass
# instead of re-filtering the whole table with a boolean mask once per user.
rows_by_user = table.groupby(by='userID').groups
stratify_df['ids'] = stratify_df.index.map(lambda x: rows_by_user[x].tolist())
# Draw each user's test rows at random; the remaining rows form the train set.
stratify_df['test_index'] = stratify_df.apply(lambda x: random.sample(x['ids'], x['test_num']), axis=1)
stratify_df['train_index'] = stratify_df.apply(lambda x: list(set(x['ids']) - set(x['test_index'])), axis=1)

# Flatten the per-user lists into two flat lists of row labels.
test_id = [row for rows in stratify_df['test_index'] for row in rows]
train_id = [row for rows in stratify_df['train_index'] for row in rows]

mid = time.perf_counter()

# The ids are index labels, so select with .loc; the fresh 0..n-1 index replaces
# the original labels (drop=True discards them instead of keeping a column).
train_data = table.loc[train_id].reset_index(drop=True)
test_data = table.loc[test_id].reset_index(drop=True)

end = time.perf_counter()
print("execute time: ", '%.5f'%(end-mid), 's')
print('Split train test dataset by stratification, time took: %.5f' % (end - start))

execute time:  0.05650 s
Split train test dataset by stratification, time took: 1.15017


In [6]:
# Inspect the per-user split table: record count, test quota and the row ids
# assigned to the test and train sets.
stratify_df.head()

Unnamed: 0_level_0,count,test_num,ids,test_index,train_index
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,153,30,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[101, 48, 85, 47, 3, 134, 2, 87, 118, 34, 57, ...","[0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
32,25,5,"[153, 154, 155, 156, 157, 158, 159, 160, 161, ...","[159, 160, 158, 153, 173]","[154, 155, 156, 157, 161, 162, 163, 164, 165, ..."
57,60,12,"[178, 179, 180, 181, 182, 183, 184, 185, 186, ...","[211, 207, 228, 237, 179, 230, 209, 223, 202, ...","[178, 180, 181, 182, 183, 184, 185, 186, 187, ..."
147,410,82,"[238, 239, 240, 241, 242, 243, 244, 245, 246, ...","[494, 272, 489, 313, 433, 244, 308, 302, 457, ...","[238, 239, 240, 241, 242, 243, 245, 247, 249, ..."
233,343,68,"[648, 649, 650, 651, 652, 653, 654, 655, 656, ...","[911, 717, 895, 801, 962, 964, 792, 694, 918, ...","[648, 650, 651, 652, 653, 654, 655, 657, 658, ..."


In [7]:
# Column dtypes and row count of the per-user split table (one row per userID).
stratify_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1867 entries, 8 to 108035
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   count        1867 non-null   int64 
 1   test_num     1867 non-null   int64 
 2   ids          1867 non-null   object
 3   test_index   1867 non-null   object
 4   train_index  1867 non-null   object
dtypes: int64(2), object(3)
memory usage: 87.5+ KB


In [8]:
# Build the four count tables used by the tag-based recommender, timing each
# groupby separately. perf_counter() is monotonic and high-resolution, which
# suits these sub-second measurements better than time().
start = time.perf_counter()

# user -> item: number of tag records each user attached to each bookmark
user_item = train_data.groupby(by=['userID', 'bookmarkID'])['tagID'].count()
mid1 = time.perf_counter()
# user -> tag: number of bookmark records each user filed under each tag
user_tag = train_data.groupby(by=['userID', 'tagID'])['bookmarkID'].count()
mid2 = time.perf_counter()
# tag -> item: number of user records that put a given tag on a given bookmark
tag_item = train_data.groupby(by=['tagID', 'bookmarkID'])['userID'].count()
mid3 = time.perf_counter()
# tag -> user: same counts as user_tag, keyed by tag first
tag_user = train_data.groupby(by=['tagID', 'userID'])['bookmarkID'].count()
end = time.perf_counter()

print('execute time: ', '%.5f'%(mid1-start))
print('execute time: ', '%.5f'%(mid2-mid1))
# NOTE(review): the tag -> item groupby was the slowest in the recorded run;
# presumably it has the most distinct key pairs -- not verified.
print('execute time: ', '%.5f'%(mid3-mid2))
print('execute time: ', '%.5f'%(end-mid3))

execute time:  0.04535
execute time:  0.05944
execute time:  0.10071
execute time:  0.06288


In [9]:
# Peek at the (userID, bookmarkID) -> tag-count table.
user_item.head()
# Alternative view, left disabled: user_tag.head()

userID  bookmarkID
8       1             1
        2             1
        7             1
        8             3
        9             2
Name: tagID, dtype: int64

In [10]:
# Example query parameters: a user id and the number of recommendations.
# NOTE(review): the prediction cell further down passes literals (32, 10)
# instead of these names -- confirm whether they are still needed.
user_id = 8
n = 10

In [11]:
def calculate(user_id, user_item, user_tag, tag_item, n, method, tag_user=None):
    """Rank bookmarks the user has not marked yet via user -> tag -> item counts.

    For every tag the user has used, each bookmark carrying that tag (and not
    already marked by the user) receives a score contribution; contributions
    from different tags are summed per bookmark.

    Parameters
    ----------
    user_id : hashable
        Key of the user in the first index level of ``user_item`` / ``user_tag``.
    user_item : pandas.Series
        Counts indexed by (user, item); only its index is used, to find the
        items the user has already marked.
    user_tag : pandas.Series
        Counts indexed by (user, tag): how often the user used each tag.
    tag_item : pandas.Series
        Counts indexed by (tag, item): how often each item received each tag.
    n : int
        Maximum number of recommendations to return.
    method : {'simple', 'norm', 'tfidf'}
        ``'simple'``: ``tag_count * tag_item_count``.
        ``'norm'``: ``(tag_count / user's total tag count) *
        (tag_item_count / tag's total count)``.
        ``'tfidf'``: ``tag_count / log(1 + tag's total count) * tag_item_count``.
    tag_user : pandas.Series, optional
        Counts indexed by (tag, user), used only by ``'tfidf'`` to obtain the
        tag's total count. When omitted, the tag's total in ``tag_item`` is
        used instead; the two totals coincide when both tables are counted
        from the same records with no missing values.

    Returns
    -------
    dict
        ``{user_id: {item: score, ...}}`` holding at most ``n`` items in
        descending score order.

    Raises
    ------
    TypeError
        If ``method`` is not one of the supported names.
    KeyError
        If ``user_id`` is absent from ``user_item`` or ``user_tag``.
    """
    # Validate up front so a bad method is reported even when the user has no
    # candidate items (previously the check only ran inside the inner loop).
    if method not in ('norm', 'simple', 'tfidf'):
        raise TypeError("Invalid method '{}'".format(method))

    marked_item = set(user_item.loc[user_id].index)   # items the user already marked
    marked_tag = user_tag.loc[user_id]                # tag -> times this user used it
    marked_tag_sum = marked_tag.values.sum()          # total tag uses by this user

    recommend = {}
    # user_id -> tag -> item -> score
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for tag_index, tag_count in marked_tag.items():
        selected_item = tag_item.loc[tag_index]           # item -> times it received this tag
        selected_item_sum = selected_item.values.sum()    # total uses of this tag

        # Per-tag factor, hoisted out of the item loop.
        if method == 'tfidf':
            if tag_user is None:
                tag_total = selected_item_sum
            else:
                tag_total = tag_user.loc[tag_index].values.sum()
            tag_weight = tag_count / np.log(1 + tag_total)
        elif method == 'norm':
            tag_weight = tag_count / marked_tag_sum

        for item_index, tag_item_count in selected_item.items():
            if item_index in marked_item:
                continue    # never recommend something the user already has
            if method == 'norm':
                score = tag_weight * (tag_item_count / selected_item_sum)
            elif method == 'tfidf':
                score = tag_weight * tag_item_count
            else:   # 'simple'
                score = tag_count * tag_item_count
            # The same item can be reached through several tags: accumulate.
            recommend[item_index] = recommend.get(item_index, 0) + score

    sorted_recommend = sorted(recommend.items(), key=lambda x: x[1], reverse=True)[:n]
    return {user_id: dict(sorted_recommend)}

In [12]:
def predict(user_id, n, method='simple'):
    """Return the top-``n`` recommendations for ``user_id``.

    Thin wrapper around ``calculate`` that supplies the notebook-level
    ``user_item``, ``user_tag`` and ``tag_item`` count tables; ``method``
    is forwarded unchanged and defaults to ``'simple'``.
    """
    return calculate(
        user_id=user_id,
        user_item=user_item,
        user_tag=user_tag,
        tag_item=tag_item,
        n=n,
        method=method,
    )

In [24]:
# Top-10 recommendations for user 32 with each scoring method, timed one by
# one. perf_counter() is monotonic and high-resolution, which suits these
# millisecond-scale measurements better than time().
start = time.perf_counter()

p1_simple = predict(32, 10)     # default method: 'simple'
mid1 = time.perf_counter()

p1_tf = predict(32, 10, method='tfidf')
mid2 = time.perf_counter()

p1_normal = predict(32, 10, method='norm')
end = time.perf_counter()

print('execute time: ', '%.5f'%(mid1 - start) , 's')
print('execute time: ', '%.5f'%(mid2 - mid1) , 's')
print('execute time: ', '%.5f'%(end - mid2) , 's')

execute time:  0.03130 s
execute time:  0.04545 s
execute time:  0.02567 s


In [19]:
# Recommendations for user 32 scored with the 'simple' method.
p1_simple

{32: {1554: 10,
  358: 8,
  363: 8,
  2178: 7,
  4725: 7,
  10230: 7,
  3112: 7,
  3520: 7,
  7974: 7,
  1188: 6}}

In [20]:
# Recommendations for user 32 scored with the 'tfidf' method.
p1_tf

{32: {1554: 1.8151777552757613,
  17072: 1.3216359572210803,
  10230: 1.2643893116093072,
  4725: 1.2206854319065445,
  5388: 1.1820550639861649,
  1188: 1.1015768873329084,
  10271: 1.1015768873329084,
  18345: 1.1015768873329084,
  2178: 0.9744659045581325,
  358: 0.9528683565890902}}

In [21]:
# Recommendations for user 32 scored with the 'norm' method.
p1_normal

{32: {5388: 0.017997367216117218,
  12912: 0.016666666666666666,
  12306: 0.007154151473103359,
  37: 0.007142857142857143,
  6680: 0.007142857142857143,
  6686: 0.007142857142857143,
  9961: 0.007142857142857143,
  26299: 0.007142857142857143,
  17072: 0.003774350649350649,
  6468: 0.003314083020637899}}