# Tag-based Recommendation

## 获取数据

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column names for the photo/tag pair file, in the order the columns appear.
header = [
    'photo_id',
    'tag_id',
]

In [3]:
# The names are supplied explicitly, so the first line of the file is read as
# data rather than as a header row (header=None is what `names` implies).
df = pd.read_csv(
    '../scidata/photo_tag.csv',
    header=None,
    names=header,
)

In [4]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,photo_id,tag_id
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
5,1,6
6,1,7
7,1,8
8,1,9
9,1,10


In [5]:
# duplicated() flags every row that repeats an earlier (photo_id, tag_id) row,
# so summing the boolean mask counts the redundant rows.
print('Redundant photo_id and tag_id num: {0}'.format(
    int(df.duplicated().sum())
))

Redundant photo_id and tag_id num: 156


In [6]:
# Number of distinct ids in each column; used below as the two dimensions of
# the photo/tag matrix.
photo_num = len(df['photo_id'].unique())
tag_num = len(df['tag_id'].unique())

In [7]:
# Report the distinct photo and tag counts computed in the previous cell.
print('Real Photo Num: {0}, Real Tag Num: {1}'.format(photo_num, tag_num))

Real Photo Num: 8837, Real Tag Num: 19368


## 建立图片标签矩阵

In [8]:
# Build the dense binary photo x tag matrix: entry [p, t] is 1 when the pair
# (photo_id = p + 1, tag_id = t + 1) occurs in df. Ids are 1-based, matrix
# indices are 0-based.
photo_idx = df['photo_id'].values - 1
tag_idx = df['tag_id'].values - 1

# The matrix is sized by the number of distinct ids, so the ids must be exactly
# 1..photo_num and 1..tag_num. Check this explicitly: an id <= 0 would
# otherwise wrap around to the end of the axis through negative indexing and
# silently mark the wrong cell, and an id past the end would fail with a bare
# IndexError that does not explain the cause.
if photo_idx.size and (
    photo_idx.min() < 0 or photo_idx.max() >= photo_num
    or tag_idx.min() < 0 or tag_idx.max() >= tag_num
):
    raise ValueError(
        'photo_id and tag_id must be contiguous 1-based ids '
        '(1..photo_num, 1..tag_num) to be used as matrix indices'
    )

photo_tag_matrix = np.zeros((photo_num, tag_num))
# A single vectorised index assignment replaces the per-row Python loop;
# repeated (photo, tag) pairs just write 1 to the same cell again.
photo_tag_matrix[photo_idx, tag_idx] = 1

In [9]:
# Display the matrix; NumPy abbreviates a large array with ellipses.
photo_tag_matrix

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Sanity check: the shape should be (photo_num, tag_num).
photo_tag_matrix.shape

(8837, 19368)

In [11]:
# Tally how often each distinct value occurs in the matrix, to see how many
# cells are set versus empty.
unique, counts = np.unique(photo_tag_matrix, return_counts=True)
{value: count for value, count in zip(unique, counts)}

{0.0: 171037845, 1.0: 117171}

## 相似度计算

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
%%time
# Pairwise cosine similarity between the rows (photos) of the photo/tag matrix;
# the result is a square photo x photo array.
photo_similarity = cosine_similarity(photo_tag_matrix)
# Zero the diagonal in place so that a photo's similarity to itself is not
# counted as a positive score.
np.fill_diagonal(photo_similarity, 0)

Wall time: 35.1 s


In [14]:
# Display the similarity array (abbreviated by NumPy); the diagonal was zeroed
# in the previous cell.
photo_similarity

array([[0.        , 0.        , 0.06063391, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.06063391, 0.        , 0.        , ..., 0.        , 0.        ,
        0.10846523],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.18257419,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.18257419, 0.        ,
        0.        ],
       [0.        , 0.        , 0.10846523, ..., 0.        , 0.        ,
        0.        ]])

In [15]:
# Sanity check: one row and one column per photo.
photo_similarity.shape

(8837, 8837)

## 推荐

In [16]:
# For every photo, collect the indices of its TOP_K_NUM most similar photos,
# best match first. Indices are zero-based row positions of photo_similarity.
rec_result = photo_similarity

TOP_K_NUM = 10
num_photos = rec_result.shape[0]
# A photo has at most num_photos - 1 neighbours other than itself; clamping
# also avoids the `[-0:]` slice pitfall when there is nothing to recommend.
neighbour_count = max(min(TOP_K_NUM, num_photos - 1), 0)

rec_list = []
for photo_id in range(num_photos):
    # Work on a copy of the row and push the photo's own entry to -inf. With a
    # diagonal of 0 the photo itself ties with every zero-similarity photo, so
    # a photo with fewer than TOP_K_NUM positive neighbours could otherwise be
    # recommended to itself.
    scores = rec_result[photo_id].copy()
    scores[photo_id] = -np.inf
    # Ascending argsort reversed gives best-first order; keep the leading k.
    # NOTE: photos with fewer than TOP_K_NUM positive-similarity neighbours are
    # still padded with zero-similarity photos, in unspecified tie order.
    rec_item_per_photo = scores.argsort()[::-1][:neighbour_count].tolist()
    rec_list.append(rec_item_per_photo)

In [17]:
# Report the recommendation table as (number of photos, items per photo). The
# second figure is read from the first row only, so rec_list must be non-empty.
print('rec_list shape: ({0}, {1})'.format(len(rec_list), len(rec_list[0])))

rec_list shape: (8837, 10)


In [18]:
# 8837 rows x 10 recommendations. Both the row position and every entry are
# zero-based indices, i.e. photo_id - 1.
rec_list

[[106, 20, 46, 129, 68, 435, 6, 7, 5580, 76],
 [78, 84, 54, 47, 91, 117, 94, 50, 125, 89],
 [34, 15, 3, 5849, 5882, 5888, 5896, 1462, 1493, 12],
 [1478, 15, 71, 8121, 105, 8276, 133, 2, 37, 1452],
 [108, 79, 37, 58, 78, 112, 84, 80, 117, 99],
 [2127, 2202, 2219, 2149, 2212, 2213, 2215, 2155, 2093, 2220],
 [86, 45, 70, 97, 134, 69, 42, 123, 23, 101],
 [8157, 106, 101, 109, 0, 5813, 17, 40, 68, 114],
 [123, 87, 9, 103, 45, 40, 23, 69, 42, 114],
 [123, 119, 40, 87, 42, 114, 28, 69, 8, 23],
 [82, 140, 35, 22, 3907, 129, 68, 3899, 124, 77],
 [45, 69, 24, 42, 114, 32, 70, 87, 23, 9],
 [5888, 5896, 5882, 5849, 23, 8409, 14, 136, 2927, 24],
 [36, 95, 1045, 100, 6213, 81, 91, 47, 60, 8259],
 [136, 7217, 28, 45, 7977, 7259, 49, 55, 23, 11],
 [1478, 3, 2635, 8400, 139, 2055, 2, 2911, 5586, 1452],
 [790, 3854, 100, 1442, 122, 113, 36, 132, 784, 797],
 [68, 77, 2959, 129, 35, 5904, 7, 106, 2254, 3543],
 [108, 37, 41, 84, 121, 90, 59, 78, 62, 122],
 [1443, 10, 129, 68, 20, 7189, 77, 1446, 82, 2456],