# Image-based CF

## 获取数据

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column labels for the favourites CSV (passed as `names=` to pd.read_csv below).
header = ['user_id', 'photo_id', 'is_fav']

In [3]:
# Load the user/photo favourite records. Because column names are supplied via
# `names=`, pandas reads the first line of the file as data, not as a header.
df = pd.read_csv('../scidata/user_photo_fav.csv', names=header)

In [4]:
# Preview the loaded table as a quick visual sanity check.
df

Unnamed: 0,user_id,photo_id,is_fav
0,1,5236,1
1,2,1,1
2,2,11,1
3,2,20,1
4,2,24,1
5,2,36,1
6,2,82,1
7,2,83,1
8,2,84,1
9,2,107,1


In [5]:
# Sanity check: report how many rows repeat an earlier row exactly.
print('Redundant row num: {0}'.format(df.duplicated().sum()))

Redundant row num: 0


In [6]:
# Count the distinct users and photos that actually occur in the data.
user_number = len(df['user_id'].unique())
photo_number = len(df['photo_id'].unique())

In [7]:
print('Real User Num: {0}, Real Photo Num: {1}'.format(user_number, photo_number))
# Dimensions of the user x photo matrices built later in the notebook.
# NOTE(review): USER_NUMBER (23259) is larger than the real user count printed
# by this cell (23185) — presumably it is the largest user_id, because ids are
# used as row indices (user_id - 1); TODO confirm against the data.
USER_NUMBER = 23259
PHOTO_NUMBER = 8837

Real User Num: 23185, Real Photo Num: 8837


## 建立评分矩阵

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the favourite records for evaluation. The split is seeded so
# the matrices and the RMSE derived from it are reproducible on every re-run.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
# Dense user x photo matrix holding the training favourites. Each id is
# shifted down by one to obtain the row / column index.
train_matrix = np.zeros((USER_NUMBER, PHOTO_NUMBER))
for fav in train_data.itertuples(index=False):
    train_matrix[fav.user_id - 1, fav.photo_id - 1] = fav.is_fav

In [11]:
# Inspect the training matrix (rows: users, columns: photos).
train_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# Tally how many cells hold each distinct value, which shows how sparse the
# training matrix is.
unique, counts = np.unique(train_matrix, return_counts=True)
dict(zip(unique, counts))

{0.0: 205469087, 1.0: 70696}

In [13]:
# Dense user x photo matrix holding the held-out favourites, built the same
# way as the training matrix (each id shifted down by one to get the index).
test_matrix = np.zeros((USER_NUMBER, PHOTO_NUMBER))
for fav in test_data.itertuples(index=False):
    test_matrix[fav.user_id - 1, fav.photo_id - 1] = fav.is_fav

In [14]:
# Inspect the held-out matrix (rows: users, columns: photos).
test_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [15]:
# Tally how many cells hold each distinct value in the held-out matrix.
unique, counts = np.unique(test_matrix, return_counts=True)
dict(zip(unique, counts))

{0.0: 205522109, 1.0: 17674}

## 相似度计算

In [16]:
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

In [17]:
# Photo-to-photo cosine similarity. The matrix is transposed so that each row
# handed to cosine_similarity is one photo's vector of user favourites.
item_similarity = cosine_similarity(train_matrix.T)

In [18]:
# Inspect the photo x photo similarity matrix.
item_similarity

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.13608276],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.11785113,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.11785113, 1.        ,
        0.14433757],
       [0.        , 0.13608276, 0.        , ..., 0.        , 0.14433757,
        1.        ]])

## 预测

In [19]:
# Raw score of every photo for every user: the user's training favourites
# weighted by their similarity to that photo and summed.
rec_result = train_matrix.dot(item_similarity)

In [20]:
# Normalise each photo's score by that photo's total absolute similarity.
# If all of a photo's similarities are zero (e.g. it has no favourites in the
# training split) the total is 0; np.divide with `where=` leaves those scores
# at 0 instead of producing NaN (0/0) and a RuntimeWarning.
similarity_totals = np.abs(item_similarity).sum(axis=1)
image_based_prediction = np.divide(
    rec_result,
    similarity_totals,
    out=np.zeros_like(rec_result),
    where=similarity_totals != 0,
)

In [21]:
# Count the non-zero entries of the prediction matrix.
np.count_nonzero(image_based_prediction)

5069926

In [22]:
# Inspect the normalised prediction matrix (rows: users, columns: photos).
image_based_prediction

array([[0.        , 0.00291399, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.3921716 , 0.0104254 , 0.        , ..., 0.        , 0.        ,
        0.01601418],
       [0.37070712, 0.01741202, 0.        , ..., 0.        , 0.00682712,
        0.0089522 ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.07168385, 0.00643667,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [23]:
# Confirm the prediction matrix has one row per user and one column per photo.
image_based_prediction.shape

(23259, 8837)

## 预测准确度评价

In [24]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [25]:
# Ground-truth values at the held-out positions (the non-zero cells of the
# test matrix), collected into a 1-D array.
test_matrix_flatten = test_matrix[np.nonzero(test_matrix)]

In [26]:
image_based_prediction = image_based_prediction[test_matrix.nonzero()].flatten()

In [27]:
sqrt(mean_squared_error(image_based_prediction, test_matrix_flatten))

0.9613504458022978

## 实行算法

In [28]:
# Number of photos recommended to each user.
TOP_K_NUM = 10

def get_recommend_list_by_itemcf(csv_path='../scidata/user_photo_fav.csv', top_k=None):
    """Build a top-K photo recommendation list for every user (item-based CF).

    The favourite records are loaded from ``csv_path`` into a dense
    ``(USER_NUMBER, PHOTO_NUMBER)`` matrix, photo-to-photo cosine similarity
    is computed from that matrix, and each user's photos are ranked by their
    similarity-weighted, normalised score.

    Args:
        csv_path: Header-less CSV whose columns are user_id, photo_id, is_fav.
        top_k: Number of photos to return per user. ``None`` (the default)
            falls back to the module-level ``TOP_K_NUM``.

    Returns:
        A list with one entry per matrix row (row index = user_id - 1). Each
        entry is a list of ``top_k`` photo indices (photo_id - 1), ordered
        from the highest score to the lowest.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.

    Note:
        Photos a user has already favourited are not filtered out, so they
        can appear in that user's list.
    """
    if top_k is None:
        top_k = TOP_K_NUM
    if top_k < 1:
        # A slice of [-0:] would silently return the whole row.
        raise ValueError('top_k must be a positive integer, got {0}'.format(top_k))

    header = ['user_id', 'photo_id', 'is_fav']
    df = pd.read_csv(csv_path, names=header)

    fav_matrix = np.zeros((USER_NUMBER, PHOTO_NUMBER))
    for fav in df.itertuples(index=False):
        # Ids are shifted down by one to become matrix indices; add 1 to an
        # index to recover the real id.
        fav_matrix[fav.user_id - 1, fav.photo_id - 1] = fav.is_fav

    item_similarity = cosine_similarity(fav_matrix.T)
    rec_result = fav_matrix.dot(item_similarity)

    # Normalise by each photo's total absolute similarity. Where that total is
    # 0 the score is left at 0 rather than becoming NaN (0/0); NaN values sort
    # last in argsort and would otherwise be ranked as the best photos.
    similarity_totals = np.abs(item_similarity).sum(axis=1)
    rec_result = np.divide(
        rec_result,
        similarity_totals,
        out=np.zeros_like(rec_result),
        where=similarity_totals != 0,
    )

    # Rank row by row to avoid materialising a full-size index matrix.
    return [
        user_scores.argsort()[-top_k:][::-1].tolist()
        for user_scores in rec_result
    ]

# Build the recommendation list for every user from the complete data set.
rec_list = get_recommend_list_by_itemcf()

In [29]:
# Report the result's dimensions: number of users x recommendations per user.
print('rec_list shape: ({0}, {1})'.format(len(rec_list), len(rec_list[0])))

rec_list shape: (23259, 10)


In [30]:
rec_list  # 23259 rows x 10 columns; row index = user_id - 1, values = photo_id - 1

[[5235, 2487, 5860, 6560, 4275, 5198, 509, 1259, 1026, 5244],
 [3893, 3899, 19, 5174, 3331, 1861, 2290, 5214, 35, 106],
 [4739, 4937, 2725, 8170, 4944, 4870, 521, 440, 566, 4929],
 [67, 7558, 19, 106, 35, 7614, 7638, 0, 138, 649],
 [491, 518, 593, 4389, 138, 579, 4484, 4491, 571, 4376],
 [1822, 7593, 2409, 1829, 97, 2473, 2446, 7648, 34, 7555],
 [120, 42, 68, 12, 106, 46, 86, 28, 56, 49],
 [67, 19, 106, 0, 138, 77, 35, 22, 649, 7558],
 [19, 138, 7167, 35, 67, 82, 0, 106, 124, 140],
 [0, 19, 106, 67, 35, 1822, 138, 82, 518, 491],
 [90, 30, 19, 106, 105, 0, 67, 57, 35, 41],
 [1383, 1352, 1416, 1372, 1347, 1367, 1362, 1402, 1403, 1374],
 [3842, 3302, 8178, 2681, 2522, 8218, 3300, 2660, 8208, 2657],
 [1729, 6974, 1, 1756, 1772, 5664, 693, 1766, 1765, 1769],
 [70, 89, 8, 99, 46, 1, 28, 18, 101, 68],
 [6003, 5993, 6029, 5994, 6041, 6006, 5969, 5992, 6035, 6054],
 [1, 5664, 41, 6003, 3455, 5993, 7650, 7558, 8, 68],
 [7558, 7650, 7617, 7562, 7592, 571, 7631, 7670, 582, 8015],
 [5664, 41, 3455,