In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

In [2]:
print('初始化变量...')
names = ['user_id', 'item_id', 'rating', 'timestamp']
trainingset_file = 'dataset/ml-100k/u3.base'
testset_file= 'dataset/ml-100k/u3.test'
n_users = 943
n_items = 1682
ratings = np.zeros((n_users, n_items))

初始化变量...


In [3]:
df = pd.read_csv(trainingset_file, sep='\t', names=names)
print('载入训练集...')
print('数据集样例为:')
print(df.head())
for row in df.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]
print('载入完成.')
print('打分矩阵规模为 %d*%d.' % (n_users, n_items))
print('训练集有效打分个数为 %d.' % len(df))

载入训练集...
数据集样例为:
   user_id  item_id  rating  timestamp
0        1        1       5  874965758
1        1        2       3  876893171
2        1        3       4  878542960
3        1        4       3  876893119
4        1        6       5  887431973
载入完成.
打分矩阵规模为 943*1682.
测试集有效打分个数为 80000.


In [4]:
# 计算矩阵密度
def cal_sparsity():
    sparsity = float(len(ratings.nonzero()[0]))
    sparsity /= (ratings.shape[0] * ratings.shape[1])
    sparsity *= 100
    print('训练集矩阵密度为: {:4.2f}%'.format(sparsity))

cal_sparsity()
print()

测试集矩阵密度为: 5.04%



In [5]:
def rmse(pred, actual):
    '''计算预测结果的rmse'''
    from sklearn.metrics import mean_squared_error
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return np.sqrt(mean_squared_error(pred, actual))

In [6]:
print('------ Naive算法(baseline) ------')

------ Naive算法(baseline) ------


In [7]:
def cal_mean():
    '''Calculate mean value'''
    print('计算总体均值，各user打分均值，各item打分均值...')
    global all_mean, user_mean, item_mean
    all_mean = np.mean(ratings[ratings!=0])
    user_mean = sum(ratings.T) / sum((ratings!=0).T)
    item_mean = sum(ratings) / sum((ratings!=0))
    print('是否存在User/Item 均值为NaN?', np.isnan(user_mean).any(), np.isnan(item_mean).any())
    print('对NaN填充总体均值...')
    user_mean = np.where(np.isnan(user_mean), all_mean, user_mean)
    item_mean = np.where(np.isnan(item_mean), all_mean, item_mean)
    print('是否存在User/Item 均值为NaN?', np.isnan(user_mean).any(), np.isnan(item_mean).any())
    print('均值计算完成，总体打分均值为 %.4f' % all_mean)

In [8]:
print('计算训练集各项统计数据...')
cal_mean()

计算训练集各项统计数据...
计算总体均值，各user打分均值，各item打分均值...
是否存在User/Item 均值为NaN? False True
对NaN填充总体均值...
是否存在User/Item 均值为NaN? False False
均值计算完成，总体打分均值为 3.5311


In [9]:
def predict_naive(user, item):
    prediction = item_mean[item] + user_mean[user] - all_mean
    return prediction

In [10]:
print('载入测试集...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用Naive算法进行预测...')
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_naive(user, item))
    targets.append(actual)

print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))
print()

载入测试集...
测试集大小为 20000
采用Naive算法进行预测...
测试结果的rmse为 0.9691



In [11]:
print('------ item-item协同过滤算法(相似度未归一化) ------')

------ item-item协同过滤算法(相似度未归一化) ------


In [12]:
def cal_similarity(ratings, kind, epsilon=1e-9):
    '''利用Cosine距离计算相似度'''
    '''epsilon: 防止Divide-by-zero错误，进行矫正'''
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [13]:
print('计算相似度矩阵...')
user_similarity = cal_similarity(ratings, kind='user')
item_similarity = cal_similarity(ratings, kind='item')
print('计算完成.')
print('相似度矩阵样例: (item-item)')
print(np.round_(item_similarity[:10,:10], 3))

计算相似度矩阵...
计算完成.
相似度矩阵样例: (item-item)
[[ 1.     0.296  0.279  0.388  0.252  0.114  0.518  0.41   0.416  0.199]
 [ 0.296  1.     0.177  0.405  0.211  0.099  0.331  0.31   0.207  0.152]
 [ 0.279  0.177  1.     0.275  0.118  0.104  0.311  0.125  0.207  0.121]
 [ 0.388  0.405  0.275  1.     0.265  0.091  0.411  0.391  0.357  0.219]
 [ 0.252  0.211  0.118  0.265  1.     0.016  0.28   0.214  0.202  0.031]
 [ 0.114  0.099  0.104  0.091  0.016  1.     0.128  0.065  0.164  0.139]
 [ 0.518  0.331  0.311  0.411  0.28   0.128  1.     0.342  0.43   0.279]
 [ 0.41   0.31   0.125  0.391  0.214  0.065  0.342  1.     0.364  0.166]
 [ 0.416  0.207  0.207  0.357  0.202  0.164  0.43   0.364  1.     0.25 ]
 [ 0.199  0.152  0.121  0.219  0.031  0.139  0.279  0.166  0.25   1.   ]]


In [14]:
def predict_itemCF(user, item, k=100):
    '''item-item协同过滤算法,预测rating'''
    nzero = ratings[user].nonzero()[0]
    prediction = ratings[user, nzero].dot(item_similarity[item, nzero])\
                / sum(item_similarity[item, nzero])
    return prediction

In [15]:
print('载入测试集...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用item-item协同过滤算法进行预测...')
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_itemCF(user, item))
    targets.append(actual)

print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))
print()

载入测试集...
测试集大小为 20000
采用item-item协同过滤算法进行预测...
测试结果的rmse为 1.0042



In [16]:
print('------ 结合baseline的item-item协同过滤算法(相似度未归一化) ------')

------ 结合baseline的item-item协同过滤算法(相似度未归一化) ------


In [17]:
def predict_itemCF_baseline(user, item, k=100):
    '''结合baseline的item-item CF算法,预测rating'''
    nzero = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    prediction = (ratings[user, nzero] - baseline[nzero]).dot(item_similarity[item, nzero])\
                / sum(item_similarity[item, nzero]) + baseline[item]
    return prediction 

In [18]:
print('载入测试集...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用结合baseline的item-item协同过滤算法进行预测...')
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_itemCF_baseline(user, item))
    targets.append(actual)

print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))
print()

载入测试集...
测试集大小为 20000
采用结合baseline的item-item协同过滤算法进行预测...
测试结果的rmse为 0.9345



In [19]:
print('------ user-user协同过滤算法(相似度未归一化) ------')

def predict_userCF(user, item, k=100):
    '''user-user协同过滤算法,预测rating'''
    nzero = ratings[:,item].nonzero()[0]
    baseline = user_mean + item_mean[item] - all_mean
    prediction = ratings[nzero, item].dot(user_similarity[user, nzero])\
                / sum(user_similarity[user, nzero])
    # 冷启动问题: 该item暂时没有评分
    if np.isnan(prediction):
        prediction = baseline[user]
    return prediction

print('载入测试集...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用user-user协同过滤算法进行预测...')

for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_userCF(user, item))
    targets.append(actual)

print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))
print()

------ user-user协同过滤算法(相似度未归一化) ------
载入测试集...
测试集大小为 20000
采用user-user协同过滤算法进行预测...
测试结果的rmse为 1.0133



In [20]:
print('------ 结合baseline的user-user协同过滤算法(相似度未归一化) ------')

def predict_userCF_baseline(user, item, k=100):
    '''结合baseline的user-user协同过滤算法,预测rating'''
    nzero = ratings[:,item].nonzero()[0]
    baseline = user_mean + item_mean[item] - all_mean
    prediction = (ratings[nzero, item] - baseline[nzero]).dot(user_similarity[user, nzero])\
                / sum(user_similarity[user, nzero]) + baseline[user]
    if np.isnan(prediction):
        prediction = baseline[user]
    return prediction

print('载入测试集...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用结合baseline的user-user协同过滤算法进行预测...')

for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_userCF_baseline(user, item))
    targets.append(actual)
    
print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))
print()

------ 结合baseline的user-user协同过滤算法(相似度未归一化) ------
载入测试集...
测试集大小为 20000
采用结合baseline的user-user协同过滤算法进行预测...
测试结果的rmse为 0.9519



In [21]:
print('------ 经过修正后的协同过滤 ------')
def predict_biasCF(user, item, k=100):
    '''结合baseline的item-item CF算法,预测rating'''
    nzero = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    prediction = (ratings[user, nzero] - baseline[nzero]).dot(item_similarity[item, nzero])\
                / sum(item_similarity[item, nzero]) + baseline[item]
    if prediction > 5:
        prediction = 5
    if prediction < 1:
        prediciton = 1
    return prediction

print('载入测试集...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用结合baseline的item-item协同过滤算法进行预测...')
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_biasCF(user, item))
    targets.append(actual)

print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))
print()

------ 经过修正后的协同过滤 ------
载入测试集...
测试集大小为 20000
采用结合baseline的item-item协同过滤算法进行预测...
测试结果的rmse为 0.9344



In [22]:
print('------ Top-k协同过滤(item-item, baseline, 矫正)------')
def predict_topkCF(user, item, k=10):
    '''top-k CF算法,以item-item协同过滤为基础，结合baseline,预测rating'''
    nzero = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    choice = nzero[item_similarity[item, nzero].argsort()[::-1][:k]]
    prediction = (ratings[user, choice] - baseline[choice]).dot(item_similarity[item, choice])\
                / sum(item_similarity[item, choice]) + baseline[item]
    if prediction > 5: prediction = 5
    if prediction < 1: prediction = 1
    return prediction 

print('载入测试集...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用top K协同过滤算法进行预测...')
k = 20
print('选取的K值为%d.' % k)
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_topkCF(user, item, k))
    targets.append(actual)

print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))
print()

------ Top-k协同过滤(item-item, baseline, 矫正)------
载入测试集...
测试集大小为 20000
采用top K协同过滤算法进行预测...
选取的K值为20.
测试结果的rmse为 0.9181



In [23]:
print('经检验，在100k数据上，K=20为佳.')

经检验，在100k数据上，K=20为佳.


In [24]:
print('------ baseline + item-item + 矫正 + TopK + 归一化矩阵 ------')

------ baseline + item-item + 矫正 + TopK + 归一化矩阵 ------


In [25]:
def cal_similarity_norm(ratings, kind, epsilon=1e-9):
    '''采用归一化的指标:Pearson correlation coefficient'''
    if kind == 'user':
        # 对同一个user的打分归一化
        rating_user_diff = ratings.copy()
        for i in range(ratings.shape[0]):
            nzero = ratings[i].nonzero()
            rating_user_diff[i][nzero] = ratings[i][nzero] - user_mean[i]
        sim = rating_user_diff.dot(rating_user_diff.T) + epsilon
    elif kind == 'item':
        # 对同一个item的打分归一化
        rating_item_diff = ratings.copy()
        for j in range(ratings.shape[1]):
            nzero = ratings[:,j].nonzero()
            rating_item_diff[:,j][nzero] = ratings[:,j][nzero] - item_mean[j]
        sim = rating_item_diff.T.dot(rating_item_diff) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

print('计算归一化的相似度矩阵...')
user_similarity_norm = cal_similarity_norm(ratings, kind='user')
item_similarity_norm = cal_similarity_norm(ratings, kind='item')
print('计算完成.')
print('相似度矩阵样例: (item-item)')
print(np.round_(item_similarity_norm[:10,:10], 3))

计算归一化的相似度矩阵...
计算完成.
相似度矩阵样例: (item-item)
[[ 1.     0.053  0.055  0.028  0.125  0.046  0.051  0.07   0.039  0.022]
 [ 0.053  1.     0.021  0.122  0.021 -0.007  0.052  0.109 -0.061  0.051]
 [ 0.055  0.021  1.    -0.035  0.013  0.048 -0.011 -0.003 -0.048  0.044]
 [ 0.028  0.122 -0.035  1.    -0.008 -0.028  0.053  0.087  0.028  0.036]
 [ 0.125  0.021  0.013 -0.008  1.    -0.011  0.104  0.025  0.043 -0.016]
 [ 0.046 -0.007  0.048 -0.028 -0.011  1.     0.026 -0.071  0.035  0.013]
 [ 0.051  0.052 -0.011  0.053  0.104  0.026  1.     0.051  0.143  0.025]
 [ 0.07   0.109 -0.003  0.087  0.025 -0.071  0.051  1.     0.019  0.043]
 [ 0.039 -0.061 -0.048  0.028  0.043  0.035  0.143  0.019  1.     0.005]
 [ 0.022  0.051  0.044  0.036 -0.016  0.013  0.025  0.043  0.005  1.   ]]


In [39]:
def predict_norm_CF(user, item, k=20):
    '''baseline + item-item + '''
    nzero = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    choice = nzero[item_similarity_norm[item, nzero].argsort()[::-1][:k]]
    prediction = (ratings[user, choice] - baseline[choice]).dot(item_similarity_norm[item, choice])\
                / sum(item_similarity_norm[item, choice]) + baseline[item]
    if prediction > 5: prediction = 5
    if prediction < 1: prediction = 1
    return prediction 

print('载入测试集...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
predictions = []
targets = []
print('测试集大小为 %d' % len(test_df))
print('采用归一化矩阵方法，结合其它trick进行预测...')
k = 13
print('选取的K值为%d.' % k)
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions.append(predict_norm_CF(user, item, k))
    targets.append(actual)

print('测试结果的rmse为 %.4f' % rmse(np.array(predictions), np.array(targets)))
print()

载入测试集...
测试集大小为 20000
采用归一化矩阵方法，结合其它trick进行预测...
选取的K值为13.
测试结果的rmse为 0.9200



In [None]:
print('------ 测试Top K ------')