In [38]:
import pandas as pd
import numpy as np
# import predict

print('初始化变量...')

# Column names applied to the tab-separated rating files when they are read.
names = ['user_id', 'item_id', 'rating', 'timestamp']
# Directory prefix prepended to every split file name below.
direct = 'dataset/ml-100k/'
# Paired training/test files, one pair per cross-validation fold.
# Stored as tuples rather than generator expressions: a generator is
# exhausted after a single pass, so running the evaluation loop a second
# time would silently iterate over nothing.
trainingset_files = tuple(direct + name for name in ('u1.base', 'u2.base', 'u3.base', 'u4.base', 'u5.base'))
testset_files = tuple(direct + name for name in ('u1.test', 'u2.test', 'u3.test', 'u4.test', 'u5.test'))
# Dimensions of the rating matrix built by the evaluation loop
# (rows = users, columns = items).
n_users = 943
n_items = 1682

def cal_sparsity():
    '''Print the percentage of rating-matrix cells that are non-zero.

    Reads the module-level ``ratings`` matrix together with ``n_users``
    and ``n_items``. Despite the function name, the printed figure is the
    share of filled cells (the density), not the share of empty ones.
    Returns nothing.
    '''
    filled_cells = len(ratings.nonzero()[0])
    density = float(filled_cells) / (n_users * n_items)
    density_percent = density * 100
    print('训练集矩阵密度为 {:4.2f}%'.format(density_percent))

def rmse(pred, actual):
    '''Return the root-mean-square error of ``pred`` against ``actual``.

    Only the positions where ``actual`` is non-zero are scored; a zero in
    ``actual`` is treated as "no rating" and the matching prediction is
    ignored.

    Args:
        pred: array-like of predictions, indexable with the non-zero
            positions of ``actual``.
        actual: array-like of observed values.

    Returns:
        The RMSE over the scored positions as a numpy float.

    Raises:
        ValueError: if ``actual`` has no non-zero entry, or if a scored
            value is NaN or infinite (a broken prediction should fail
            loudly instead of yielding a NaN score).
    '''
    pred = np.asarray(pred)
    actual = np.asarray(actual)
    rated = actual.nonzero()
    # Plain numpy arithmetic: no need to import scikit-learn on every call
    # just to average squared differences.
    errors = pred[rated].astype(float).flatten() - actual[rated].flatten()
    if errors.size == 0:
        raise ValueError('rmse() needs at least one non-zero entry in `actual`')
    if not np.all(np.isfinite(errors)):
        raise ValueError('rmse() received NaN or infinite values')
    return np.sqrt(np.mean(errors ** 2))

def cal_mean():
    '''Compute the global, per-user and per-item mean ratings.

    Reads the module-level ``ratings`` matrix (0 means "not rated") and
    stores the results in the module-level names ``all_mean`` (scalar),
    ``user_mean`` (one value per row) and ``item_mean`` (one value per
    column). Rows or columns without any rating fall back to ``all_mean``.
    Returns nothing.
    '''
    global all_mean, user_mean, item_mean
    print('计算总体均值，各user打分均值，各item打分均值...')
    rated = ratings != 0
    all_mean = np.mean(ratings[rated])
    # Axis sums run inside numpy instead of looping over rows with the
    # builtin sum(). A row/column without ratings gives 0/0 = NaN, which is
    # expected and replaced below, so the floating-point warning is muted.
    with np.errstate(divide='ignore', invalid='ignore'):
        user_mean = ratings.sum(axis=1) / rated.sum(axis=1)
        item_mean = ratings.sum(axis=0) / rated.sum(axis=0)
    user_mean = np.where(np.isnan(user_mean), all_mean, user_mean)
    item_mean = np.where(np.isnan(item_mean), all_mean, item_mean)
    
def cal_similarity(ratings, kind, epsilon=1e-9):
    '''Return the pairwise cosine-similarity matrix of rows or columns.

    Args:
        ratings: 2-D numpy array.
        kind: ``'user'`` compares the rows of ``ratings`` with each other,
            ``'item'`` compares its columns.
        epsilon: small constant added to every dot product so that an
            all-zero row/column does not cause a division by zero.

    Returns:
        A square array whose entry ``[a, b]`` is the dot product of
        vectors ``a`` and ``b`` (plus ``epsilon``) divided by both norms.

    Raises:
        ValueError: if ``kind`` is neither ``'user'`` nor ``'item'``.
    '''
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))
    # The diagonal holds each vector's squared norm (plus epsilon).
    norms = np.sqrt(np.diagonal(sim))
    return sim / norms[np.newaxis, :] / norms[:, np.newaxis]

def predict_baseline(user, item):
    '''Return the baseline estimate for ``user`` rating ``item``.

    The estimate is the item's mean plus the user's mean minus the global
    mean, read from the module-level ``item_mean``, ``user_mean`` and
    ``all_mean``. ``user`` and ``item`` are indices into those arrays.
    '''
    return item_mean[item] + user_mean[user] - all_mean

def predict_itemCF(user, item, k=100):
    '''Predict a rating with item-item collaborative filtering.

    The prediction is the similarity-weighted average of every rating the
    user has given, weighted by each rated item's similarity to ``item``.
    Reads the module-level ``ratings`` and ``item_similarity``.
    ``k`` is accepted but not used.
    '''
    rated_items = ratings[user].nonzero()[0]
    weights = item_similarity[item, rated_items]
    weighted_total = ratings[user, rated_items].dot(weights)
    return weighted_total / sum(weights)

def predict_itemCF_baseline(user, item, k=100):
    '''Predict a rating with item-item CF on top of the baseline estimate.

    For every item the user has rated, the deviation of the rating from
    the baseline is weighted by that item's similarity to ``item``; the
    weighted average deviation is then added to the baseline of ``item``.
    Reads the module-level ``ratings``, ``item_similarity``, ``item_mean``,
    ``user_mean`` and ``all_mean``. ``k`` is accepted but not used.
    '''
    rated_items = ratings[user].nonzero()[0]
    # Baseline estimate of this user for every item.
    baseline = item_mean + user_mean[user] - all_mean
    weights = item_similarity[item, rated_items]
    residuals = ratings[user, rated_items] - baseline[rated_items]
    return residuals.dot(weights) / sum(weights) + baseline[item]

def predict_userCF_baseline(user, item, k=100):
    '''Predict a rating with user-user CF on top of the baseline estimate.

    For every user who rated ``item``, the deviation of that rating from
    the baseline is weighted by the rater's similarity to ``user``; the
    weighted average deviation is added to the baseline of ``user``.
    If the result is NaN the plain baseline is returned instead.
    Reads the module-level ``ratings``, ``user_similarity``, ``user_mean``,
    ``item_mean`` and ``all_mean``. ``k`` is accepted but not used.
    '''
    raters = ratings[:, item].nonzero()[0]
    # Baseline estimate of every user for this item.
    baseline = user_mean + item_mean[item] - all_mean
    weights = user_similarity[user, raters]
    residuals = ratings[raters, item] - baseline[raters]
    estimate = residuals.dot(weights) / sum(weights) + baseline[user]
    return baseline[user] if np.isnan(estimate) else estimate

def predict_biasCF(user, item, k=100):
    '''Predict a rating with baseline-adjusted item-item CF, clamped to [1, 5].

    The computation matches ``predict_itemCF_baseline``: deviations of the
    user's ratings from the baseline are averaged with item-similarity
    weights and added to the baseline of ``item``. The result is then
    limited to the range 1..5.
    Reads the module-level ``ratings``, ``item_similarity``, ``item_mean``,
    ``user_mean`` and ``all_mean``. ``k`` is accepted but not used.
    '''
    nzero = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    prediction = (ratings[user, nzero] - baseline[nzero]).dot(item_similarity[item, nzero])\
                / sum(item_similarity[item, nzero]) + baseline[item]
    if prediction > 5:
        prediction = 5
    if prediction < 1:
        # Previously assigned to a misspelled name, so the lower bound
        # was never applied.
        prediction = 1
    return prediction

def predict_topkCF_item(user, item, k=20):
    '''Predict a rating with top-k item-item CF on top of the baseline.

    Among the items the user has rated, only the ``k`` most similar to
    ``item`` contribute. Their deviations from the baseline are averaged
    with similarity weights and added to the baseline of ``item``; the
    result is limited to the range 1..5.
    Reads the module-level ``ratings``, ``item_similarity``, ``item_mean``,
    ``user_mean`` and ``all_mean``.
    '''
    rated_items = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    # Positions within rated_items, ordered by descending similarity.
    by_similarity = item_similarity[item, rated_items].argsort()[::-1]
    neighbours = rated_items[by_similarity[:k]]
    weights = item_similarity[item, neighbours]
    residuals = ratings[user, neighbours] - baseline[neighbours]
    estimate = residuals.dot(weights) / sum(weights) + baseline[item]
    if estimate > 5:
        return 5
    if estimate < 1:
        return 1
    return estimate

def predict_topkCF_user(user, item, k=20):
    '''Predict a rating with top-k user-user CF on top of the baseline.

    Among the users who rated ``item``, only the ``k`` most similar to
    ``user`` contribute. Their deviations from the baseline are averaged
    with similarity weights and added to the baseline of ``user``. A NaN
    result is replaced by the plain baseline, and the final value is
    limited to the range 1..5.
    Reads the module-level ``ratings``, ``user_similarity``, ``user_mean``,
    ``item_mean`` and ``all_mean``.
    '''
    raters = ratings[:, item].nonzero()[0]
    # Positions within raters, ordered by descending similarity.
    by_similarity = user_similarity[user, raters].argsort()[::-1]
    neighbours = raters[by_similarity[:k]]
    baseline = user_mean + item_mean[item] - all_mean
    weights = user_similarity[user, neighbours]
    residuals = ratings[neighbours, item] - baseline[neighbours]
    estimate = residuals.dot(weights) / sum(weights) + baseline[user]
    if np.isnan(estimate):
        estimate = baseline[user]
    if estimate > 5:
        return 5
    if estimate < 1:
        return 1
    return estimate

def predict_blend(user, item, k1=20, k2=20):
    '''Average the top-k item-based and top-k user-based predictions.

    Args:
        user: row index of the user.
        item: column index of the item.
        k1: neighbourhood size passed to ``predict_topkCF_item``.
        k2: neighbourhood size passed to ``predict_topkCF_user``.

    Returns:
        The mean of both predictions, limited to the range 1..5.
    '''
    # The item-based predictor is defined as predict_topkCF_item; the
    # former call to ``predict_topkCF`` referenced a name this file does
    # not define.
    item_based = predict_topkCF_item(user, item, k1)
    user_based = predict_topkCF_user(user, item, k2)
    prediction = (item_based + user_based) / 2
    if prediction > 5:
        prediction = 5
    if prediction < 1:
        prediction = 1
    return prediction

if __name__ == '__main__':
    # Cross-validation driver: for every (training file, test file) pair
    # the rating matrix, the means and both similarity matrices are
    # rebuilt from the training file, then every predictor is scored on
    # the test file with RMSE. The mean RMSE per method is reported last.

    # RMSE of each method; one value is appended per fold.
    rmse_baseline = []
    rmse_itemCF = []
    rmse_itemCF_baseline = []
    rmse_userCF_baseline = []
    rmse_itemCF_bias = []
    rmse_topkCF_item = []
    rmse_topkCF_user = []
    rmse_blend = []

    # Neighbourhood size handed to the top-k predictors.
    top_k = 20
    # (report line format, predictor(user, item), per-fold RMSE list),
    # listed in the order the final report prints them.
    evaluated = [
        ('baseline:        %.4f', predict_baseline, rmse_baseline),
        ('itemCF:          %.4f', predict_itemCF, rmse_itemCF),
        ('itemCF_baseline: %.4f', predict_itemCF_baseline, rmse_itemCF_baseline),
        ('userCF_baseline: %.4f', predict_userCF_baseline, rmse_userCF_baseline),
        ('biasCF:          %.4f', predict_biasCF, rmse_itemCF_bias),
        # predict_topkCF_item is the item-based top-k predictor; the
        # former call to ``predict_topkCF`` referenced an undefined name.
        ('topkCF(item):    %.4f',
         lambda user, item: predict_topkCF_item(user, item, top_k),
         rmse_topkCF_item),
        ('topkCF(user):    %.4f',
         lambda user, item: predict_topkCF_user(user, item, top_k),
         rmse_topkCF_user),
        ('blend topkCF:    %.4f',
         lambda user, item: predict_blend(user, item, top_k, top_k),
         rmse_blend),
    ]

    nums = 5
    for i, (trainingset_file, testset_file) in enumerate(
            zip(trainingset_files, testset_files), start=1):
        print('------ 第%d/%d组样本 ------' % (i, nums))
        df = pd.read_csv(trainingset_file, sep='\t', names=names)

        # Dense user x item matrix; 0 marks a missing rating. The ids in
        # the file are 1-based, hence the -1 when indexing.
        ratings = np.zeros((n_users, n_items))
        print('载入训练集' + trainingset_file + '...')
        ratings[df['user_id'].values - 1, df['item_id'].values - 1] = df['rating'].values

        print('载入完成.')
        print('打分矩阵规模为 %d*%d.' % (n_users, n_items))
        print('训练集有效打分个数为 %d.' % len(df))

        cal_sparsity()
        print('计算训练集各项统计数据...')
        cal_mean()

        print('计算相似度矩阵...')
        user_similarity = cal_similarity(ratings, kind='user')
        item_similarity = cal_similarity(ratings, kind='item')
        print('计算完成.')

        print('载入测试集' + testset_file + '...')
        test_df = pd.read_csv(testset_file, sep='\t', names=names)
        # One prediction list per method, aligned with ``evaluated``.
        fold_predictions = [[] for _ in evaluated]
        targets = []
        print('测试集大小为 %d' % len(test_df))
        print('测试中...')
        for row in test_df.itertuples():
            user, item, actual = row[1]-1, row[2]-1, row[3]
            for collected, (_, predictor, _) in zip(fold_predictions, evaluated):
                collected.append(predictor(user, item))
            targets.append(actual)

        actual_ratings = np.array(targets)
        for collected, (_, _, fold_scores) in zip(fold_predictions, evaluated):
            fold_scores.append(rmse(np.array(collected), actual_ratings))
        print('测试完成.')

    print('------ 测试结果 ------')
    print('各方法在交叉验证下的RMSE值:')
    for report_format, _, fold_scores in evaluated:
        print(report_format % np.mean(fold_scores))
    print('交叉验证运行完成.')
    

初始化变量...
------ 第1/5组样本 ------
载入训练集dataset/ml-100k/u1.base...
载入完成.
打分矩阵规模为 943*1682.
训练集有效打分个数为 80000.
训练集矩阵密度为 5.04%
计算训练集各项统计数据...
计算总体均值，各user打分均值，各item打分均值...
计算相似度矩阵...
计算完成.
载入测试集dataset/ml-100k/u1.test...
测试集大小为 20000
测试中...
测试完成.
------ 第2/5组样本 ------
载入训练集dataset/ml-100k/u2.base...
载入完成.
打分矩阵规模为 943*1682.
训练集有效打分个数为 80000.
训练集矩阵密度为 5.04%
计算训练集各项统计数据...
计算总体均值，各user打分均值，各item打分均值...
计算相似度矩阵...
计算完成.
载入测试集dataset/ml-100k/u2.test...
测试集大小为 20000
测试中...
测试完成.
------ 第3/5组样本 ------
载入训练集dataset/ml-100k/u3.base...
载入完成.
打分矩阵规模为 943*1682.
训练集有效打分个数为 80000.
训练集矩阵密度为 5.04%
计算训练集各项统计数据...
计算总体均值，各user打分均值，各item打分均值...
计算相似度矩阵...
计算完成.
载入测试集dataset/ml-100k/u3.test...
测试集大小为 20000
测试中...
测试完成.
------ 第4/5组样本 ------
载入训练集dataset/ml-100k/u4.base...
载入完成.
打分矩阵规模为 943*1682.
训练集有效打分个数为 80000.
训练集矩阵密度为 5.04%
计算训练集各项统计数据...
计算总体均值，各user打分均值，各item打分均值...
计算相似度矩阵...
计算完成.
载入测试集dataset/ml-100k/u4.test...
测试集大小为 20000
测试中...
测试完成.
------ 第5/5组样本 ------
载入训练集dataset/ml-100k/u5.base...
载入完成.
打分矩阵规模为