In [295]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

In [296]:
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('dataset/ml-100k/u1.base', sep='\t', names=names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [297]:
# n_users = df.user_id.unique().shape[0]
# n_items = df.item_id.unique().shape[0]
n_users = 943
n_items = 1682
print(str(n_users) + ' users')
print(str(n_items) + ' items')
print(str(len(df)) + ' ratings')

943 users
1682 items
80000 ratings


In [298]:
ratings = np.zeros((n_users, n_items))
print('载入测试集...')
for row in df.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]
print('载入完成.')

(943, 1682)


In [300]:
# 计算矩阵密度
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print('测试集矩阵密度为: {:4.2f}%'.format(sparsity))

矩阵密度为: 5.04%


In [303]:
# 较慢的版本，表述了CF的计算方式
def slow_similarity(ratings, kind):
    if kind == 'user':
        axmax = 0
        axmin = 1
    elif kind == 'item':
        axmax = 1
        axmin = 0
    sim = np.zeros((ratings.shape[axmax], ratings.shape[axmax]))
    for u in range(ratings.shape[axmax]):
        for uprime in range(ratings.shape[axmax]):
            rui_sqrd = 0.
            ruprimei_sqrd = 0.
            for i in range(ratings.shape[axmin]):
                sim[u, uprime] = ratings[u, i] * ratings[uprime, i]
                rui_sqrd += ratings[u, i] ** 2
                ruprimei_sqrd += ratings[uprime, i] ** 2
            sim[u, uprime] /= rui_sqrd * ruprimei_sqrd
    return sim

# 向量化版本计算corr, 也即cosine距离
def fast_similarity(ratings, kind, epsilon=1e-9):
    # epsilon: small number for handling dived-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

# 带归一化的相似度
def fast_similarity_2(ratings, kind, epsilon=1e-9):
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])

    return (sim / norms / norms.T)

In [304]:
train = ratings
user_similarity = fast_similarity(train, kind='user')
item_similarity = fast_similarity(train, kind='item')
print('user similarity: \n', user_similarity.shape)
print()
print('item similarity: \n', item_similarity.shape)

user similarity: 
 (943, 943)

item similarity: 
 (1682, 1682)


In [305]:
# Test CF time
# %timeit slow_similarity(train, kind='user')
# %timeit fast_similarity(train, kind='user')

In [306]:
k = 100

# 挑出最相近的 打过分的！！ top k item
def predict(user, item, k=100):
    ratings_nonzero = ratings[user].nonzero()[0]
    prediction = ratings[user,ratings_nonzero].dot(item_similarity[item, ratings_nonzero])\
                / sum(item_similarity[item, ratings_nonzero])
    return prediction 


In [312]:
def get_rmse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return np.sqrt(mean_squared_error(pred, actual))

In [314]:
test_df = pd.read_csv('dataset/ml-100k/u1.test', sep='\t', names=names)
test_df.head()
predictions = []
targets = []
for row in test_df.itertuples():
    user, item, actual = row[1]-1, row[2]-1, row[3]-1
    predictions.append(predict(user, item))
    targets.append(actual)

print(get_rmse(np.array(predictions), np.array(targets)))

1.30110140047
