##  ml-100数据推荐系统

[在Python中实现你自己的推荐系统](http://python.jobbole.com/85516/)

In [22]:
import numpy as np
import pandas as pd
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('../../ml-100k/u.data', sep='\t', names=header)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [7]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print 'users: ',n_users,"items: ",n_items

 users:  943 items:  1682


In [10]:
##  交叉验证
from sklearn import cross_validation as cv
train_data,test_data = cv.train_test_split(df,test_size=0.25)

In [11]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  
 
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [12]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [13]:
def predict(ratings, similarity, type='user'):
        if type == 'user':
            mean_user_rating = ratings.mean(axis=1)
            #You use np.newaxis so that mean_user_rating has same format as ratings
            ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
            pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
        elif type == 'item':
            pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
        return pred

In [14]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')


In [18]:
item_prediction[0]

array([ 0.34629886,  0.37165235,  0.38690261, ...,  0.42137468,
        0.41393078,  0.40957052])

In [19]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds
 
    #get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

In [20]:
X_pred

array([[  4.51830002e+00,   1.21027891e+00,   8.45863911e-01, ...,
         -1.46622664e-02,   1.84068531e-02,   4.65702838e-02],
       [  1.42278497e+00,   2.32239511e-03,  -1.11602134e-01, ...,
          1.11767818e-02,  -8.01510197e-03,  -3.05926418e-02],
       [  1.92234125e-01,   2.79350568e-02,   3.85556588e-02, ...,
          1.09277034e-02,   8.66714925e-03,   1.23742736e-03],
       ..., 
       [  8.51427463e-01,  -7.49046934e-03,   2.15738749e-01, ...,
         -3.43667366e-03,   8.89345035e-03,   6.21499406e-03],
       [  5.87466525e-01,  -8.71041464e-02,  -1.28388485e-01, ...,
          1.66875917e-02,  -9.85291094e-03,  -2.36870092e-02],
       [  1.21778647e+00,   1.48694713e+00,   7.49324306e-01, ...,
         -1.29226001e-02,  -4.28752081e-03,   2.30148008e-02]])