In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### svd的优缺点
    简化数据，去除噪声和冗余信息，提高算法的结果，数据的转化难以理解

基本理论知识：
  矩阵分解是将原始的矩阵表示为新的易于处理的形式，有两个或者多个矩阵相乘  
$Data = U_{matrix}\sum_{matrix} V_{matrix}^T$   
$\sum$只是一个对角矩阵，对角元素称为奇异值，为$Data*Data^T$的特征值的平方根

In [2]:
u,sigma,vt = np.linalg.svd([[1,1],[7,7]])

In [3]:
np.set_printoptions(precision=4)

In [4]:
u

array([[-0.1414, -0.9899],
       [-0.9899,  0.1414]])

In [5]:
sigma

array([  1.0000e+01,   2.8280e-16])

In [7]:
data = np.array([[1,1,1,0,0],
                [2,2,2,0,0],
                [1,1,1,0,0],
                [5,5,5,0,0],
                [1,1,0,2,2],
                [0,0,0,3,3],
                [0,0,0,1,1]])

In [8]:
u,sigma,vt = np.linalg.svd(data)

In [9]:
sigma#最后两个值太小，可能结果不相同，可以当0看待

array([  9.7214e+00,   5.2940e+00,   6.8423e-01,   4.1150e-16,   1.3603e-16])

In [10]:
#去掉sigma后面两个特别小的值，尽量保证保留的simga平方和值占总值的90%以上
sigma3 = np.mat([[sigma[0],0,0],
                [0,sigma[1],0],
                [0,0,sigma[2]]])

In [11]:
new_data = u[:,:3].dot(sigma3).dot(vt[:3,:])#完成数据SVD转化，去除噪声和冗余信息

In [12]:
new_data

matrix([[  1.0000e+00,   1.0000e+00,   1.0000e+00,   7.7599e-16,
           7.7159e-16],
        [  2.0000e+00,   2.0000e+00,   2.0000e+00,   3.0051e-16,
           2.7783e-16],
        [  1.0000e+00,   1.0000e+00,   1.0000e+00,   2.1898e-16,
           2.0763e-16],
        [  5.0000e+00,   5.0000e+00,   5.0000e+00,   3.0068e-17,
          -1.2870e-17],
        [  1.0000e+00,   1.0000e+00,  -5.4840e-16,   2.0000e+00,
           2.0000e+00],
        [  3.2132e-16,   4.4356e-16,  -3.4897e-16,   3.0000e+00,
           3.0000e+00],
        [  9.7145e-17,   1.4572e-16,  -1.5266e-16,   1.0000e+00,
           1.0000e+00]])

In [13]:
#协同过滤的推荐算法：通过用户和其他用户的数据的对比来实现推荐。方法：通过用户对物品的评分来计算物品之间相识度，不关注物品的属性。
#相似度=1/(1+距离)或者皮尔逊相关系数，取值【-1,1】，通过0.5+0.5*personr归一化，对评分的量级不敏感或者余弦显示性，取值也是【-1,1】
def ecludian_similarity(a,b):
    return 1 / (1 + np.linalg.norm((a-b),ord=2))#norm计算l2范数，参数ord定义范数类型
def pearson_similarity(a,b):
    if len(a) <3:
        return 1.0
    return 0.5 + 0.5*np.corrcoef(a,b)[0][1]
def cosin_similarity(a,b):#sklearn中有实现 from sklearn.metrics.pairwis import cosine_similarity
    return 0.5 + 0.5*float(np.dot(a,b)) / (np.linalg.norm(a)*np.linalg.norm(b))

In [14]:
data

array([[1, 1, 1, 0, 0],
       [2, 2, 2, 0, 0],
       [1, 1, 1, 0, 0],
       [5, 5, 5, 0, 0],
       [1, 1, 0, 2, 2],
       [0, 0, 0, 3, 3],
       [0, 0, 0, 1, 1]])

In [15]:
ecludian_similarity(data[:,0],data[:,4])

0.13367660240019172

In [16]:
ecludian_similarity(data[:,0],data[:,0])

1.0

In [17]:
cosin_similarity(data[:,0],data[:,4]),cosin_similarity(data[:,0],data[:,0])

(0.54724555912615336, 0.99999999999999989)

In [18]:
pearson_similarity(data[:,0],data[:,4]),pearson_similarity(data[:,0],data[:,0])

(0.23768619407595815, 1.0)

In [19]:
data

array([[1, 1, 1, 0, 0],
       [2, 2, 2, 0, 0],
       [1, 1, 1, 0, 0],
       [5, 5, 5, 0, 0],
       [1, 1, 0, 2, 2],
       [0, 0, 0, 3, 3],
       [0, 0, 0, 1, 1]])

In [20]:
#推荐系统的评价，模型已经的评分进行预测，计算预测值和真实评分的差异。
def goods_estimator(data_mat,similarity_measure,user,item):
    '''
    基于商品的评估系统
    '''
    n = data_mat.shape[1]
    similarity_sum = 0
    rank_similarity_total = 0
    for i in range(n):
        user_rank = data_mat[user,i]
        #print(user_rank)
        if user_rank == 0:
            continue#没有评分跳过
        two_rank = np.nonzero(np.logical_and(data_mat[:,item]>0,data_mat[:,i]>0))[0]#一个商品我和某些人做评价了，在看看另外一个商品
        #我和这些人是否也做了评价
        if len(two_rank) == 0:#如果都没做评价，那么这个商品和另外一个商品的的相似度为0
            similarity = 0
        else:#计算相似度
            similarity = similarity_measure(data_mat[two_rank,i],data_mat[two_rank,item])
        print('the {} adn {} similarity is:{}'.format(i,item,similarity))
        similarity_sum += similarity
        rank_similarity_total += similarity * user_rank
    if similarity_sum == 0:
        return 0
    else:
        return rank_similarity_total / similarity_sum

def recommend(data_mat,user,score_estimator,recommend_num=3,similarity_measure=cosin_similarity):  
    estimate_scores = []
    not_rank_itme = np.nonzero([data_mat[user,:] == 0])[1]#np.nonzero函数返回的是元祖对象，元祖的个数与对象维度相关
    if len(not_rank_itme) == 0:
        print('all goods you have taste!!')
    else:
        for item in not_rank_itme:
            estimate_score = score_estimator(data_mat,similarity_measure,user,item)
            estimate_scores.append((estimate_score,item))
        return sorted(estimate_scores,reverse=True)[:recommend_num]

     

In [21]:
data_mat = np.array([[4,4,0,2,2],
                    [4,0,0,3,3],
                    [4,0,0,1,1],
                    [1,1,1,2,0],
                    [2,2,2,0,0],
                    [1,1,1,0,0],
                    [5,5,5,0,0]])
recommend(data_mat,2,score_estimator=goods_estimator)

the 0 adn 1 similarity is:1.0
the 3 adn 1 similarity is:0.928746462856272
the 4 adn 1 similarity is:1.0
the 0 adn 2 similarity is:1.0
the 3 adn 2 similarity is:1.0
the 4 adn 2 similarity is:0


[(2.5, 2), (2.0243290220056256, 1)]

In [22]:
def loadExData2():
    return[[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
           [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
           [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
           [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
           [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
           [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
           [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
           [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
           [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
           [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
           [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]]
    

In [23]:
def svd_estimator(data_mat,similarity_measure,user,item):
    '''
    构建svd推荐评估函数
    '''
    n = np.array(data_mat).shape[1]
    similarity_sum = 0;rank_similarity_sum=0
    u,sigma,vt = np.linalg.svd(data_mat)
    sigma4 = np.eye(4)*sigma[:4]
    data_svd = u[:,:4].dot(sigma4).dot(vt[:4,:])
    for i in range(n):
        user_rank = data_mat[user,i]
        #print(user_rank)
        if user_rank == 0 or i==item: continue
        similarity = similarity_measure(data_svd[item],data_svd[i])
        print('the {} adn {} similarity is:{}'.format(i,item,similarity))
        similarity_sum += similarity
        rank_similarity_sum += similarity * user_rank
    if similarity_sum == 0:
        return 0
    else:
        return rank_similarity_sum / similarity_sum



In [24]:
data_mat = np.array(data_mat)

In [40]:
recommend(data_mat,4,score_estimator=svd_estimator,recommend_num=3,similarity_measure=cosin_similarity)

the 0 adn 4 similarity is:0.8653402942171653
the 1 adn 4 similarity is:0.6981114454772286
the 2 adn 4 similarity is:0.7760939509161997
the 3 adn 4 similarity is:0.8365022579871264


[(1.2633783467773074, 4)]

In [41]:
recommend(data_mat,4,score_estimator=goods_estimator,recommend_num=3,similarity_measure=cosin_similarity)

the 0 adn 3 similarity is:0.9377327693059581
the 1 adn 3 similarity is:0.928746462856272
the 2 adn 3 similarity is:1.0
the 0 adn 4 similarity is:0.9629100498862757
the 1 adn 4 similarity is:1.0
the 2 adn 4 similarity is:0


[(2.0, 4), (2.0, 3)]

In [45]:
np.zeros([3,3])

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])