In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

  return f(*args, **kwds)
  return f(*args, **kwds)


# SVD(Singular value decomposition)
SVD与PCA一样，也可以用来对数据进行降维。PCA旨在寻找方差最大的方向以保留最多的信息，因此通过对角化使变换后数据的协方差矩阵成为对角矩阵。不同于PCA的特征值分解，SVD使用奇异值分解，好处是不要求输入为方阵；可以把SVD看作特征值分解在非方阵上的推广，其中奇异值扮演类似特征值的角色（奇异值是 $X^TX$ 特征值的平方根），降维时只保留最大的几个奇异值对应的方向做线性变换。我们定义SVD为
$$
X=U\Sigma V^T
$$
其中$U,V$是正交矩阵而$\Sigma$是对角矩阵，$\Sigma$对角线上的值就是$X$的奇异值
### 优缺点

优点：降低数据的复杂度、移除噪声、可能会提升模型算法精准度  
缺点：经过转换过的数据可能难以理解


接下来我们用代码介绍SVD

In [11]:
# Factor a 2x2 matrix whose second row is 7x the first (so it has rank 1)
# into U, the singular values, and V^T.
U, sigma, vT = np.linalg.svd(
    [[1, 1],
     [7, 7]]
)
# Left singular vectors.
U

array([[-0.14142136, -0.98994949],
       [-0.98994949,  0.14142136]])

In [12]:
# Singular values returned by np.linalg.svd for the 2x2 example (a 1-D array).
sigma

array([10.,  0.])

In [13]:
# Third factor returned by np.linalg.svd for the 2x2 example (V transposed).
vT

array([[-0.70710678, -0.70710678],
       [-0.70710678,  0.70710678]])

In [15]:
# Rebuild the original matrix as U @ diag(sigma) @ V^T. The diagonal matrix is
# derived from the computed singular values instead of hard-coding 10 and 0,
# and `@` makes the matrix product explicit without relying on np.matrix.
U @ np.diag(sigma) @ vT

matrix([[1., 1.],
        [7., 7.]])

In [31]:
def loadExData():
    """Return the small 7x5 example matrix as a fresh list of row lists.

    The first three rows are non-zero only in the last two columns and the
    remaining four rows only in the first three columns.
    """
    rows = [
        [0, 0, 0, 2, 2],
        [0, 0, 0, 3, 3],
        [0, 0, 0, 1, 1],
        [1, 1, 1, 0, 0],
        [2, 2, 2, 0, 0],
        [5, 5, 5, 0, 0],
        [1, 1, 1, 0, 0],
    ]
    return rows
# Load the 7x5 example as a nested list and factor it; np.linalg.svd is given
# the list directly.
data=loadExData()
U,sigma,vT=np.linalg.svd(data)
# Display the singular values of the example matrix.
sigma

array([9.64365076e+00, 5.29150262e+00, 8.36478329e-16, 6.91811207e-17,
       1.11917251e-33])

In [38]:
# Diagonal matrix holding the three leading singular values.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0. The result is
# still an np.matrix, so code that multiplies it with `*` keeps
# matrix-product semantics.
sig3 = np.asmatrix(np.diag(sigma[:3]))
sig3

matrix([[9.64365076e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 5.29150262e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 8.36478329e-16]])

In [39]:
# Rank-3 reconstruction: first three columns of U, the 3x3 singular-value
# matrix, and the first three rows of V^T, rounded to the nearest integer.
np.rint(U[:, :3] @ sig3 @ vT[:3, :])

matrix([[ 0.,  0.,  0.,  2.,  2.],
        [-0.,  0.,  0.,  3.,  3.],
        [-0.,  0.,  0.,  1.,  1.],
        [ 1.,  1.,  1.,  0.,  0.],
        [ 2.,  2.,  2.,  0.,  0.],
        [ 5.,  5.,  5., -0., -0.],
        [ 1.,  1.,  1.,  0.,  0.]])

介绍完了SVD，我们要来介绍一种推荐算法-协同过滤算法

In [44]:
def ecludSim(inA, inB):
    """Euclidean-distance similarity in (0, 1].

    Computes ``1 / (1 + ||inA - inB||)``: identical inputs score 1.0 and the
    score falls towards 0 as the distance grows.
    """
    distance = np.linalg.norm(inA - inB)
    return 1.0 / (1.0 + distance)

def pearsSim(inA, inB):
    """Pearson-correlation similarity rescaled from [-1, 1] to [0, 1].

    Inputs with fewer than three entries are treated as perfectly similar
    and return 1.0 without computing a correlation.
    """
    if len(inA) < 3:
        return 1.0
    # rowvar=0: each column is a variable, each row an observation.
    correlation = np.corrcoef(inA, inB, rowvar=0)[0][1]
    return 0.5 + 0.5 * correlation

def cosSim(inA, inB):
    """Cosine similarity rescaled from [-1, 1] to [0, 1].

    Parameters
    ----------
    inA, inB : array-like
        Vectors of equal length. They may be 1-D arrays, column or row
        ndarrays, or np.matrix vectors; both are flattened before use.

    Returns
    -------
    float
        ``0.5 + 0.5 * cos(angle)``. There is no guard for zero-norm inputs,
        for which the cosine is undefined.
    """
    # Flatten to plain 1-D float arrays so the dot product no longer depends on
    # np.matrix `*` semantics or on converting a 1x1 matrix to a scalar (a
    # conversion NumPy has deprecated for arrays with ndim > 0).
    vecA = np.asarray(inA, dtype=float).ravel()
    vecB = np.asarray(inB, dtype=float).ravel()
    num = float(np.dot(vecA, vecB))
    denom = np.linalg.norm(vecA) * np.linalg.norm(vecB)
    return 0.5 + 0.5 * (num / denom)
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
data = np.asmatrix(loadExData())
# Compare column 0 with column 4, then column 0 with itself.
ecludSim(data[:, 0], data[:, 4]), ecludSim(data[:, 0], data[:, 0])

(0.12973190755680383, 1.0)

In [45]:
# Pearson-based similarity: column 0 vs column 4, then column 0 vs itself.
pearsSim(data[:,0],data[:,4]),pearsSim(data[:,0],data[:,0])

(0.20596538173840329, 1.0)

In [46]:
# Cosine-based similarity: column 0 vs column 4, then column 0 vs itself.
cosSim(data[:,0],data[:,4]),cosSim(data[:,0],data[:,0])

(0.5, 1.0)

我们使用欧氏距离、皮尔逊相关系数、余弦相似度来度量item与item之间的相似度。接下来，我们以餐厅菜色推荐系统为例，推荐系统的流程大致如下：给定一位user，回传他可能会喜欢的菜色。在代码层面上分为三步：
1. 先找user尚未评分的菜色
2. 预测这些未评分的菜色user会给几分
3. 对分数排序进行推荐

In [52]:
def standEst(dataMat, user, simMeas, item):
    """Estimate ``user``'s rating of ``item`` from item-to-item similarity.

    For every other item ``j`` the user has rated, the similarity between
    column ``item`` and column ``j`` is computed over the rows where both
    columns are positive. The estimate is the similarity-weighted average
    of the user's ratings.

    Parameters
    ----------
    dataMat : 2-D array-like
        Rating matrix indexed as ``[user, item]``; 0 means "not rated".
        np.matrix, ndarray and nested lists are accepted.
    user : int
        Row index of the user.
    simMeas : callable
        ``simMeas(colA, colB) -> float``, called with two column vectors
        (np.matrix of shape (k, 1)).
    item : int
        Column index of the item to estimate.

    Returns
    -------
    The weighted average rating, or 0 when the total similarity is 0.
    """
    # Coerce once: the `.A` accessor and the column-vector slices handed to
    # simMeas rely on np.matrix indexing, which ndarrays and lists lack.
    dataMat = np.asmatrix(dataMat)
    n = np.shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    for j in range(n):
        userRating = dataMat[user, j]
        # Skip unrated items, and the target item itself (as svdEst does) so an
        # existing rating for `item` cannot dominate its own estimate.
        if userRating == 0 or j == item:
            continue
        # Rows in which both `item` and `j` carry a rating.
        bothRated = np.logical_and(dataMat[:, item].A > 0, dataMat[:, j].A > 0)
        overLap = np.nonzero(bothRated)[0]
        if len(overLap) == 0:
            similarity = 0
        else:
            similarity = simMeas(dataMat[overLap, item], dataMat[overLap, j])
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal

def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """Return up to ``N`` (item, estimated score) pairs for ``user``.

    Items whose entry in the user's row is 0 are treated as unrated; each is
    scored with ``estMethod(dataMat, user, simMeas, item)`` and the pairs are
    returned in descending score order. If the user's row contains no zeros
    the string 'you rated everything' is returned instead of a list.
    """
    # Column indices of the zero entries in this user's row.
    unratedItems = np.nonzero(dataMat[user, :].A == 0)[1]
    print(unratedItems)
    if len(unratedItems) == 0:
        return 'you rated everything'
    itemScores = [
        (item, estMethod(dataMat, user, simMeas, item))
        for item in unratedItems
    ]
    itemScores.sort(key=lambda pair: pair[1], reverse=True)
    return itemScores[:N]

# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
data = np.asmatrix(loadExData())
# Overwrite a few entries so the users' ratings overlap across items.
data[0, 1] = data[0, 0] = data[1, 0] = data[2, 0] = 4
data[3, 3] = 2
print(data)
# Recommend for the user in row 2.
recommend(data, 2)

[[4 4 0 2 2]
 [4 0 0 3 3]
 [4 0 0 1 1]
 [1 1 1 2 0]
 [2 2 2 0 0]
 [5 5 5 0 0]
 [1 1 1 0 0]]
[1 2]
the 1 and 0 similarity is: 1.000000
the 1 and 3 similarity is: 0.928746
the 1 and 4 similarity is: 1.000000
the 2 and 0 similarity is: 1.000000
the 2 and 3 similarity is: 1.000000
the 2 and 4 similarity is: 0.000000


[(2, 2.5), (1, 2.0243290220056256)]

接下来我们使用SVD来优化推荐系统，我们用更高维度的数据来举例

In [54]:
def loadExData2():
    """Return the larger 11x11 example matrix as a fresh list of row lists.

    Entries are integers from 0 to 5.
    """
    rows = [
        [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
        [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
        [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
        [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
        [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
        [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
        [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
        [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
        [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
        [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0],
    ]
    return rows

# Factor the 11x11 example. np.asmatrix replaces np.mat, which was removed in
# NumPy 2.0.
U, sigma, vT = np.linalg.svd(np.asmatrix(loadExData2()))
# Display the singular values.
sigma

array([15.77075346, 11.40670395, 11.03044558,  4.84639758,  3.09292055,
        2.58097379,  1.00413543,  0.72817072,  0.43800353,  0.22082113,
        0.07367823])

接下来，我们来找什么特征的代表性最高

In [58]:
# Squared singular values.
sig2 = sigma * sigma
# Sum over all of them, then over the leading 2, 3, 4 and 5, to see how many
# are needed to account for most of the total.
tuple(sum(sig2[:k]) for k in (len(sig2), 2, 3, 4, 5))

(541.9999999999994,
 378.8295595113579,
 500.5002891275791,
 523.9878586377387,
 533.5540161939919)

可以看到前四个奇异值的平方和（约 524）已占总能量（约 542）的 96% 以上，属于比较重要的特征，所以我们可以将原本11维的数据降维成4维数据

In [68]:
def svdEst(dataMat, user, simMeas, item, numDims=4):
    """Estimate ``user``'s rating of ``item`` using SVD-reduced item vectors.

    Every item (column of ``dataMat``) is projected into a ``numDims``-
    dimensional space via ``dataMat.T * U[:, :k] * inv(diag(Sigma[:k]))``.
    The estimate is the similarity-weighted average of the user's ratings of
    the other items, with similarities measured in the reduced space.

    Parameters
    ----------
    dataMat : 2-D array-like
        Rating matrix indexed as ``[user, item]``; 0 means "not rated".
        np.matrix, ndarray and nested lists are accepted.
    user : int
        Row index of the user.
    simMeas : callable
        ``simMeas(vecA, vecB) -> float``, called with two column vectors
        (np.matrix of shape (k, 1)).
    item : int
        Column index of the item to estimate.
    numDims : int, optional
        Number of leading singular values to keep (default 4, the value that
        was previously hard-coded). Clamped to the number of singular values
        available.

    Returns
    -------
    The weighted average rating, or 0 when the total similarity is 0.

    Notes
    -----
    The SVD is recomputed on every call. The diagonal matrix of kept singular
    values is inverted, so those values are expected to be non-zero.
    """
    # Coerce once so `*`, `.T` and `.I` below have np.matrix semantics even for
    # ndarray or list input.
    dataMat = np.asmatrix(dataMat)
    n = np.shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    U, Sigma, VT = np.linalg.svd(dataMat)
    # A matrix with fewer than numDims rows or columns has fewer singular
    # values; clamp instead of failing while building the diagonal matrix.
    k = min(numDims, len(Sigma))
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    SigK = np.asmatrix(np.diag(Sigma[:k]))
    # One row per item, k columns: the items in the reduced space.
    xformedItems = dataMat.T * np.asmatrix(U)[:, :k] * SigK.I
    for j in range(n):
        userRating = dataMat[user, j]
        # Only items the user has rated, other than the target, contribute.
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xformedItems[item, :].T, xformedItems[j, :].T)
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
data = np.asmatrix(loadExData2())
# Recommend for the user in row 1, scoring items with the SVD-based estimator.
recommend(data, 1, estMethod=svdEst)

[0 1 2 4 6 7 8 9]
the 0 and 3 similarity is: 0.490950
the 0 and 5 similarity is: 0.484274
the 0 and 10 similarity is: 0.512755
the 1 and 3 similarity is: 0.491294
the 1 and 5 similarity is: 0.481516
the 1 and 10 similarity is: 0.509709
the 2 and 3 similarity is: 0.491573
the 2 and 5 similarity is: 0.482346
the 2 and 10 similarity is: 0.510584
the 4 and 3 similarity is: 0.450495
the 4 and 5 similarity is: 0.506795
the 4 and 10 similarity is: 0.512896
the 6 and 3 similarity is: 0.743699
the 6 and 5 similarity is: 0.468366
the 6 and 10 similarity is: 0.439465
the 7 and 3 similarity is: 0.482175
the 7 and 5 similarity is: 0.494716
the 7 and 10 similarity is: 0.524970
the 8 and 3 similarity is: 0.491307
the 8 and 5 similarity is: 0.491228
the 8 and 10 similarity is: 0.520290
the 9 and 3 similarity is: 0.522379
the 9 and 5 similarity is: 0.496130
the 9 and 10 similarity is: 0.493617


[(4, 3.3447149384692283), (7, 3.329402072452697), (9, 3.328100876390069)]