## 利用Python实现SVD

In [1]:
from numpy import *

In [2]:
U,Sigma,VT = linalg.svd([[1,1],[7,7]])

In [8]:
U

array([[-0.14142136, -0.98994949],
       [-0.98994949,  0.14142136]])

In [10]:
Sigma
# Sigma is conceptually a diagonal matrix; because every off-diagonal entry is zero, numpy returns only the diagonal (the singular values) as a 1-D array

array([1.00000000e+01, 2.82797782e-16])

In [11]:
VT

array([[-0.70710678, -0.70710678],
       [ 0.70710678, -0.70710678]])

## 对矩阵进行SVD分解

In [12]:
def loadExdata():
    """Return the small example table as a fresh list of 7 rows x 5 integer columns.

    Later cells wrap the result in a NumPy matrix and index it as
    ``[user, item]``; a 0 entry is treated there as "not rated".
    """
    rows = (
        (1, 1, 1, 0, 0),
        (2, 2, 2, 0, 0),
        (1, 1, 1, 0, 0),
        (5, 5, 5, 0, 0),
        (1, 1, 0, 2, 2),
        (0, 0, 0, 3, 3),
        (0, 0, 0, 1, 1),
    )
    # Build new inner lists on every call so callers may mutate the result freely.
    return [list(row) for row in rows]

In [13]:
Data = loadExdata()

In [14]:
U,Sigma,VT = linalg.svd(Data)

In [15]:
Sigma

array([9.72140007e+00, 5.29397912e+00, 6.84226362e-01, 4.11502614e-16,
       1.36030206e-16])

In [16]:
# 重构原始结构的近似矩阵

In [18]:
# Diagonal matrix holding the three largest singular values.
# asmatrix/diag replace the `mat` alias, which NumPy 2.0 removed.
Sig3 = asmatrix(diag(Sigma[:3]))

In [19]:
Sig3

matrix([[9.72140007, 0.        , 0.        ],
        [0.        , 5.29397912, 0.        ],
        [0.        , 0.        , 0.68422636]])

In [20]:
U[:,:3] * Sig3 * VT[:3,:]

matrix([[ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
          7.75989921e-16,  7.71587483e-16],
        [ 2.00000000e+00,  2.00000000e+00,  2.00000000e+00,
          3.00514919e-16,  2.77832253e-16],
        [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
          2.18975112e-16,  2.07633779e-16],
        [ 5.00000000e+00,  5.00000000e+00,  5.00000000e+00,
          3.00675663e-17, -1.28697294e-17],
        [ 1.00000000e+00,  1.00000000e+00, -5.48397422e-16,
          2.00000000e+00,  2.00000000e+00],
        [ 3.21319929e-16,  4.43562065e-16, -3.48967188e-16,
          3.00000000e+00,  3.00000000e+00],
        [ 9.71445147e-17,  1.45716772e-16, -1.52655666e-16,
          1.00000000e+00,  1.00000000e+00]])

## 相似度计算

In [21]:
# 相似度计算
from numpy import *
from numpy import linalg as la
# 欧式空间相似度
def eulidSim(inA,inB):
    """Similarity derived from Euclidean distance, in the range (0, 1].

    Computes ``1 / (1 + ||inA - inB||)`` using ``la.norm`` with its default
    order, so identical inputs give 1.0 and the value shrinks towards 0 as
    the inputs move apart.
    """
    gap = la.norm(inA - inB)
    return 1.0 / (gap + 1.0)
# 皮尔逊相关系数
def pearsSim(inA,inB):
    """Pearson correlation between two column vectors, rescaled from [-1, 1] to [0, 1].

    Inputs with fewer than three entries (``len(inA) < 3``) are reported as
    perfectly similar (1.0) without computing a correlation.
    """
    if len(inA) >= 3:
        # rowvar = 0: each column is a variable, each row an observation.
        corr = corrcoef(inA, inB, rowvar = 0)
        return 0.5 + 0.5 * corr[0][1]
    return 1.0
# 余弦相似度
def cosSim(inA,inB):
    """Cosine similarity of two vectors, rescaled from [-1, 1] to [0, 1].

    Both inputs are flattened to 1-D float arrays first, so column matrices,
    row matrices and plain 1-D arrays are all accepted as long as they hold
    the same number of elements.

    A zero-norm input is not guarded against: the division by the product
    of the norms is performed as-is.
    """
    # Work on flat ndarrays: this avoids calling float() on a 1x1 matrix
    # (array-to-scalar conversion of ndim > 0 arrays is deprecated in NumPy).
    vecA = asarray(inA, dtype=float).ravel()
    vecB = asarray(inB, dtype=float).ravel()
    num = float(dot(vecA, vecB))
    denom = la.norm(vecA) * la.norm(vecB)
    return 0.5 + 0.5 * (num / denom)

In [23]:
# Wrap the example table in a NumPy matrix (asmatrix replaces the `mat` alias removed in NumPy 2.0).
myMat = asmatrix(loadExdata())

In [24]:
eulidSim(myMat[:,0],myMat[:,4])

0.13367660240019172

In [25]:
eulidSim(myMat[:,0],myMat[:,0])

1.0

In [26]:
cosSim(myMat[:,0],myMat[:,4])

0.5472455591261534

In [28]:
cosSim(myMat[:,0],myMat[:,0])

0.9999999999999999

In [30]:
pearsSim(myMat[:,0],myMat[:,4])

0.23768619407595815

In [31]:
pearsSim(myMat[:,0],myMat[:,0])

1.0

In [32]:
myMat

matrix([[1, 1, 1, 0, 0],
        [2, 2, 2, 0, 0],
        [1, 1, 1, 0, 0],
        [5, 5, 5, 0, 0],
        [1, 1, 0, 2, 2],
        [0, 0, 0, 3, 3],
        [0, 0, 0, 1, 1]])

In [37]:
eulidSim(myMat[:,0],myMat[:,4])

0.13367660240019172

In [38]:
cosSim(myMat[:,0],myMat[:,4])

0.5472455591261534

In [39]:
pearsSim(myMat[:,0],myMat[:,4])

0.23768619407595815

## 基于相似度的推荐引擎

In [58]:
# 基于物品相似度的推荐引擎
def standEst(dataMat,user,simMeas,item):
    """Estimate the rating of `item` by `user` as a similarity-weighted average.

    Parameters:
        dataMat: NumPy matrix indexed as [user, item]; column slices must
            expose ``.A``. An entry of 0 is treated as "not rated".
        user: row index of the user.
        simMeas: callable taking two column slices and returning a similarity.
        item: column index of the item to estimate.

    For every item the user has rated, the similarity to `item` is measured
    only over the rows where both items have a rating > 0 (0 if there are no
    such rows). One progress line is printed per rated item.

    Returns the weighted average of the user's ratings, or 0 when the
    similarities sum to 0.
    """
    itemCount = shape(dataMat)[1]
    weightSum = 0.0
    weightedRatings = 0.0
    for other in range(itemCount):
        rating = dataMat[user,other]
        if rating == 0:
            # Items the user has not rated carry no information.
            continue
        # Rows in which both `item` and `other` have a rating.
        bothRated = logical_and(dataMat[:,item].A > 0,dataMat[:,other].A > 0)
        coRated = nonzero(bothRated)[0]
        if len(coRated) != 0:
            similarity = simMeas(dataMat[coRated,item],dataMat[coRated,other])
        else:
            similarity = 0
        print("the %d and %d similarity is: %f" %(item,other,similarity))
        weightSum += similarity
        weightedRatings += similarity * rating
    if weightSum == 0:
        return 0
    return weightedRatings/weightSum
def recommand(dataMat,user,N = 3,simMeas = cosSim,estMethod = standEst):
    """Return the top-N unrated items for `user`, ranked by estimated rating.

    Parameters:
        dataMat: NumPy matrix indexed as [user, item]; 0 means "not rated".
        user: row index of the user.
        N: maximum number of recommendations to return.
        simMeas: similarity function handed through to `estMethod`.
        estMethod: callable ``(dataMat, user, simMeas, item) -> score``.

    Returns a list of ``(item, score)`` tuples sorted by score in descending
    order, or the string "you rated everything" when the user's row has no
    zero entries.
    """
    # Column indices of the zero entries in the user's row.
    unratedItems = nonzero(dataMat[user,:].A == 0)[1]
    if len(unratedItems) == 0:
        return "you rated everything"
    scored = [(item, estMethod(dataMat,user,simMeas,item)) for item in unratedItems]
    scored.sort(key = lambda pair: pair[1], reverse = True)
    return scored[:N]

In [59]:
# Rebuild the example matrix from the original data (asmatrix replaces the `mat` alias removed in NumPy 2.0).
myMat = asmatrix(loadExdata())

In [60]:
myMat[0,1] = myMat[0,0] = myMat[1,0] = myMat[2,0] = 4
myMat[3,3] = 2

In [61]:
myMat

matrix([[4, 4, 1, 0, 0],
        [4, 2, 2, 0, 0],
        [4, 1, 1, 0, 0],
        [5, 5, 5, 2, 0],
        [1, 1, 0, 2, 2],
        [0, 0, 0, 3, 3],
        [0, 0, 0, 1, 1]])

In [62]:
recommand(myMat,2)

the 3 and 0 similarity is: 0.916025
the 3 and 1 similarity is: 0.916025
the 3 and 2 similarity is: 1.000000
the 4 and 0 similarity is: 1.000000
the 4 and 1 similarity is: 1.000000
the 4 and 2 similarity is: 0.000000


[(4, 2.5), (3, 1.9703483892927431)]

In [63]:
# 使用不同的计算方式进行计算
recommand(myMat,2,simMeas=eulidSim)

the 3 and 0 similarity is: 0.240253
the 3 and 1 similarity is: 0.240253
the 3 and 2 similarity is: 0.250000
the 4 and 0 similarity is: 0.500000
the 4 and 1 similarity is: 0.500000
the 4 and 2 similarity is: 0.000000


[(4, 2.5), (3, 1.98665729687295)]

In [64]:
recommand(myMat,2,simMeas=pearsSim)

the 3 and 0 similarity is: 1.000000
the 3 and 1 similarity is: 1.000000
the 3 and 2 similarity is: 1.000000
the 4 and 0 similarity is: 1.000000
the 4 and 1 similarity is: 1.000000
the 4 and 2 similarity is: 0.000000


[(4, 2.5), (3, 2.0)]

## 利用SVD提高推荐的效果

In [65]:
def loadExData2():
    """Return the larger example table as a fresh list of 11 rows x 11 integer columns.

    The notebook wraps the result in a NumPy matrix and decomposes it with
    SVD; most entries are 0.
    """
    rows = (
        (0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5),
        (0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3),
        (0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0),
        (3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0),
        (5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0),
        (0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0),
        (4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1),
        (0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4),
        (0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2),
        (0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0),
        (1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0),
    )
    # Build new inner lists on every call so callers may mutate the result freely.
    return [list(row) for row in rows]

In [66]:
# SVD of the larger example table (asmatrix replaces the `mat` alias removed in NumPy 2.0).
U,Sigma,VT = la.svd(asmatrix(loadExData2()))

In [70]:
Sigma

array([15.77075346, 11.40670395, 11.03044558,  4.84639758,  3.09292055,
        2.58097379,  1.00413543,  0.72817072,  0.43800353,  0.22082113,
        0.07367823])

In [71]:
# 看看有多少奇异值能达到总能量的百分之90
Sig2 = Sigma ** 2

In [72]:
Sig2

array([2.48716665e+02, 1.30112895e+02, 1.21670730e+02, 2.34875695e+01,
       9.56615756e+00, 6.66142570e+00, 1.00828796e+00, 5.30232598e-01,
       1.91847092e-01, 4.87619735e-02, 5.42848136e-03])

In [73]:
sum(Sig2)

541.9999999999995

In [76]:
# 计算总能量的百分之90
sum(Sig2)*0.9

487.7999999999996

In [77]:
# 计算前两个元素所包含的能量
sum(Sig2[:2])
# 该值低于百分之90

378.8295595113579

## 能量集中

In [80]:
# 计算前三个元素所包含的能量
sum(Sig2[:3])
# 该值高于百分之90

500.5002891275793

## 基于SVD的估计

In [83]:
# 基于SVD的评分估计
from numpy import linalg as la
def svdEst(dataMat,user,simMeas,item,numSV = 4):
    """Estimate the rating of `item` by `user` using SVD-reduced item vectors.

    Parameters:
        dataMat: 2-D rating data indexed as [user, item]; 0 means "not rated".
            Converted with ``asmatrix`` so plain arrays are accepted too.
        user: row index of the user.
        simMeas: callable taking two column vectors and returning a similarity.
        item: column index of the item to estimate.
        numSV: number of leading singular values to keep (default 4, the
            value the original code hard-coded). Clamped to the number of
            singular values the matrix actually has.

    Each item is projected to a `numSV`-dimensional vector via
    ``dataMat.T * U[:, :k] * inverse(diag(Sigma[:k]))``; similarities are then
    measured between those vectors. One progress line is printed per rated
    item.

    Returns the similarity-weighted average of the user's ratings, or 0 when
    the similarities sum to 0.

    Raises:
        ValueError: if `numSV` is smaller than 1.
    """
    if numSV < 1:
        raise ValueError("numSV must be at least 1")
    dataMat = asmatrix(dataMat)
    n = shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    # NOTE: the decomposition is recomputed on every call; callers that score
    # many items on the same matrix pay for one SVD per item.
    U,Sigma,_ = la.svd(dataMat)
    # Never ask for more singular values than the matrix provides.
    k = min(numSV, len(Sigma))
    SigK = asmatrix(diag(Sigma[:k]))
    # Rows of xformedItems are the items expressed in the reduced space.
    xformedItems = dataMat.T * U[:,:k] * SigK.I
    for j in range(n):
        userRating = dataMat[user,j]
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xformedItems[item,:].T,xformedItems[j,:].T)
        print("the %d and %d Similarity is: %f" %(item,j,similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    else:
        return ratSimTotal / simTotal

In [84]:
recommand(myMat,1,estMethod = svdEst)

the 3 and 0 Similarity is: 0.441210
the 3 and 1 Similarity is: 0.523799
the 3 and 2 Similarity is: 0.650061
the 4 and 0 Similarity is: 0.561288
the 4 and 1 Similarity is: 0.475190
the 4 and 2 Similarity is: 0.343564


[(4, 2.813435807030927), (3, 2.5463659978758417)]

In [85]:
recommand(myMat,1,estMethod = svdEst,simMeas=pearsSim)

the 3 and 0 Similarity is: 0.074611
the 3 and 1 Similarity is: 0.602337
the 3 and 2 Similarity is: 0.641514
the 4 and 0 Similarity is: 0.136441
the 4 and 1 Similarity is: 0.576816
the 4 and 2 Similarity is: 0.241017


[(4, 2.285957188386649), (3, 2.1131786800996397)]

## 基于SVD的图像压缩

In [102]:
# 图像压缩函数
def printMat(inMat,thresh = 0.8):
    """Print a 2-D matrix as rows of 0/1 characters.

    An entry is printed as 1 when ``float(entry) > thresh`` and as 0
    otherwise. Every row of `inMat` is printed on its own line; the size is
    taken from ``inMat.shape`` rather than being fixed at 32x32, so smaller
    or larger matrices are handled as well.
    """
    numRows, numCols = inMat.shape[0], inMat.shape[1]
    for i in range(numRows):
        # Assemble the whole row first so it is written with a single print call.
        print(''.join('1' if float(inMat[i,k]) > thresh else '0' for k in range(numCols)))
def imgCompress(numSV = 3,thresh = 0.8,fileName = '0_5.txt'):
    """Print a 0/1 text image and its low-rank SVD reconstruction.

    Parameters:
        numSV: number of leading singular values used for the
            reconstruction. Clamped to the number of singular values
            available; the printed header reports the number actually used.
        thresh: threshold handed to `printMat` for both printouts.
        fileName: path of the text image; each non-blank line contributes one
            row built from its first 32 characters, each parsed with int().

    Raises:
        ValueError: if `numSV` is negative, or a character cannot be parsed
            as an integer.
    """
    if numSV < 0:
        raise ValueError("numSV must be non-negative")
    myl = []
    # The context manager closes the file even if parsing fails.
    with open(fileName) as imgFile:
        for line in imgFile:
            if not line.strip():
                # Ignore blank lines such as a trailing newline at end of file.
                continue
            myl.append([int(ch) for ch in line[:32]])
    imgMat = asmatrix(myl)
    print("*******original matrix******")
    printMat(imgMat,thresh)
    U,Sigma,VT = la.svd(imgMat)
    # Never ask for more singular values than the decomposition produced.
    numSV = min(numSV, len(Sigma))
    SigRecon = asmatrix(diag(Sigma[:numSV]))
    reconMat = U[:,:numSV] * SigRecon * VT[:numSV,:]
    print("****reconstructed matrix using %d singular values*****" %(numSV))
    printMat(reconMat,thresh)

In [103]:
imgCompress(2)

*******original matrix******
00000000000000110000000000000000
00000000000011111100000000000000
00000000000111111110000000000000
00000000001111111111000000000000
00000000111111111111100000000000
00000001111111111111110000000000
00000000111111111111111000000000
00000000111111100001111100000000
00000001111111000001111100000000
00000011111100000000111100000000
00000011111100000000111110000000
00000011111100000000011110000000
00000011111100000000011110000000
00000001111110000000001111000000
00000011111110000000001111000000
00000011111100000000001111000000
00000001111100000000001111000000
00000011111100000000001111000000
00000001111100000000001111000000
00000001111100000000011111000000
00000000111110000000001111100000
00000000111110000000001111100000
00000000111110000000001111100000
00000000111110000000011111000000
00000000111110000000111111000000
00000000111111000001111110000000
00000000011111111111111110000000
00000000001111111111111110000000
00000000001111111111111110000000
00000000000111