# Chapter02 提供推荐
## 协作型过滤
## 搜集偏好

In [1]:
# Sample preference data: a nested dictionary mapping each critic's name to a
# dictionary of {movie title: rating}. Not every critic has rated every movie
# (for example, 'Toby' has only three ratings), so the similarity functions
# must work on the subset of titles that two critics share.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5
    },
    # Toby is the user the later examples generate recommendations for.
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0
    }
}

In [2]:
critics['Lisa Rose']['Lady in the Water']

2.5

In [3]:
critics['Lisa Rose']['Lady in the Water'] = 3.5
critics['Lisa Rose']

{'Lady in the Water': 3.5,
 'Snakes on a Plane': 3.5,
 'Just My Luck': 3.0,
 'Superman Returns': 3.5,
 'You, Me and Dupree': 2.5,
 'The Night Listener': 3.0}

In [4]:
critics['Lisa Rose']['Lady in the Water'] = 2.5
critics['Lisa Rose']

{'Lady in the Water': 2.5,
 'Snakes on a Plane': 3.5,
 'Just My Luck': 3.0,
 'Superman Returns': 3.5,
 'You, Me and Dupree': 2.5,
 'The Night Listener': 3.0}

## 寻找相近的用户
- 欧几里得距离
- 皮尔逊相关度

### 欧几里得距离

欧几里得距离为 $d = \sqrt{\sum_{i=1}^{n} (x_i-y_i)^2}$；相似度取 $\frac{1}{1 + d}$，该值越接近 1，则两个用户越相近。

In [59]:
from math import sqrt


def sim_distance(prefs, person1, person2):
    """Return a distance-based similarity score for person1 and person2.

    The score is ``1 / (1 + d)``, where ``d`` is the Euclidean distance
    between the two people's ratings over the items that both have rated.

    Args:
        prefs: Mapping of person -> {item: rating}.
        person1: Key of the first person in ``prefs``.
        person2: Key of the second person in ``prefs``.

    Returns:
        0 when the two people share no rated items; otherwise a float that
        equals 1 when all shared ratings are identical and shrinks toward 0
        as the ratings diverge.
    """
    # Items rated by both people, kept in person1's iteration order.
    shared = [item for item in prefs[person1] if item in prefs[person2]]
    if not shared:
        return 0

    mine = prefs[person1]
    theirs = prefs[person2]

    # Accumulate the squared rating differences over the shared items.
    squared_total = 0
    for item in shared:
        squared_total += pow(mine[item] - theirs[item], 2)

    return 1 / (1 + sqrt(squared_total))

In [60]:
d = sim_distance(critics, 'Lisa Rose', 'Gene Seymour')
print(d)

0.29429805508554946


In [61]:
d = sim_distance(critics, 'Lisa Rose', 'Claudia Puig')
print(d)

0.38742588672279304


### 皮尔逊相关度评价
其值介于 `-1` 与 `1` 之间，其中 `1` 表示变量完全正相关，`0` 表示无关，`-1` 表示完全负相关。负相关关系是指两个现象的变化方向相反，也可以理解为事态发展中的对立关系。

负相关关系用通俗的话说，负相关就是两个量，其中一个变大时，另一个就变小，一个变小时，另一个就变大，反之亦然。

$$r= \frac {\sum X Y - \frac{\sum X  \sum Y}{N}}{\sqrt{(\sum X^2 - \frac{(\sum X)^2}{N}) (\sum Y^2 - \frac{(\sum Y)^2}{N})}}$$

In [62]:
def sim_pearson(prefs, p1, p2):
    """Return the Pearson correlation coefficient for p1 and p2.

    Only items rated by both p1 and p2 are taken into account.

    Args:
        prefs: Mapping of person -> {item: rating}.
        p1: Key of the first person in ``prefs``.
        p2: Key of the second person in ``prefs``.

    Returns:
        0 when there are no ratings in common or when either person's shared
        ratings have no spread (the correlation is undefined there);
        otherwise the correlation as a float, nominally in [-1, 1].
    """
    # Items rated by both people, kept in p1's iteration order.
    shared = [item for item in prefs[p1] if item in prefs[p2]]
    n = len(shared)
    # If there are no ratings in common, return 0.
    if n == 0:
        return 0

    ratings1 = [prefs[p1][item] for item in shared]
    ratings2 = [prefs[p2][item] for item in shared]

    # Sums of all the preferences.
    sum1 = sum(ratings1)
    sum2 = sum(ratings2)

    # Sums of the squares.
    sum1_sq = sum([pow(x, 2) for x in ratings1])
    sum2_sq = sum([pow(y, 2) for y in ratings2])

    # Sum of the products.
    p_sum = sum([x * y for x, y in zip(ratings1, ratings2)])

    # Sum of squared deviations from the mean for each person.
    dev1 = sum1_sq - pow(sum1, 2) / n
    dev2 = sum2_sq - pow(sum2, 2) / n

    # Mathematically these terms are >= 0, but floating-point rounding can
    # push a zero spread slightly negative. Treat that like zero spread
    # instead of letting sqrt() raise ValueError or return a meaningless ratio.
    if dev1 <= 0 or dev2 <= 0:
        return 0

    # Calculate r (Pearson score).
    num = p_sum - (sum1 * sum2 / n)
    den = sqrt(dev1 * dev2)
    # The product of two tiny positive terms can still underflow to zero.
    if den == 0:
        return 0

    return num / den

In [63]:
person_distance = sim_pearson(critics, 'Lisa Rose', 'Gene Seymour')
print(person_distance)

0.39605901719066977


In [64]:
person_distance = sim_pearson(critics, 'Lisa Rose', 'Claudia Puig')
print(person_distance)

0.5669467095138396


### 其他算法
- Jaccard 系数
- 曼哈顿距离算法

了解其他用于比较的度量算法：https://en.wikipedia.org/wiki/Metric_%28mathematics%29#Examples

### 为评论者打分

In [96]:
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    """Return the best matches for ``person`` from ``prefs``.

    Args:
        prefs: Mapping of key -> {inner key: rating}.
        person: The key in ``prefs`` to find matches for.
        n: Maximum number of results to return.
        similarity: Callable ``(prefs, a, b) -> score`` used to compare
            ``person`` with every other key.

    Returns:
        A list of at most ``n`` ``(score, other)`` tuples, highest score
        first.
    """
    scored = []
    for candidate in prefs:
        # Never compare the person with themselves.
        if candidate != person:
            scored.append((similarity(prefs, person, candidate), candidate))

    # Ascending sort, then flip so the highest scores come first.
    best_first = sorted(scored)[::-1]
    return best_first[:n]

In [97]:
topMatches(critics, 'Toby', n=3)

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig')]

## 推荐物品

In [75]:
def getRecommendations(prefs, person, similarity=sim_pearson):
    """Recommend items for ``person`` using a similarity-weighted average.

    Every other entry in ``prefs`` whose similarity to ``person`` is positive
    contributes its ratings, weighted by that similarity, for items that
    ``person`` has not rated (or has rated 0).

    Args:
        prefs: Mapping of person -> {item: rating}.
        person: The key in ``prefs`` to produce recommendations for.
        similarity: Callable ``(prefs, a, b) -> score``.

    Returns:
        A list of ``(predicted rating, item)`` tuples, highest prediction
        first.
    """
    weighted_totals = {}
    similarity_sums = {}

    for other in prefs:
        # Do not compare the person with themselves.
        if other == person:
            continue

        weight = similarity(prefs, person, other)
        # Ignore scores of zero or lower.
        if weight <= 0:
            continue

        for item in prefs[other]:
            # Skip items the person has already given a non-zero rating.
            if item in prefs[person] and prefs[person][item] != 0:
                continue

            # Running sum of similarity * rating for this item.
            weighted_totals[item] = (
                weighted_totals.get(item, 0) + prefs[other][item] * weight)
            # Running sum of the similarities that contributed to this item.
            similarity_sums[item] = similarity_sums.get(item, 0) + weight

    # Normalise each total by the similarities that went into it.
    ascending = sorted(
        [(weighted_totals[item] / similarity_sums[item], item)
         for item in weighted_totals])

    # Flip so the highest predictions come first.
    return ascending[::-1]

In [76]:
getRecommendations(critics,'Toby')

[(3.3477895267131017, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.530980703765565, 'Just My Luck')]

In [77]:
getRecommendations(critics,'Toby',similarity=sim_distance)

[(3.457128694491423, 'The Night Listener'),
 (2.778584003814924, 'Lady in the Water'),
 (2.422482042361917, 'Just My Luck')]

## 匹配商品
先转换字典，将
```
{'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5}}
```
转换成：
```
{'Lady in the Water':{'Lisa Rose':2.5,'Gene Seymour':3.0},
'Snakes on a Plane':{'Lisa Rose':3.5,'Gene Seymour':3.5}} etc..
```

In [84]:
def transformPrefs(prefs):
    """Swap the two nesting levels of ``prefs``.

    Args:
        prefs: Mapping of person -> {item: rating}.

    Returns:
        A new mapping of item -> {person: rating}; ``prefs`` itself is not
        modified.
    """
    flipped = {}
    for person, ratings in prefs.items():
        for item, rating in ratings.items():
            # Create the per-item dictionary the first time the item is seen.
            if item not in flipped:
                flipped[item] = {}
            flipped[item][person] = rating
    return flipped

In [87]:
movies = transformPrefs(critics)
movies

{'Lady in the Water': {'Lisa Rose': 2.5,
  'Gene Seymour': 3.0,
  'Michael Phillips': 2.5,
  'Mick LaSalle': 3.0,
  'Jack Matthews': 3.0},
 'Snakes on a Plane': {'Lisa Rose': 3.5,
  'Gene Seymour': 3.5,
  'Michael Phillips': 3.0,
  'Claudia Puig': 3.5,
  'Mick LaSalle': 4.0,
  'Jack Matthews': 4.0,
  'Toby': 4.5},
 'Just My Luck': {'Lisa Rose': 3.0,
  'Gene Seymour': 1.5,
  'Claudia Puig': 3.0,
  'Mick LaSalle': 2.0},
 'Superman Returns': {'Lisa Rose': 3.5,
  'Gene Seymour': 5.0,
  'Michael Phillips': 3.5,
  'Claudia Puig': 4.0,
  'Mick LaSalle': 3.0,
  'Jack Matthews': 5.0,
  'Toby': 4.0},
 'You, Me and Dupree': {'Lisa Rose': 2.5,
  'Gene Seymour': 3.5,
  'Claudia Puig': 2.5,
  'Mick LaSalle': 2.0,
  'Jack Matthews': 3.5,
  'Toby': 1.0},
 'The Night Listener': {'Lisa Rose': 3.0,
  'Gene Seymour': 3.0,
  'Michael Phillips': 4.0,
  'Claudia Puig': 4.5,
  'Mick LaSalle': 3.0,
  'Jack Matthews': 3.0}}

然后调用 `topMatches` 函数，得到一组与 *Superman Returns* 最为相近的影片：

In [89]:
topMatches(movies, 'Superman Returns')

[(0.6579516949597695, 'You, Me and Dupree'),
 (0.4879500364742689, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.1798471947990544, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

我们也可以为影片推荐评论者：

In [90]:
getRecommendations(movies, 'Just My Luck')

[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]

# 构建一个基于 del.icio.us 的链接推荐系统
> http://del.icio.us

[下载 pydelicious](https://code.google.com/archive/p/pydelicious/downloads) 并安装。这是一个 Python 2 的版本；在 Python 3 下，需要[下载](http://storage.googleapis.com/google-code-attachments/pydelicious/issue-43/comment-12/__init__.py)这个 `__init__.py`，并用它覆盖 `pydelicious/__init__.py`。

In [None]:
import pydelicious
pydelicious.get_popular(tag = 'programming')