In [2]:
import numpy as np
import pandas as pd
import scipy.spatial.distance as ssd

### 读训练集，规范化数据 

In [3]:
# Load the training ratings. The file is tab-separated; only its first three
# columns are read and they are named user_id / item_id / rating.
df_training_data = pd.read_csv(
    './data/movielen_rating_training.base',
    sep='\t',
    usecols=[0, 1, 2],
    names=['user_id', 'item_id', 'rating'],
)
df_training_data.head()

Unnamed: 0,user_id,item_id,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [4]:
# Print a summary of the training frame (row count, column dtypes, memory usage).
df_training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 3 columns):
user_id    80000 non-null int64
item_id    80000 non-null int64
rating     80000 non-null int64
dtypes: int64(3)
memory usage: 1.8 MB


In [6]:
# Distinct user ids and item ids present in the training set.
user_ids = df_training_data['user_id'].unique()
item_ids = df_training_data['item_id'].unique()

# Number of distinct users and items; these fix the shape of the matrices below.
user_quantity, item_quantity = len(user_ids), len(item_ids)
print('用户数',user_quantity)
print('商品数',item_quantity)

用户数 943
商品数 1650


In [7]:
# Two-way lookup tables between raw ids and dense 0-based indices.
# The index of an id is its position in user_ids / item_ids.
user_index_to_id_dict = dict(enumerate(user_ids))
user_id_to_index_dict = {
    raw_id: position for position, raw_id in user_index_to_id_dict.items()
}

item_index_to_id_dict = dict(enumerate(item_ids))
item_id_to_index_dict = {
    raw_id: position for position, raw_id in item_index_to_id_dict.items()
}

In [9]:
# Replace the raw ids in the training frame with their dense indices.
# NOTE: this overwrites the id columns in place, so the cell is meant to be
# run once per freshly loaded frame.
df_training_data['user_id'] = df_training_data['user_id'].map(
    user_id_to_index_dict.__getitem__
)
df_training_data['item_id'] = df_training_data['item_id'].map(
    item_id_to_index_dict.__getitem__
)

In [11]:
# The id columns now hold indices, so rename them accordingly.
df_training_data = df_training_data.rename(
    columns={'user_id': 'user_index', 'item_id': 'item_index'}
)
df_training_data.head()

Unnamed: 0,user_index,item_index,rating
0,0,0,5
1,0,1,3
2,0,2,4
3,0,3,3
4,0,4,3


### 用户与商品的打分矩阵

In [13]:
# User x item rating matrix, initialised to zeros:
#     rows    = number of users
#     columns = number of items
# The matrix is floating point (not integer); 0 is used for "no rating".
user_item_rating_array = np.zeros((user_quantity, item_quantity), dtype=float)
user_item_rating_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
# Fill the user x item matrix with each user's rating of each item.
# If a (user, item) pair occurs more than once its ratings are averaged.
# A single groupby over both keys followed by one fancy-index assignment
# replaces the nested groupby and the per-element Python loop; the values
# written are the same.
mean_ratings = df_training_data.groupby(['user_index', 'item_index'])['rating'].mean()
rated_user_indices = mean_ratings.index.get_level_values('user_index').values
rated_item_indices = mean_ratings.index.get_level_values('item_index').values
user_item_rating_array[rated_user_indices, rated_item_indices] = mean_ratings.values
user_item_rating_array

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

### 商品与商品之间的相似度矩阵

In [18]:
# Item x item similarity matrix, initialised to zeros (square, one row and
# one column per item).
item_sim_array = np.zeros((item_quantity, item_quantity), dtype=float)
item_sim_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
# For every item index, the list of user indices that rated it
# (a rating of 0 in the matrix means "not rated").
item_rating_users = {
    item_index: np.flatnonzero(user_item_rating_array[:, item_index] > 0).tolist()
    for item_index in range(item_quantity)
}

In [53]:
# Pairwise item-item similarity, computed over the users who rated both items.
#
# The rater sets are built once up front; the original rebuilt the set for
# item_index2 on every iteration of the inner loop, which is loop-invariant
# work repeated for each of the ~item_quantity**2 / 2 pairs.
item_rater_sets = [
    set(item_rating_users[item_index]) for item_index in range(item_quantity)
]

for item_index1 in range(item_quantity):
    item_index1_rating_user_set = item_rater_sets[item_index1]
    for item_index2 in range(item_index1 + 1, item_quantity):
        # Users who rated BOTH items (set intersection, not a union).
        common_users = list(
            item_index1_rating_user_set & item_rater_sets[item_index2]
        )
        if not common_users:
            sim = 0
        else:
            # Number of common raters.
            n = len(common_users)
            # Rating vectors of the two items restricted to the common raters.
            v1 = user_item_rating_array[common_users, item_index1]
            v2 = user_item_rating_array[common_users, item_index2]
            # Cosine similarity (ssd.cosine returns the cosine *distance*).
            sim = 1 - ssd.cosine(v1, v2)
            # Reward pairs that share many raters: the weight grows with n.
            sim = sim * (n / (1 + np.log(1 + n)))
            if np.isnan(sim):
                sim = 0
        # The similarity is symmetric, so fill both triangles.
        item_sim_array[item_index1, item_index2] = sim
        item_sim_array[item_index2, item_index1] = sim
    # Progress indicator: one entry per completed row.
    print(item_index1,end='..')

0..1..2..3..4..5..6..7..8..9..10..11..12..13..14..15..16..17..18..19..20..21..22..23..24..25..26..27..28..29..30..31..32..33..34..35..36..37..38..39..40..41..42..43..44..45..46..47..48..49..50..51..52..53..54..55..56..57..58..59..60..61..62..63..64..65..66..67..68..69..70..71..72..73..74..75..76..77..78..79..80..81..82..83..84..85..86..87..88..89..90..91..92..93..94..95..96..97..98..99..100..101..102..103..104..105..106..107..108..109..110..111..112..113..114..115..116..117..118..119..120..121..122..123..124..125..126..127..128..129..130..131..132..133..134..135..136..137..138..139..140..141..142..143..144..145..146..147..148..149..150..151..152..153..154..155..156..157..158..159..160..161..162..163..164..165..166..167..168..169..170..171..172..173..174..175..176..177..178..179..180..181..182..183..184..185..186..187..188..189..190..191..192..193..194..195..196..197..198..199..200..201..202..203..204..205..206..207..208..209..210..211..212..213..214..215..216..217..218..219..220..221..

..1552..1553..1554..1555..1556..1557..1558..1559..1560..1561..1562..1563..1564..1565..1566..1567..1568..1569..1570..1571..1572..1573..1574..1575..1576..1577..1578..1579..1580..1581..1582..1583..1584..1585..1586..1587..1588..1589..1590..1591..1592..1593..1594..1595..1596..1597..1598..1599..1600..1601..1602..1603..1604..1605..1606..1607..1608..1609..1610..1611..1612..1613..1614..1615..1616..1617..1618..1619..1620..1621..1622..1623..1624..1625..1626..1627..1628..1629..1630..1631..1632..1633..1634..1635..1636..1637..1638..1639..1640..1641..1642..1643..1644..1645..1646..1647..1648..1649..

In [59]:
# Normalise: divide every row by that row's maximum, so each item's best
# match scores 1.
# The maximum is taken per row (axis=1). The previous column maximum (axis=0)
# only equals the row maximum while the matrix is still symmetric, so
# re-running the cell silently changed the values. With the row maximum a
# second run divides every non-empty row by 1 and leaves the matrix unchanged.
row_max = item_sim_array.max(axis=1, keepdims=True)
# Rows without any positive similarity stay all-zero instead of becoming NaN (0/0).
row_max[row_max == 0] = 1
item_sim_array = item_sim_array / row_max

In [60]:
# Keep three decimal places.
item_sim_array = np.round(item_sim_array, decimals=3)

In [61]:
# Display the normalised, rounded item-item similarity matrix.
item_sim_array

array([[0.   , 0.326, 0.265, ..., 0.   , 0.015, 0.015],
       [0.846, 0.   , 0.303, ..., 0.   , 0.038, 0.038],
       [0.98 , 0.431, 0.   , ..., 0.   , 0.   , 0.054],
       ...,
       [0.   , 0.   , 0.   , ..., 0.   , 0.   , 0.   ],
       [1.   , 1.   , 0.   , ..., 0.   , 0.   , 0.   ],
       [1.   , 1.   , 1.   , ..., 0.   , 0.   , 0.   ]])

### 为用户生成推荐

In [62]:
# Build the recommendation list of every user: items whose normalised
# similarity to at least one of the user's liked items (rating >= 4) is
# >= 0.8, minus everything the user has already rated.
user_recommend = {}

for user_index in range(user_quantity):
    ratings_of_user = user_item_rating_array[user_index]
    # Items the user liked.
    this_user_fav_items = np.flatnonzero(ratings_of_user >= 4).tolist()
    # Items the user rated at all.
    this_user_rated_items = np.flatnonzero(ratings_of_user > 0).tolist()
    # Candidate items, still including ones the user has already rated:
    # a column qualifies if any liked item's row reaches the threshold.
    close_to_a_fav = (item_sim_array[this_user_fav_items] >= 0.8).any(axis=0)
    recommend1 = np.flatnonzero(close_to_a_fav).tolist()
    user_recommend[user_index] = list(set(recommend1) - set(this_user_rated_items))

### 读测试集，并验证

In [63]:
# Load the test ratings with the same layout as the training file:
# tab-separated, first three columns named user_id / item_id / rating.
df_text_data = pd.read_csv(
    './data/movielen_rating_test.base',
    names=['user_id', 'item_id', 'rating'],
    usecols=[0, 1, 2],
    sep='\t',
)
df_text_data.head()

Unnamed: 0,user_id,item_id,rating
0,1,6,5
1,1,10,3
2,1,12,5
3,1,14,5
4,1,17,3


In [64]:
# Print a summary of the test frame (row count, column dtypes, memory usage).
df_text_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
user_id    20000 non-null int64
item_id    20000 non-null int64
rating     20000 non-null int64
dtypes: int64(3)
memory usage: 468.8 KB


In [65]:
# Drop test rows whose user id never appears in the training set.
seen_in_training_user = df_text_data['user_id'].isin(user_ids)
df_text_data = df_text_data[seen_in_training_user]

In [66]:
# Drop test rows whose item id never appears in the training set.
seen_in_training_item = df_text_data['item_id'].isin(item_ids)
df_text_data = df_text_data[seen_in_training_item]

In [67]:
# Convert raw ids to dense indices in the test frame.
# df_text_data is the result of boolean filtering, so assigning columns into
# it directly triggers pandas' SettingWithCopyWarning. `assign` builds a new
# frame with the converted columns instead (column order is unchanged).
df_text_data = df_text_data.assign(
    user_id=df_text_data['user_id'].apply(
        lambda user_id: user_id_to_index_dict[user_id]
    ),
    item_id=df_text_data['item_id'].apply(
        lambda item_id: item_id_to_index_dict[item_id]
    ),
)


In [68]:
# The id columns now hold indices, so rename them accordingly.
df_text_data = df_text_data.rename(
    columns={'user_id': 'user_index', 'item_id': 'item_index'}
)
df_text_data.head()

Unnamed: 0,user_index,item_index,rating
0,0,745,5
1,0,135,3
2,0,269,5
3,0,136,5
4,0,538,3


In [69]:
# Ground truth for the evaluation: for every user in the test set, the items
# whose (mean) test rating is >= 4. Users with no such item map to an empty list.
user_fav = {}

test_mean_ratings = df_text_data.groupby(['user_index', 'item_index'])['rating'].mean()
for user_index, ratings_of_user in test_mean_ratings.groupby(level='user_index'):
    liked = ratings_of_user[ratings_of_user >= 4]
    user_fav[user_index] = liked.index.get_level_values('item_index').tolist()

In [70]:
# Evaluate the recommendations against the test-set favourites, counting only
# users that appear in both user_recommend and user_fav.
recommend_quantity = 0  # total number of recommended items
fav_quantity = 0        # total number of liked items in the test set
union_quantity = 0      # hits: items that were both recommended and liked

for user_index, recommended_items in user_recommend.items():
    if user_index not in user_fav:
        continue
    liked_items = user_fav[user_index]
    recommend_quantity += len(recommended_items)
    fav_quantity += len(liked_items)
    union_quantity += len(set(recommended_items) & set(liked_items))

# Both ratios are undefined when their denominator is 0 (nothing recommended,
# or no liked item in the test set); report NaN instead of raising
# ZeroDivisionError.
precision = union_quantity / recommend_quantity if recommend_quantity else float('nan')
recall = union_quantity / fav_quantity if fav_quantity else float('nan')
print('准确率',precision)
print('召回率',recall)

准确率 0.22174421323357493
召回率 0.422374632582168
