In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import warnings
warnings.filterwarnings('ignore')

### 一、基于项目的协同过滤推荐实现


#### 1.读取数据

In [2]:
# u.data is tab-separated and has no header line, so the column names are passed in explicitly.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
user_data = pd.read_csv(
    '../data/u.data',
    names=columns,
    sep='\t',
)
user_data

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [3]:
# Print the column dtypes, non-null counts and memory usage of the ratings table.
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   item_id    100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [4]:
# Column layout of u.item: identifying fields first, then one 0/1 flag column per genre.
columns = ['item_id', 'movie title', 'release date',
           'video release date', 'IMDb URL', 'unknown',
           'Action', 'Adventure', 'Animation', 'Childrens',
           'Comedy', 'Crime', 'Documentary', 'Drama',
           'Fantasy', 'Film-Noir', 'Horror',
           'Musical', 'Mystery', 'Romance', 'Sci-Fi',
           'Thriller', 'War', 'Western']
movies_data = pd.read_csv('../data/u.item',
                          sep='|',
                          names=columns,
                          encoding='latin-1')
# The similarity computation further down is expensive, so only a random sample
# of 300 movies is kept. The fixed random_state makes the sample, and therefore
# every result derived from it, identical on each Restart & Run All.
movies = movies_data.sample(n=300, random_state=42)
movies.head(3)

Unnamed: 0,item_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
489,490,To Catch a Thief (1955),01-Jan-1955,,http://us.imdb.com/M/title-exact?To%20Catch%20...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
43,44,Dolores Claiborne (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Dolores%20Cla...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1261,1262,Walking and Talking (1996),12-Jul-1996,,http://us.imdb.com/M/title-exact?Walking%20and...,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


#### 2.合并客户评级表和电影信息表

In [5]:
# Keep only the id and title of the sampled movies, then attach the title to each rating.
# The merge is an inner join on item_id, so ratings of movies outside the sample are dropped.
movie_names = movies.loc[:, ['item_id', 'movie title']]
combined_movies_data = user_data.merge(movie_names, on='item_id')
combined_movies_data

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,22,377,1,878887116,Heavyweights (1994)
1,5,377,1,878844615,Heavyweights (1994)
2,314,377,3,877890982,Heavyweights (1994)
3,13,377,1,882399219,Heavyweights (1994)
4,222,377,1,881060205,Heavyweights (1994)
...,...,...,...,...,...
17532,405,1567,1,885547123,Careful (1992)
17533,787,1671,1,888980193,"Further Gesture, A (1996)"
17534,181,1325,1,878962816,August (1996)
17535,655,1636,4,887473570,Brothers in Trouble (1995)


#### 3.生成客户-电影矩阵表


In [6]:
# Build the user x movie rating matrix: one row per user, one column per movie title.
# (user, movie) pairs without a rating are filled with 0, i.e. "not rated".
rating_crosstab = pd.pivot_table(
    combined_movies_data,
    index='user_id',
    columns='movie title',
    values='rating',
    fill_value=0,
)
rating_crosstab

movie title,1-900 (1994),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),Above the Rim (1994),Ace Ventura: Pet Detective (1994),"Age of Innocence, The (1993)",Alien (1979),Alphaville (1965),Amateur (1994),"Amazing Panda Adventure, The (1995)",...,Warriors of Virtue (1997),When Night Is Falling (1995),White Man's Burden (1995),Wings of Courage (1995),Wings of Desire (1987),"Wings of the Dove, The (1997)",Witness (1985),"World of Apu, The (Apur Sansar) (1959)",You So Crazy (1994),Zeus and Roxanne (1997)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,4,0,3,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,5,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,4,0,0,3,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
940,0,0,0,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
942,0,0,3,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 4.计算电影之间的相似度
在计算电影相似度前创建临时数据表存放相关系数，然后用for循环计算任意两部电影之间的相似系数，结果存放到临时表中    

In [7]:
# Empty movie x movie table (all NaN) that will hold the pairwise similarity scores.
data_temp = pd.DataFrame(
    columns=rating_crosstab.columns,
    index=rating_crosstab.columns,
)
data_temp

movie title,1-900 (1994),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),Above the Rim (1994),Ace Ventura: Pet Detective (1994),"Age of Innocence, The (1993)",Alien (1979),Alphaville (1965),Amateur (1994),"Amazing Panda Adventure, The (1995)",...,Warriors of Virtue (1997),When Night Is Falling (1995),White Man's Burden (1995),Wings of Courage (1995),Wings of Desire (1987),"Wings of the Dove, The (1997)",Witness (1985),"World of Apu, The (Apur Sansar) (1959)",You So Crazy (1994),Zeus and Roxanne (1997)
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-900 (1994),,,,,,,,,,,...,,,,,,,,,,
"20,000 Leagues Under the Sea (1954)",,,,,,,,,,,...,,,,,,,,,,
2001: A Space Odyssey (1968),,,,,,,,,,,...,,,,,,,,,,
Above the Rim (1994),,,,,,,,,,,...,,,,,,,,,,
Ace Ventura: Pet Detective (1994),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Wings of the Dove, The (1997)",,,,,,,,,,,...,,,,,,,,,,
Witness (1985),,,,,,,,,,,...,,,,,,,,,,
"World of Apu, The (Apur Sansar) (1959)",,,,,,,,,,,...,,,,,,,,,,
You So Crazy (1994),,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Compute the cosine similarity between every pair of movies with a for loop
# and store the result in data_temp.
# Each movie is represented by its column of rating_crosstab (one value per user).
rating_vectors = rating_crosstab.to_numpy(dtype=float).T  # one row per movie
n_movies = len(rating_vectors)
similarity = np.empty((n_movies, n_movies), dtype=float)
for i in range(n_movies):
    # Cosine similarity is symmetric, so only pairs with j >= i are evaluated
    # and the value is mirrored; this halves the number of distance calls.
    for j in range(i, n_movies):
        score = 1 - cosine(rating_vectors[i], rating_vectors[j])
        similarity[i, j] = score
        similarity[j, i] = score
# Working on plain NumPy arrays avoids per-cell pandas indexing inside the loop
# and yields a float (rather than object) similarity table.
data_temp = pd.DataFrame(similarity,
                         index=rating_crosstab.columns,
                         columns=rating_crosstab.columns)

In [9]:
# Show the movie-to-movie similarity matrix.
data_temp

movie title,1-900 (1994),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),Above the Rim (1994),Ace Ventura: Pet Detective (1994),"Age of Innocence, The (1993)",Alien (1979),Alphaville (1965),Amateur (1994),"Amazing Panda Adventure, The (1995)",...,Warriors of Virtue (1997),When Night Is Falling (1995),White Man's Burden (1995),Wings of Courage (1995),Wings of Desire (1987),"Wings of the Dove, The (1997)",Witness (1985),"World of Apu, The (Apur Sansar) (1959)",You So Crazy (1994),Zeus and Roxanne (1997)
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-900 (1994),1,0.010055,0.067038,0.0,0.064139,0.005358,0.060471,0.388228,0.176805,0.031798,...,0.0,0.0,0.020752,0.0,0.160592,0.0,0.0,0.0,0.0,0.07964
"20,000 Leagues Under the Sea (1954)",0.010055,1,0.456281,0.086562,0.244643,0.272227,0.44216,0.023606,0.089939,0.092806,...,0.214299,0.03863,0.148055,0.098907,0.108365,0.083422,0.0,0.094304,0.098907,0.017218
2001: A Space Odyssey (1968),0.067038,0.456281,1,0.084021,0.280522,0.320101,0.60581,0.162802,0.144732,0.063215,...,0.112173,0.065124,0.111392,0.075792,0.339092,0.11368,0.075792,0.114179,0.045475,0.026388
Above the Rim (1994),0.0,0.086562,0.084021,1,0.192812,0.0,0.109484,0.0,0.0,0.0,...,0.175038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ace Ventura: Pet Detective (1994),0.064139,0.244643,0.280522,0.192812,1,0.162558,0.347172,0.02868,0.129078,0.147209,...,0.05708,0.0352,0.167617,0.0,0.052727,0.032055,0.090126,0.04583,0.060084,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Wings of the Dove, The (1997)",0.0,0.083422,0.11368,0.0,0.032055,0.209946,0.096277,0.016977,0.060294,0.063567,...,0.0,0.03572,0.0,0.0,0.174137,1,0.0,0.0436,0.0,0.021227
Witness (1985),0.0,0.0,0.075792,0.0,0.090126,0.0,0.056648,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0
"World of Apu, The (Apur Sansar) (1959)",0.0,0.094304,0.114179,0.0,0.04583,0.00335,0.082368,0.0,0.130051,0.0,...,0.0,0.0,0.012975,0.0,0.097364,0.0436,0.0,1,0.0,0.0
You So Crazy (1994),0.0,0.098907,0.045475,0.0,0.060084,0.070273,0.056648,0.0,0.0,0.0,...,0.0,0.390567,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0


#### 5.获取相似度系数最大的前10部电影

In [10]:
# Empty table with one row per movie and 11 slots (columns 0-10) for the titles
# of its most similar movies.
data_neighbours = pd.DataFrame(columns=range(11), index=data_temp.columns)
data_neighbours.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-900 (1994),,,,,,,,,,,
"20,000 Leagues Under the Sea (1954)",,,,,,,,,,,
2001: A Space Odyssey (1968),,,,,,,,,,,
Above the Rim (1994),,,,,,,,,,,
Ace Ventura: Pet Detective (1994),,,,,,,,,,,


In [11]:
# For each movie, store the titles of the 11 entries with the highest similarity.
# NOTE: the movie itself is part of that list (its self-similarity is 1, so it is
# normally ranked first); the remaining 10 entries are its actual neighbours.
for col_pos in range(len(data_temp.columns)):
    ranked_scores = data_temp.iloc[:, col_pos].sort_values(ascending=False)
    data_neighbours.iloc[col_pos, :11] = ranked_scores[:11].index

In [None]:
# Show the table of most similar movie titles per movie.
data_neighbours

### 二、基于用户的协同过滤推荐
思想：    
背景：用户1没有看过电影A，求用户对电影A的喜欢度是多少。
与A电影相似的10部电影中，假定用户1都看过，评分也比较高，那么推测用户1，也比较喜欢电影A    

对于用户1已经看过的电影，将其喜欢度设置为0：喜欢度为0的电影不会被推荐，因为用户已经看过了。

#### 创建临时数据表data_sims,用于存放用户-电影的相似度

In [12]:
# Empty user x movie table (all NaN) that will receive the predicted score of
# every movie for every user.
data_sims = pd.DataFrame(columns=rating_crosstab.columns, index=rating_crosstab.index)
data_sims

movie title,1-900 (1994),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),Above the Rim (1994),Ace Ventura: Pet Detective (1994),"Age of Innocence, The (1993)",Alien (1979),Alphaville (1965),Amateur (1994),"Amazing Panda Adventure, The (1995)",...,Warriors of Virtue (1997),When Night Is Falling (1995),White Man's Burden (1995),Wings of Courage (1995),Wings of Desire (1987),"Wings of the Dove, The (1997)",Witness (1985),"World of Apu, The (Apur Sansar) (1959)",You So Crazy (1994),Zeus and Roxanne (1997)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


#### 用相似度评分填充data_sims

In [13]:
# Similarity-weighted scoring function
def getScore(history, similarities):
    """Return the similarity-weighted average of a user's ratings.

    Parameters
    ----------
    history : array-like or pandas.Series
        The user's ratings of the neighbouring movies.
    similarities : array-like or pandas.Series
        Similarity of each neighbouring movie to the target movie. When both
        arguments are pandas Series the element-wise product aligns on index.

    Returns
    -------
    float
        sum(history * similarities) / sum(similarities), or 0.0 when the
        similarities sum to zero (no neighbour carries any weight), instead of
        NaN or a ZeroDivisionError.
    """
    weight_total = sum(similarities)
    if weight_total == 0:
        # No usable neighbour: there is no evidence to recommend this movie.
        return 0.0
    return sum(history * similarities) / weight_total

In [None]:
# Fill data_sims with a predicted score for every (user, movie) pair.
# Movies the user has already rated get a score of 0, because there is no need
# to recommend them again.
for j, product in enumerate(data_sims.columns):
    # The neighbour titles and their similarities depend only on the movie, so
    # they are looked up once per column instead of once per (user, movie) cell.
    # Slot 0 is skipped because it normally holds the movie itself.
    product_top_names = data_neighbours.loc[product].iloc[1:11]  # 10 most similar movies
    product_top_sims = data_temp.loc[product].sort_values(ascending=False).iloc[1:11]  # their similarities

    for i, user in enumerate(data_sims.index):
        # Single-step .iloc[i, j] is used for both reading and writing:
        # data_sims.iloc[i][j] = ... is chained indexing, which assigns into a
        # temporary row object and is silently lost under pandas Copy-on-Write.
        if rating_crosstab.iloc[i, j] >= 1:
            # Already rated by this user -> never recommend it.
            data_sims.iloc[i, j] = 0
        else:
            # The user's ratings of the 10 neighbouring movies.
            user_purchases = rating_crosstab.loc[user, product_top_names]
            data_sims.iloc[i, j] = getScore(user_purchases, product_top_sims)

In [None]:
# Show the predicted user x movie scores.
data_sims