In [4]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import warnings
warnings.filterwarnings('ignore')

### 一、基于项目的协同过滤推荐实现


#### 1.读取数据

In [6]:
# Column names are supplied explicitly through names= when reading the
# tab-separated ratings file.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
user_data = pd.read_csv(
    '../data/u.data',
    names=columns,
    sep='\t',
)
user_data

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [7]:
# Print a concise summary of user_data: column dtypes, non-null counts and memory usage.
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   item_id    100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [10]:
# Column names for the pipe-separated movie file, supplied explicitly via names=.
columns = ['item_id', 'movie title', 'release date', 
           'video release date', 'IMDb URL', 'unknown', 
           'Action', 'Adventure','Animation', 'Childrens',
           'Comedy', 'Crime', 'Documentary', 'Drama', 
           'Fantasy', 'Film-Noir', 'Horror',
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
           'Thriller', 'War', 'Western']
movies_data = pd.read_csv('../data/u.item', 
                     sep='|', 
                     names=columns, 
                     encoding='latin-1')
# The later pairwise computations are expensive, so work on a random sample of
# 300 movies. A fixed random_state keeps the sample (and every result derived
# from it) identical across "Restart Kernel -> Run All" executions.
movies=movies_data.sample(n=300, random_state=42)
movies.head(3)

Unnamed: 0,item_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
352,353,Deep Rising (1998),30-Jan-1998,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
281,282,"Time to Kill, A (1996)",13-Jul-1996,,http://us.imdb.com/M/title-exact?Time%20to%20K...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
437,438,Amityville 3-D (1983),01-Jan-1983,,http://us.imdb.com/M/title-exact?Amityville%20...,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


#### 2.合并客户评级表和电影信息表

In [14]:
# Keep only the id and title of the sampled movies, then join them onto the
# ratings by item_id (default inner join, so ratings of movies that are not in
# the sample are dropped).
movie_names = movies.loc[:, ['item_id', 'movie title']]
combined_movies_data = user_data.merge(movie_names, on='item_id')
combined_movies_data

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,13,353,4,886261450,Deep Rising (1998)
1,130,353,1,888211764,Deep Rising (1998)
2,532,353,2,886364951,Deep Rising (1998)
3,240,353,1,885775959,Deep Rising (1998)
4,587,353,2,892871706,Deep Rising (1998)
...,...,...,...,...,...
16619,300,1094,5,875650298,"Thin Line Between Love and Hate, A (1996)"
16620,314,1094,1,877887065,"Thin Line Between Love and Hate, A (1996)"
16621,851,1094,1,875730455,"Thin Line Between Love and Hate, A (1996)"
16622,342,1094,3,874984873,"Thin Line Between Love and Hate, A (1996)"


#### 3.生成客户-电影矩阵表


In [25]:
# Build the user x movie rating matrix: one row per user_id, one column per
# movie title, with 0 wherever a user has no rating for a movie.
rating_crosstab = pd.pivot_table(
    combined_movies_data,
    values='rating',
    index='user_id',
    columns='movie title',
    fill_value=0,
)
rating_crosstab

movie title,187 (1997),8 Heads in a Duffel Bag (1997),Afterglow (1997),Air Force One (1997),Alphaville (1965),American Strays (1996),Amityville 3-D (1983),"Amityville Horror, The (1979)",Amityville: A New Generation (1993),Amityville: Dollhouse (1996),...,Wend Kuuni (God's Gift) (1982),When We Were Kings (1996),White Man's Burden (1995),White Squall (1996),Wild America (1997),Wild Bill (1995),"Wild Bunch, The (1969)",Wishmaster (1997),With Honors (1994),Young Guns (1988)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
2,0,0,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
940,0,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
941,0,0,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
942,0,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 4.计算电影之间的相似度
在计算电影相似度前，先创建临时数据表用于存放相似系数；然后逐对计算任意两部电影之间的余弦相似度，并将结果存放到该临时表中。    

In [27]:
# Empty movie x movie table (all NaN) that will hold the pairwise similarities;
# both axes are labelled with the movie titles of rating_crosstab.
data_temp = pd.DataFrame(
    columns=rating_crosstab.columns,
    index=rating_crosstab.columns,
)
data_temp

movie title,187 (1997),8 Heads in a Duffel Bag (1997),Afterglow (1997),Air Force One (1997),Alphaville (1965),American Strays (1996),Amityville 3-D (1983),"Amityville Horror, The (1979)",Amityville: A New Generation (1993),Amityville: Dollhouse (1996),...,Wend Kuuni (God's Gift) (1982),When We Were Kings (1996),White Man's Burden (1995),White Squall (1996),Wild America (1997),Wild Bill (1995),"Wild Bunch, The (1969)",Wishmaster (1997),With Honors (1994),Young Guns (1988)
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
187 (1997),,,,,,,,,,,...,,,,,,,,,,
8 Heads in a Duffel Bag (1997),,,,,,,,,,,...,,,,,,,,,,
Afterglow (1997),,,,,,,,,,,...,,,,,,,,,,
Air Force One (1997),,,,,,,,,,,...,,,,,,,,,,
Alphaville (1965),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wild Bill (1995),,,,,,,,,,,...,,,,,,,,,,
"Wild Bunch, The (1969)",,,,,,,,,,,...,,,,,,,,,,
Wishmaster (1997),,,,,,,,,,,...,,,,,,,,,,
With Honors (1994),,,,,,,,,,,...,,,,,,,,,,


In [28]:
# Compute the cosine similarity between every pair of movies (the columns of
# rating_crosstab) and store the result in data_temp.
#
# The whole matrix is computed in one shot: cos(a, b) = a.b / (|a| * |b|), so
# the Gram matrix R^T R divided by the outer product of the column norms gives
# all pairwise similarities at once. This replaces a Python double loop that
# called scipy's cosine() once per pair and wrote each value through .iloc.
ratings_matrix = rating_crosstab.to_numpy(dtype=float)
column_norms = np.linalg.norm(ratings_matrix, axis=0)
# An all-zero column has norm 0; let the division yield NaN for it quietly.
with np.errstate(divide='ignore', invalid='ignore'):
    similarity = (ratings_matrix.T @ ratings_matrix) / np.outer(column_norms, column_norms)
# Guard against floating-point rounding pushing values slightly outside [-1, 1].
similarity = np.clip(similarity, -1.0, 1.0)
data_temp = pd.DataFrame(similarity,
                         index=rating_crosstab.columns,
                         columns=rating_crosstab.columns)

In [33]:
# Display the movie x movie similarity matrix now that it has been filled in.
data_temp

movie title,187 (1997),8 Heads in a Duffel Bag (1997),Afterglow (1997),Air Force One (1997),Alphaville (1965),American Strays (1996),Amityville 3-D (1983),"Amityville Horror, The (1979)",Amityville: A New Generation (1993),Amityville: Dollhouse (1996),...,Wend Kuuni (God's Gift) (1982),When We Were Kings (1996),White Man's Burden (1995),White Squall (1996),Wild America (1997),Wild Bill (1995),"Wild Bunch, The (1969)",Wishmaster (1997),With Honors (1994),Young Guns (1988)
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
187 (1997),1,0.110227,0.170534,0.221464,0.007673,0.01525,0.064299,0.082187,0.086266,0.111369,...,0.0,0.159678,0.105,0.09928,0.149113,0.0,0.076696,0.251716,0.019587,0.067786
8 Heads in a Duffel Bag (1997),0.110227,1,0.0,0.051169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.061698,0.0,0.0,0.189311,0.187044,0.08454,0.117738,0.0,0.06836
Afterglow (1997),0.170534,0.0,1,0.108014,0.0,0.0,0.072169,0.010852,0.096825,0.125,...,0.0,0.085715,0.0,0.0,0.0,0.070868,0.056054,0.0,0.029312,0.038851
Air Force One (1997),0.221464,0.051169,0.108014,1,0.020354,0.060679,0.017056,0.1058,0.022883,0.029542,...,0.0,0.108194,0.083558,0.207198,0.091823,0.07537,0.090369,0.173078,0.103337,0.195116
Alphaville (1965),0.007673,0.0,0.0,0.020354,1,0.0,0.0,0.027914,0.0,0.0,...,0.0,0.0,0.0,0.021677,0.0,0.0,0.082389,0.005464,0.0,0.007138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wild Bill (1995),0.0,0.187044,0.070868,0.07537,0.0,0.0,0.181848,0.082037,0.048795,0.062994,...,0.109109,0.011781,0.237566,0.082582,0.0,1,0.306701,0.022481,0.064012,0.215369
"Wild Bunch, The (1969)",0.076696,0.08454,0.056054,0.090369,0.082389,0.0,0.073973,0.057473,0.049622,0.064062,...,0.0,0.122467,0.196295,0.044791,0.114365,0.306701,1,0.055885,0.023368,0.139377
Wishmaster (1997),0.251716,0.117738,0.0,0.173078,0.005464,0.021719,0.0,0.048197,0.0,0.0,...,0.0,0.064268,0.149539,0.045745,0.159274,0.022481,0.055885,1,0.037194,0.006162
With Honors (1994),0.019587,0.0,0.029312,0.103337,0.0,0.0,0.015043,0.104058,0.020182,0.026055,...,0.045129,0.0,0.055272,0.243202,0.044853,0.064012,0.023368,0.037194,1,0.383316


#### 5.获取相似度系数最大的前10部电影

In [36]:
# Empty table with one row per movie and 11 columns (0..10) that will hold the
# titles of the most similar movies.
data_neighbours = pd.DataFrame(
    columns=range(11),
    index=data_temp.columns,
)
data_neighbours.head(5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
187 (1997),,,,,,,,,,,
8 Heads in a Duffel Bag (1997),,,,,,,,,,,
Afterglow (1997),,,,,,,,,,,
Air Force One (1997),,,,,,,,,,,
Alphaville (1965),,,,,,,,,,,


In [41]:
# For every movie, store the titles of its 11 highest-similarity entries.
# Column 0 is normally the movie itself (similarity 1), so columns 1..10 are
# the ten nearest neighbours.
for pos in range(data_temp.shape[1]):
    ranked = data_temp.iloc[:, pos].sort_values(ascending=False)
    data_neighbours.iloc[pos, :11] = ranked.index[:11]

In [42]:
# Display the neighbour table: each row lists a movie followed by its most similar titles.
data_neighbours

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
187 (1997),187 (1997),Naked in New York (1994),G.I. Jane (1997),"Smile Like Yours, A (1997)",Wishmaster (1997),Traveller (1997),Scream (1996),Deep Rising (1998),One Night Stand (1997),Murder at 1600 (1997),Hard Rain (1998)
8 Heads in a Duffel Bag (1997),8 Heads in a Duffel Bag (1997),Nothing Personal (1995),"Sixth Man, The (1997)",Nowhere (1997),Life with Mikey (1993),Gone Fishin' (1997),Love and Other Catastrophes (1996),"Endless Summer 2, The (1994)",Wild America (1997),Wild Bill (1995),"Bye Bye, Love (1995)"
Afterglow (1997),Afterglow (1997),Broken English (1996),Kundun (1997),Love and Death on Long Island (1997),Hurricane Streets (1998),"Apostle, The (1997)",Love and Other Catastrophes (1996),Boys Life (1995),To Live (Huozhe) (1994),"Big One, The (1997)","Butcher Boy, The (1998)"
Air Force One (1997),Air Force One (1997),"Saint, The (1997)",Scream (1996),Murder at 1600 (1997),G.I. Jane (1997),"Peacemaker, The (1997)",Independence Day (ID4) (1996),Tomorrow Never Dies (1997),Men in Black (1997),"Time to Kill, A (1996)",George of the Jungle (1997)
Alphaville (1965),Alphaville (1965),Cyclo (1995),SubUrbia (1997),Nelly & Monsieur Arnaud (1995),Drunks (1995),"Die xue shuang xiong (Killer, The) (1989)",Manhattan (1979),"Seventh Seal, The (Sjunde inseglet, Det) (1957)",My Life as a Dog (Mitt liv som hund) (1985),"Ciao, Professore! (1993)",Rear Window (1954)
...,...,...,...,...,...,...,...,...,...,...,...
Wild Bill (1995),Wild Bill (1995),Amos & Andrew (1993),"Mostro, Il (1994)",I Like It Like That (1994),"Hour of the Pig, The (1993)","Wild Bunch, The (1969)",Dead Presidents (1995),Anna (1996),Body Snatchers (1993),SubUrbia (1997),Angus (1995)
"Wild Bunch, The (1969)","Wild Bunch, The (1969)","Deer Hunter, The (1978)","Third Man, The (1949)","Blue Angel, The (Blaue Engel, Der) (1930)",Vertigo (1958),"Old Man and the Sea, The (1958)",Apocalypse Now (1979),"Godfather: Part II, The (1974)",Wild Bill (1995),Rear Window (1954),Unforgiven (1992)
Wishmaster (1997),Wishmaster (1997),Deep Rising (1998),Leave It to Beaver (1997),"Smile Like Yours, A (1997)",187 (1997),G.I. Jane (1997),George of the Jungle (1997),Nothing Personal (1995),Sphere (1998),Flipper (1996),Murder at 1600 (1997)
With Honors (1994),With Honors (1994),Powder (1995),Robin Hood: Prince of Thieves (1991),Young Guns (1988),Bedknobs and Broomsticks (1971),Operation Dumbo Drop (1995),Top Gun (1986),Pump Up the Volume (1990),Tombstone (1993),It Could Happen to You (1994),Threesome (1994)


### 二、基于用户的协同过滤推荐
思想：    
背景：用户1没有看过电影A，需要估计用户1对电影A的喜欢度。    
在与电影A最相似的10部电影中，假定用户1都看过，且评分都比较高，那么可以推测用户1也比较喜欢电影A。    

对于用户1已经看过的电影，将喜欢度设置为0；喜欢度为0的电影不会被推荐，因为用户已经看过了。

#### 创建临时数据表data_sims,用于存放用户-电影的相似度

In [46]:
# Empty user x movie table (all NaN) that will hold the predicted preference
# scores; it has the same row and column labels as rating_crosstab.
data_sims = pd.DataFrame(
    columns=rating_crosstab.columns,
    index=rating_crosstab.index,
)
data_sims

movie title,187 (1997),8 Heads in a Duffel Bag (1997),Afterglow (1997),Air Force One (1997),Alphaville (1965),American Strays (1996),Amityville 3-D (1983),"Amityville Horror, The (1979)",Amityville: A New Generation (1993),Amityville: Dollhouse (1996),...,Wend Kuuni (God's Gift) (1982),When We Were Kings (1996),White Man's Burden (1995),White Squall (1996),Wild America (1997),Wild Bill (1995),"Wild Bunch, The (1969)",Wishmaster (1997),With Honors (1994),Young Guns (1988)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


#### 用相似度评分填充data_sims

In [50]:
# Similarity-weighted scoring function
def getScore(history,similarities):
    """Return the similarity-weighted average of a user's ratings.

    Parameters
    ----------
    history : ratings the user gave to the neighbouring items.
    similarities : similarity weights of those same items; must support
        element-wise multiplication with ``history``.

    Returns
    -------
    ``sum(history * similarities) / sum(similarities)``, or ``0.0`` when the
    weights sum to zero (nothing to base a prediction on), instead of raising
    ZeroDivisionError or producing NaN.
    """
    total_similarity = sum(similarities)
    if total_similarity == 0:
        return 0.0
    return sum(history*similarities)/total_similarity

In [53]:
# Fill data_sims with a predicted preference score for every (user, movie) pair.
# Movies the user has already rated get a score of 0, because there is no point
# in recommending them again.
#
# Cells are read and written with single-step .iloc[i, j] indexing. The chained
# form .iloc[i][j] = value assigns into an intermediate row object and is not
# guaranteed to update the frame (it never does under pandas Copy-on-Write).
for j in range(0,len(data_sims.columns)):
    product=data_sims.columns[j]
    # The neighbours of a movie do not depend on the user, so look them up once
    # per movie instead of once per (user, movie) pair.
    product_top_names=data_neighbours.loc[product].iloc[1:11]# the 10 most similar movies
    product_top_sims=data_temp.loc[product].sort_values(ascending=False).iloc[1:11]# their similarity coefficients

    for i in range(0,len(data_sims.index)):
        user=data_sims.index[i]

        if rating_crosstab.iloc[i,j]>=1:# already rated by this user: score 0 so it is not recommended
            data_sims.iloc[i,j]=0
        else:
            user_purchases=rating_crosstab.loc[user,product_top_names]# the user's ratings of the top-10 movies

            data_sims.iloc[i,j]=getScore(user_purchases,product_top_sims)

In [54]:
# Display the user x movie score table produced by the scoring loop.
data_sims

movie title,187 (1997),8 Heads in a Duffel Bag (1997),Afterglow (1997),Air Force One (1997),Alphaville (1965),American Strays (1996),Amityville 3-D (1983),"Amityville Horror, The (1979)",Amityville: A New Generation (1993),Amityville: Dollhouse (1996),...,Wend Kuuni (God's Gift) (1982),When We Were Kings (1996),White Man's Burden (1995),White Squall (1996),Wild America (1997),Wild Bill (1995),"Wild Bunch, The (1969)",Wishmaster (1997),With Honors (1994),Young Guns (1988)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.823331,0.0,0.0,0.0,0.166878,0.0,0.0,...,0.0,1.778305,0.236309,1.344937,0.0,0.0,1.078054,0.174446,0.989953,0
2,0.286723,0.0,0.0,0,0.0,0.177503,0.0,0.0,0.0,0.0,...,0.0,2.378365,0.0,0.774454,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,0.360768,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.252474,0.0,0.0,1.027029,0.0,0.0
4,0.382298,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.911958,0.0,0.0,0,0,0,0.23691,...,0.238722,0.402448,0.390927,0.462325,0.258448,0.0,0.0,0.096431,0.215114,1.527879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.563132,0.0,0.0,1.45318,0.0,0.495527,0.0,0.0,0.0,0.0,...,0.0,0.402448,0.0,2.319468,0.0,0.0,0.0,0.494375,0.0,0.0
940,0.0,0.0,0.0,0,0.220621,0.0,0.0,0.0,0.0,0.0,...,0.0,0.320154,0.0,0.0,0.0,0.0,0.423457,0.470259,0.292249,0.622888
941,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.855223,0.0,0.766027,0.0,0.0,0.0,0.0,0.0,0.0
942,0.278374,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.857319,0.0,0.574609,0.28021,0.0,0.399646,0.619352,0.0,0.0
