# 电影推荐

### “u.data”文件里记录了943位用户对1600多部电影的评分情况。
#### 我们首先得把每个用户看过的电影整理成一个列表，这样就能清楚地知道每位用户都看过哪些电影。
#### 接着，我们从这943个用户的观影列表里，去挖掘出电影之间的关联规则，看看哪些电影之间存在比较明显的关联，即经常一起被观看。
#### 挖掘结果可用于电影推荐。

In [1]:
import pandas as pd

from efficient_apriori import apriori                            #导入apriori

# 1. Load the ratings data.
# The raw MovieLens 100K file 'u.data' is tab-separated
# (user ID, movie ID, rating, timestamp) and could be read directly with
# pd.read_csv using sep='\t'.
# 'u.csv' is the same data re-saved as comma-separated text via Excel.
# usecols pins the three names to the first three fields: if the file still
# carries the fourth (timestamp) field, it has more fields than names and
# pandas would silently use the first field (the user ID) as the index,
# shifting every column name by one.
data = pd.read_csv(
    'u.csv',
    sep=',',
    header=None,
    names=['user_id', 'item_id', 'rank'],  # 'rank' holds the rating value
    usecols=[0, 1, 2],
)

In [2]:
# Number of rating records that were loaded.
len(data)

100000

In [3]:
# 2. Preprocessing
# Build one "transaction" per user: the list of movie IDs that user rated.
# Iterating the grouped 'item_id' column yields (user_id, Series) pairs; each
# Series is turned into a plain list, so the result is an ordinary Python
# list whose elements are the per-user item_id lists.
rated_items_by_user = data.groupby('user_id')['item_id']
transactions = [list(item_ids) for _, item_ids in rated_items_by_user]

# Number of transactions, i.e. number of distinct users.
len(transactions)

943

In [8]:
# Run Apriori over the per-user transactions, keeping frequent itemsets with
# support >= 0.3 and association rules with confidence >= 0.7.
# item_sets1 receives the frequent itemsets, rules1 the rules.
item_sets1, rules1 = apriori(
    transactions,
    min_support=0.3,
    min_confidence=0.7,
)



In [19]:
# Show the frequent itemsets: a dict keyed by itemset size, each value mapping
# an item tuple to the number of transactions that contain it.
item_sets1

{1: {(117,): 378,
  (222,): 365,
  (64,): 283,
  (121,): 429,
  (98,): 390,
  (174,): 420,
  (56,): 394,
  (96,): 295,
  (258,): 509,
  (151,): 326,
  (210,): 331,
  (183,): 291,
  (118,): 293,
  (100,): 508,
  (69,): 321,
  (9,): 299,
  (22,): 297,
  (176,): 284,
  (269,): 315,
  (1,): 452,
  (173,): 324,
  (181,): 507,
  (257,): 303,
  (237,): 384,
  (50,): 583,
  (127,): 413,
  (79,): 336,
  (25,): 293,
  (195,): 301,
  (168,): 316,
  (7,): 392,
  (216,): 290,
  (204,): 350,
  (15,): 293,
  (172,): 367,
  (313,): 350,
  (276,): 298,
  (294,): 485,
  (300,): 431,
  (288,): 478,
  (286,): 481,
  (302,): 297,
  (328,): 295,
  (318,): 298,
  (423,): 300,
  (405,): 344,
  (748,): 316},
 2: {(1, 50): 381,
  (1, 100): 325,
  (1, 117): 289,
  (1, 121): 314,
  (1, 181): 340,
  (7, 50): 324,
  (7, 100): 320,
  (7, 181): 295,
  (50, 56): 330,
  (50, 69): 284,
  (50, 79): 297,
  (50, 98): 335,
  (50, 100): 394,
  (50, 117): 312,
  (50, 121): 362,
  (50, 127): 357,
  (50, 172): 345,
  (50, 173):

In [20]:
# Print every mined rule; a rule's text form already carries its confidence,
# support, lift and conviction.
for mined_rule in rules1:
    print(f"Rule: {mined_rule}")



Rule: {1} -> {50} (conf: 0.843, supp: 0.404, lift: 1.363, conv: 2.430)
Rule: {1} -> {100} (conf: 0.719, supp: 0.345, lift: 1.335, conv: 1.642)
Rule: {117} -> {1} (conf: 0.765, supp: 0.306, lift: 1.595, conv: 2.211)
Rule: {121} -> {1} (conf: 0.732, supp: 0.333, lift: 1.527, conv: 1.942)
Rule: {1} -> {181} (conf: 0.752, supp: 0.361, lift: 1.399, conv: 1.866)
Rule: {7} -> {50} (conf: 0.827, supp: 0.344, lift: 1.337, conv: 2.201)
Rule: {7} -> {100} (conf: 0.816, supp: 0.339, lift: 1.515, conv: 2.511)
Rule: {7} -> {181} (conf: 0.753, supp: 0.313, lift: 1.400, conv: 1.868)
Rule: {56} -> {50} (conf: 0.838, supp: 0.350, lift: 1.355, conv: 2.350)
Rule: {69} -> {50} (conf: 0.885, supp: 0.301, lift: 1.431, conv: 3.312)
Rule: {79} -> {50} (conf: 0.884, supp: 0.315, lift: 1.430, conv: 3.289)
Rule: {98} -> {50} (conf: 0.859, supp: 0.355, lift: 1.389, conv: 2.707)
Rule: {100} -> {50} (conf: 0.776, supp: 0.418, lift: 1.255, conv: 1.701)
Rule: {117} -> {50} (conf: 0.825, supp: 0.331, lift: 1.335, conv:

In [11]:
# Lift of the second rule (index 1) in rules1.
rules1[1].lift

1.334728416138248

In [12]:
# Minimum lift a rule must exceed to be kept.
lift_threshold = 2.3

# Keep only the rules whose lift is strictly greater than the threshold.
filtered_rules = [rule for rule in rules1 if rule.lift > lift_threshold]

# Print the surviving rules together with their unrounded lift values.
for rule in filtered_rules:
    print(f"Rule: {rule} with lift: {rule.lift}")



Rule: {174, 181} -> {50, 172} (conf: 0.860, supp: 0.312, lift: 2.350, conv: 4.518) with lift: 2.349707602339181
Rule: {172, 181} -> {50, 174} (conf: 0.927, supp: 0.312, lift: 2.302, conv: 8.229) with lift: 2.3015274780009958
Rule: {50, 174} -> {172, 181} (conf: 0.774, supp: 0.312, lift: 2.302, conv: 2.933) with lift: 2.3015274780009958
Rule: {50, 172} -> {174, 181} (conf: 0.852, supp: 0.312, lift: 2.350, conv: 4.311) with lift: 2.349707602339181


In [21]:
# A second way to filter by lift: the built-in filter() with a predicate
# that keeps rules whose lift is strictly above the threshold.
lift_threshold = 2.3
rules = list(filter(lambda rule: rule.lift > lift_threshold, rules1))

In [22]:
# Consequent (right-hand side, .rhs) of the first filtered rule.
# The antecedent (left-hand side) is available as rules[0].lhs.
rules[0].rhs

(50, 172)

## 三、根据u.item文件中的电影ID与电影名称，将挖掘出的关联规则中电影ID替换成对应的电影名

In [13]:
import pandas as pd

# Load the movie catalogue (a CSV export of u.item); only the first two
# fields are read and named movie_id and movie_title.
# NOTE(review): some titles in the printed mapping (e.g. 'Usual Suspects')
# lack their year — presumably a comma inside the title was split into a
# separate column during the CSV conversion; verify against the raw u.item.
u_item = pd.read_csv(
    'u_item.csv',
    header=None,
    usecols=[0, 1],
    names=['movie_id', 'movie_title'],
    encoding='latin-1',
)

In [14]:
# Preview the first rows of the movie table.
u_item.head()

Unnamed: 0,movie_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [15]:
# Lookup table from movie_id to movie_title, built by pairing the two columns
# row by row.
movie_id_to_title = dict(zip(u_item['movie_id'], u_item['movie_title']))
# Display the mapping.
movie_id_to_title

{1: 'Toy Story (1995)',
 2: 'GoldenEye (1995)',
 3: 'Four Rooms (1995)',
 4: 'Get Shorty (1995)',
 5: 'Copycat (1995)',
 6: 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 7: 'Twelve Monkeys (1995)',
 8: 'Babe (1995)',
 9: 'Dead Man Walking (1995)',
 10: 'Richard III (1995)',
 11: 'Seven (Se7en) (1995)',
 12: 'Usual Suspects',
 13: 'Mighty Aphrodite (1995)',
 14: 'Postino',
 15: "Mr. Holland's Opus (1995)",
 16: 'French Twist (Gazon maudit) (1995)',
 17: 'From Dusk Till Dawn (1996)',
 18: 'White Balloon',
 19: "Antonia's Line (1995)",
 20: 'Angels and Insects (1995)',
 21: 'Muppet Treasure Island (1996)',
 22: 'Braveheart (1995)',
 23: 'Taxi Driver (1976)',
 24: 'Rumble in the Bronx (1995)',
 25: 'Birdcage',
 26: 'Brothers McMullen',
 27: 'Bad Boys (1995)',
 28: 'Apollo 13 (1995)',
 29: 'Batman Forever (1995)',
 30: 'Belle de jour (1967)',
 31: 'Crimson Tide (1995)',
 32: 'Crumb (1994)',
 33: 'Desperado (1995)',
 34: 'Doom Generation',
 35: 'Free Willy 2: The Adventure Home (1

In [16]:
# Replace the movie IDs in a rule with movie titles.
def replace_ids_with_titles(rule):
    """Return ``(lhs_titles, rhs_titles)`` for *rule*.

    Every ID in ``rule.lhs`` and ``rule.rhs`` is looked up in the
    ``movie_id_to_title`` mapping; an ID with no entry is kept unchanged.
    Both parts are returned as tuples, in the rule's own item order.
    """
    def to_titles(movie_ids):
        # Fall back to the raw ID when the catalogue has no title for it.
        return tuple(movie_id_to_title.get(mid, mid) for mid in movie_ids)

    return (to_titles(rule.lhs), to_titles(rule.rhs))

# Convert every lift-filtered rule into its (lhs titles, rhs titles) pair.
rules_with_titles = list(map(replace_ids_with_titles, filtered_rules))

In [17]:
# The rules after replacement, expressed with movie titles instead of IDs.
rules_with_titles

[(('Raiders of the Lost Ark (1981)', 'Return of the Jedi (1983)'),
  ('Star Wars (1977)', 'Empire Strikes Back')),
 (('Empire Strikes Back', 'Return of the Jedi (1983)'),
  ('Star Wars (1977)', 'Raiders of the Lost Ark (1981)')),
 (('Star Wars (1977)', 'Raiders of the Lost Ark (1981)'),
  ('Empire Strikes Back', 'Return of the Jedi (1983)')),
 (('Star Wars (1977)', 'Empire Strikes Back'),
  ('Raiders of the Lost Ark (1981)', 'Return of the Jedi (1983)'))]

In [18]:
# Print each rule as "antecedent titles -> consequent titles".
for lhs_titles, rhs_titles in rules_with_titles:
    print(f"Rule: {lhs_titles} -> {rhs_titles}")

Rule: ('Raiders of the Lost Ark (1981)', 'Return of the Jedi (1983)') -> ('Star Wars (1977)', 'Empire Strikes Back')
Rule: ('Empire Strikes Back', 'Return of the Jedi (1983)') -> ('Star Wars (1977)', 'Raiders of the Lost Ark (1981)')
Rule: ('Star Wars (1977)', 'Raiders of the Lost Ark (1981)') -> ('Empire Strikes Back', 'Return of the Jedi (1983)')
Rule: ('Star Wars (1977)', 'Empire Strikes Back') -> ('Raiders of the Lost Ark (1981)', 'Return of the Jedi (1983)')


~~~
规则：（《夺宝奇兵》（1981）、《绝地归来》（1983））→（《星球大战》（1977）、《帝国反击战》）
规则：（《帝国反击战》、《绝地归来》（1983））→（《星球大战》（1977）、《夺宝奇兵》（1981））
规则：（《星球大战》（1977）、《夺宝奇兵》（1981））→（《帝国反击战》、《绝地归来》（1983））
规则：（《星球大战》（1977）、《帝国反击战》）→（《夺宝奇兵》（1981）、《绝地归来》（1983））