In [1]:
import pandas as pd

filename = '/Users/juby/Desktop/ml-100k/u.data'
data = pd.read_csv(filename, delimiter='\t', header=None,
                         names=['UserID', 'MovieID', 'Rating', 'Datetime'])
data['Datetime'] = pd.to_datetime(data['Datetime'], unit='s')
data.head()

Unnamed: 0,UserID,MovieID,Rating,Datetime
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [2]:
# 确定用户是否喜欢某部电影
data['Favorable'] = data['Rating'] > 3
data[10:15]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
10,62,257,2,1997-11-12 22:07:14,False
11,286,1014,5,1997-11-17 15:38:45,True
12,200,222,5,1997-10-05 09:05:40,True
13,210,40,3,1998-03-27 21:59:54,False
14,224,29,3,1998-02-21 23:40:57,False


In [3]:
# 选取前200名用户的打分数据
ratings = data[data['UserID'].isin(range(200))]
# 用户喜欢某部电影的数据集
favorable_ratings = ratings[ratings['Favorable']]
favorable_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
16,122,387,5,1997-11-11 17:47:39,True
20,119,392,4,1998-01-30 16:13:34,True
21,167,486,4,1998-04-16 14:54:12,True
26,38,95,5,1998-04-13 01:14:54,True
28,63,277,4,1997-10-01 23:10:01,True


In [4]:
# 每个用户喜欢的电影
favorable_reviews_by_users = dict((k, frozenset(v.values)) 
                                  for k, v in favorable_ratings.groupby('UserID')['MovieID'])
len(favorable_reviews_by_users)

199

In [5]:
# 每部电影的影迷数量
num_favorable_by_movies = ratings[['MovieID', 'Favorable']].groupby('MovieID').sum()
num_favorable_by_movies.sort_values('Favorable', ascending=False)[:5]

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
50,100.0
100,89.0
258,83.0
181,79.0
174,74.0


In [6]:
# 创建函数，接受新发现的频繁项集，创建超集，检测频繁程度
from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    # 遍历所有用户和他们的打分数据
    for user, reviews in favorable_reviews_by_users.items():
        # 遍历已找出的项集，判断是否为当前评分项集的子集
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                # 遍历用户打过分却没有出现项集里的电影，用它们生成超集，
                # 并更新该项集的计数
                for other_reviewed_movies in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movies,))
                    counts[current_superset] += 1
    # 返回符合频繁程度的频繁项集
    return dict([(itemset, frequency) for itemset, frequency in counts.items()
                if frequency >= min_support])

In [7]:
import sys
# 把发现的频繁项集保存到以项集长度为键的字典中
frequent_itemsets = {}
min_support = 50
# 为每部电影生成只包含自己的项集
frequent_itemsets[1] = dict((frozenset((movie_id,)), row['Favorable'])
                           for movie_id, row in num_favorable_by_movies.iterrows()
                           if row['Favorable'] > min_support)
print('There are %d movies with more than %d favorable reviews:' % (len(frequent_itemsets[1]),
                                                                    min_support))
sys.stdout.flush()
# 创建循环，运行Apriori算法，存储新发现的项集
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users,
                                                   frequent_itemsets[k-1],
                                                   min_support)
    frequent_itemsets[k] = cur_frequent_itemsets
    if len(cur_frequent_itemsets) == 0:
        print('Did not find any frequent itemsets of length %d' % k)
        # 确保代码还在运行时，将缓冲区内容输出到终端
        sys.stdout.flush()
        break
    else:
        print('I found %d frequent itemsets of length %d' % (len(cur_frequent_itemsets), k))
        sys.stdout.flush()
del frequent_itemsets[1]

There are 16 movies with more than 50 favorable reviews:
I found 93 frequent itemsets of length 2
I found 295 frequent itemsets of length 3
I found 593 frequent itemsets of length 4
I found 785 frequent itemsets of length 5
I found 677 frequent itemsets of length 6
I found 373 frequent itemsets of length 7
I found 126 frequent itemsets of length 8
I found 24 frequent itemsets of length 9
I found 2 frequent itemsets of length 10
Did not find any frequent itemsets of length 11


In [8]:
print('Find a total of %d frequent itemsets' % (sum(len(itemsets)
                                                   for itemsets in frequent_itemsets.values())))

Find a total of 2968 frequent itemsets


In [9]:
print(frequent_itemsets[2].items())

dict_items([(frozenset({1, 7}), 62), (frozenset({1, 50}), 100), (frozenset({56, 1}), 64), (frozenset({64, 1}), 60), (frozenset({1, 79}), 62), (frozenset({1, 98}), 72), (frozenset({1, 100}), 84), (frozenset({1, 127}), 66), (frozenset({1, 172}), 60), (frozenset({1, 174}), 82), (frozenset({1, 181}), 76), (frozenset({9, 7}), 50), (frozenset({50, 7}), 94), (frozenset({56, 7}), 78), (frozenset({64, 7}), 64), (frozenset({79, 7}), 68), (frozenset({98, 7}), 74), (frozenset({100, 7}), 94), (frozenset({127, 7}), 58), (frozenset({172, 7}), 70), (frozenset({174, 7}), 78), (frozenset({181, 7}), 72), (frozenset({258, 7}), 68), (frozenset({9, 50}), 70), (frozenset({56, 9}), 60), (frozenset({64, 9}), 58), (frozenset({9, 98}), 58), (frozenset({9, 100}), 78), (frozenset({9, 127}), 66), (frozenset({9, 174}), 54), (frozenset({9, 181}), 52), (frozenset({56, 50}), 92), (frozenset({64, 50}), 82), (frozenset({50, 79}), 84), (frozenset({50, 98}), 96), (frozenset({50, 100}), 110), (frozenset({50, 127}), 102), (f

In [10]:
# 从频繁项集中抽取关联规则
# 关联规则由前提和结论构成

#遍历不同长度的频繁项集，为每个项集生成规则
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        # 遍历每一项电影，将其作为结论
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise,conclusion))
print('There are %d candidate rules' % len(candidate_rules))
print(candidate_rules[:5])

There are 15285 candidate rules
[(frozenset({7}), 1), (frozenset({1}), 7), (frozenset({50}), 1), (frozenset({1}), 50), (frozenset({1}), 56)]


In [11]:
# 计算置信度
# 存储规则应验和规则无效次数的字典
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
# 遍历所有用户及其喜欢的电影
for user, reviews in favorable_reviews_by_users.items():
    # 遍历每条关联规则
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        # 测试每条规则的前提是否对用户适用
        if premise.issubset(reviews):
            # 测试用户是否喜欢结论中的电影
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
# 置信度
rule_confidence = {candidate_rule: correct_counts[candidate_rule]
                  / float(correct_counts[candidate_rule] +
                         incorrect_counts[candidate_rule])
                  for candidate_rule in candidate_rules}

In [12]:
min_confidence = 0.9
rule_confidence = {rule: confidence for rule, confidence in rule_confidence.items()
                  if confidence > min_confidence}
len(rule_confidence)

5152

In [13]:
# 对置信度字典进行排序
# 输出置信度最高的前5条规则
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5):
    print('Rule #%d' % (index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print('Rule: If a person recommends %s they will also recommend %d'
         % (premise, conclusion))
    print(' - Confidence: %.3f' % rule_confidence[(premise, conclusion)])
    print('')

Rule #1
Rule: If a person recommends frozenset({98, 181}) they will also recommend 50
 - Confidence: 1.000

Rule #2
Rule: If a person recommends frozenset({172, 79}) they will also recommend 174
 - Confidence: 1.000

Rule #3
Rule: If a person recommends frozenset({258, 172}) they will also recommend 174
 - Confidence: 1.000

Rule #4
Rule: If a person recommends frozenset({1, 181, 7}) they will also recommend 50
 - Confidence: 1.000

Rule #5
Rule: If a person recommends frozenset({1, 172, 7}) they will also recommend 174
 - Confidence: 1.000



In [14]:
# 读取电影名称信息文件
filename2 = '/Users/juby/Desktop/ml-100k/u.item'
moviename_data = pd.read_csv(filename2, delimiter='|', header=None, encoding='mac-roman')
moviename_data.columns = ['MovieID', 'Title', 'Release Date', 'Video Release', 'IMDB',
                         '<UNK>', 'Action', 'Adventure','Animation', 'Children\'s', 'Comedy',
                         'Crime', "Documentary", "Drama", "Fantasy", "Film-Noir",
                         'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
                         'War', 'Western']

In [15]:
# 创建函数，用电影编号获取电影名称
def get_movie_name(movie_id):
    title_object = moviename_data[moviename_data['MovieID'] == movie_id]['Title']
    # 获取电影名称，而不是Series对象
    title = title_object.values[0]
    return title
get_movie_name(4)

'Get Shorty (1995)'

In [16]:
for index in range(5):
    print('Rule #%d' % (index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ', '.join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print('Rule: If a person recommends %s they will also recommend %s'
         % (premise_names, conclusion_name))
    print(' - Confidence: %.3f' % rule_confidence[(premise, conclusion)])
    print('')

Rule #1
Rule: If a person recommends Silence of the Lambs, The (1991), Return of the Jedi (1983) they will also recommend Star Wars (1977)
 - Confidence: 1.000

Rule #2
Rule: If a person recommends Empire Strikes Back, The (1980), Fugitive, The (1993) they will also recommend Raiders of the Lost Ark (1981)
 - Confidence: 1.000

Rule #3
Rule: If a person recommends Contact (1997), Empire Strikes Back, The (1980) they will also recommend Raiders of the Lost Ark (1981)
 - Confidence: 1.000

Rule #4
Rule: If a person recommends Toy Story (1995), Return of the Jedi (1983), Twelve Monkeys (1995) they will also recommend Star Wars (1977)
 - Confidence: 1.000

Rule #5
Rule: If a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Raiders of the Lost Ark (1981)
 - Confidence: 1.000



In [17]:
# 在测试集中
# 获取每一位用户最喜欢的电影
test_dataset = data[~data['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset['Favorable']]
test_favorable_by_users = dict((k, frozenset(v.values))
                               for k, v in test_favorable.groupby('UserID')['MovieID'])

In [24]:
# 使用测试数据计算规则应验数量
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
# 计算所有应验规则的置信度
test_confidence = {candidate_rule: correct_counts[candidate_rule] /
                   float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                   for candidate_rule in rule_confidence}
print(len(test_confidence))

5152


In [25]:
sorted_test_confidence = sorted(test_confidence.items(), key=itemgetter(1), reverse=True)
sorted_test_confidence[:5]

[((frozenset({1, 7, 50, 64, 79}), 174), 1.0),
 ((frozenset({1, 7, 64, 79, 98}), 174), 1.0),
 ((frozenset({1, 7, 64, 79, 172}), 174), 1.0),
 ((frozenset({1, 7, 64, 79, 181}), 174), 1.0),
 ((frozenset({1, 56, 64, 79, 172}), 174), 1.0)]

In [27]:
for index in range(10):
    print('Rule #%d' % (index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_name = ', '.join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print('Rule: If a person recommends %s they will also recommend %s'
         % (premise_names, conclusion_name))
    print(' - Train Confidence: %.3f' % rule_confidence.get((premise, conclusion), -1))
    print(' - Test Confidence: %.3f' % test_confidence.get((premise, conclusion), -1))
    print('')

Rule #1
Rule: If a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Star Wars (1977)
 - Train Confidence: 1.000
 - Test Confidence: 0.936

Rule #2
Rule: If a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Raiders of the Lost Ark (1981)
 - Train Confidence: 1.000
 - Test Confidence: 0.876

Rule #3
Rule: If a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Raiders of the Lost Ark (1981)
 - Train Confidence: 1.000
 - Test Confidence: 0.841

Rule #4
Rule: If a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Star Wars (1977)
 - Train Confidence: 1.000
 - Test Confidence: 0.932

Rule #5
Rule: If a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Raiders of th