In [63]:
import pandas as pd
all_ratings = pd.read_csv('small_ratings.csv',header=None,skiprows = 1,
                          names = ["UserID", "MovieID", "Rating", "Datetime"])

In [64]:
all_ratings['Favorable'] = all_ratings['Rating']>3
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]
favorable_ratings = ratings[ratings['Favorable']]
favorable_reviews_by_users = dict((k,frozenset(v.values)) for k,v in 
                                 favorable_ratings.groupby('UserID')['MovieID'])
num_favorable_by_movie = ratings[['MovieID','Favorable']].groupby('MovieID').sum()

In [65]:
#Apriori算法
frequent_itemsets = {}
min_support = 50
frequent_itemsets[1] = dict((frozenset((movie_id,)),row['Favorable']) for 
                           movie_id,row in num_favorable_by_movie.iterrows()
                           if row['Favorable']>=min_support)

In [66]:
from collections import defaultdict
def find_frequent_itemset(favorable_reviews_by_users,k_1_itemsets,min_support):
    counts = defaultdict(int)
    for user,reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews-itemset:
                    current_superset = itemset|frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict((itemset,frequency) for itemset,frequency in counts.items() if frequency>=min_support)

In [67]:
import sys
for k in range(2,20):
    current_frequent_itemset = find_frequent_itemset(favorable_reviews_by_users,
                                                    frequent_itemsets[k-1],min_support)
    frequent_itemsets[k] = current_frequent_itemset
    if len(current_frequent_itemset)==0:
        print('Didn\'t find any frequent itemset of length {}'.format(k))
        sys.stdout.flush()
        break
    else:
        print('I find {0} frequent itemset of length {1}'.format(len(current_frequent_itemset),k))
        sys.stdout.flush()

I find 171 frequent itemset of length 2
I find 666 frequent itemset of length 3
I find 1464 frequent itemset of length 4
I find 1952 frequent itemset of length 5
I find 1645 frequent itemset of length 6
I find 880 frequent itemset of length 7
I find 282 frequent itemset of length 8
I find 47 frequent itemset of length 9
I find 3 frequent itemset of length 10
Didn't find any frequent itemset of length 11


In [68]:
del(frequent_itemsets[1])

In [69]:
frequent_itemsets

{2: {frozenset({47, 527}): 58,
  frozenset({47, 296}): 86,
  frozenset({47, 50}): 56,
  frozenset({47, 589}): 56,
  frozenset({50, 527}): 66,
  frozenset({50, 296}): 84,
  frozenset({110, 527}): 64,
  frozenset({110, 296}): 74,
  frozenset({110, 589}): 64,
  frozenset({110, 480}): 56,
  frozenset({296, 527}): 88,
  frozenset({296, 589}): 74,
  frozenset({296, 480}): 66,
  frozenset({480, 527}): 56,
  frozenset({480, 589}): 64,
  frozenset({527, 589}): 62,
  frozenset({110, 2858}): 50,
  frozenset({110, 318}): 68,
  frozenset({110, 356}): 78,
  frozenset({296, 2959}): 80,
  frozenset({296, 2858}): 86,
  frozenset({296, 318}): 106,
  frozenset({296, 356}): 98,
  frozenset({318, 2959}): 74,
  frozenset({318, 2858}): 70,
  frozenset({318, 356}): 106,
  frozenset({356, 2959}): 60,
  frozenset({356, 2858}): 68,
  frozenset({2858, 2959}): 76,
  frozenset({260, 1196}): 94,
  frozenset({260, 1198}): 76,
  frozenset({260, 1210}): 84,
  frozenset({260, 1270}): 66,
  frozenset({260, 296}): 84,
  f

In [70]:
candidate_rules = []
for itemset_length,itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset-set((conclusion,))
            candidate_rules.append((premise,frozenset((conclusion,))))

In [71]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user,reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise,conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion.issubset(reviews):
                correct_counts[candidate_rule]+=1
            else:
                incorrect_counts[candidate_rule]+=1
rule_confidence = {candidate_rule:correct_counts[candidate_rule]/
                   float(correct_counts[candidate_rule]+
                         incorrect_counts[candidate_rule]) for 
                  candidate_rule in candidate_rules}

In [72]:
from operator import itemgetter
sorted_rule_confidence = sorted(rule_confidence.items(),key=itemgetter(1),reverse=True)

In [73]:
for i in range(5):
    print('Rule #{}'.format(i+1))
    premise,conclusion = sorted_rule_confidence[i][0]
    print('Rule: If one recommand {0} he could also recommand {1}'.format
          (premise,conclusion))
    print('-Confidence: {}\n'.format(sorted_rule_confidence[i][1]))

Rule #1
Rule: If one recommand frozenset({1210, 589}) he could also recommand frozenset({1196})
-Confidence: 1.0

Rule #2
Rule: If one recommand frozenset({260, 47}) he could also recommand frozenset({296})
-Confidence: 1.0

Rule #3
Rule: If one recommand frozenset({50, 47, 527}) he could also recommand frozenset({296})
-Confidence: 1.0

Rule #4
Rule: If one recommand frozenset({296, 1210, 589}) he could also recommand frozenset({1196})
-Confidence: 1.0

Rule #5
Rule: If one recommand frozenset({296, 480, 1196}) he could also recommand frozenset({260})
-Confidence: 1.0



In [76]:
#评估算法
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset['Favorable']]
test_favorable_by_users = dict((k,frozenset(v.values)) for k,v in 
                              test_favorable.groupby('UserID')['MovieID'])

In [78]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user,reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise,conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion.issubset(reviews):
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule]+=1
test_confidence = {candidate_rule:correct_counts[candidate_rule]/
                  float(correct_counts[candidate_rule]+incorrect_counts[candidate_rule])
                  for candidate_rule in candidate_rules}

In [82]:
for i in range(5):
    premise,conclusion = sorted_rule_confidence[i][0]
    print('Rule #{}'.format(i+1))
    print('Rule: If one recommand {0} he could also recommand {1}'.format(premise,conclusion))
    print('-Train confidence: {}'.format(rule_confidence[(premise,conclusion)]))
    print('-Test confidence: {}\n'.format(test_confidence[(premise,conclusion)]))

Rule #1
Rule: If one recommand frozenset({1210, 589}) he could also recommand frozenset({1196})
-Train confidence: 1.0
-Test confidence: 0.9032258064516129

Rule #2
Rule: If one recommand frozenset({260, 47}) he could also recommand frozenset({296})
-Train confidence: 1.0
-Test confidence: 0.875

Rule #3
Rule: If one recommand frozenset({50, 47, 527}) he could also recommand frozenset({296})
-Train confidence: 1.0
-Test confidence: 0.8809523809523809

Rule #4
Rule: If one recommand frozenset({296, 1210, 589}) he could also recommand frozenset({1196})
-Train confidence: 1.0
-Test confidence: 0.9444444444444444

Rule #5
Rule: If one recommand frozenset({296, 480, 1196}) he could also recommand frozenset({260})
-Train confidence: 1.0
-Test confidence: 0.8918918918918919

