In [1]:
import numpy as np
import pandas as pd
from pathlib import Path


In [2]:
file_path = 'gephi_edges.csv'

In [3]:
all_ratings = pd.read_csv(file_path, header=0)
all_ratings.head()
all_ratings = all_ratings.dropna(axis=0)

In [4]:
all_ratings['Favorable'] = ~all_ratings['target'].isnull()
#print(all_ratings['source'].max())
ratings = all_ratings[all_ratings['source'].isin(range(396))]

In [5]:
favorable_ratings = all_ratings[all_ratings['Favorable']]

In [6]:
favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby('source')['target'])

In [7]:
num_favorable_by_story = all_ratings[['target', 'Favorable']].groupby('target').sum()

In [8]:
num_favorable_by_story.sort_values('Favorable', ascending=False).head()

Unnamed: 0_level_0,Favorable
target,Unnamed: 1_level_1
4.0,176
34.0,123
21.0,123
78.0,116
84.0,116


In [9]:
frequent_itemsets = dict()
min_support = 50
frequent_itemsets[1] = dict((frozenset((target,)), row['Favorable']) for target, row in num_favorable_by_story.iterrows() if row['Favorable'] > min_support)

In [10]:
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [11]:
import sys
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1], min_support)
    frequent_itemsets[k] = cur_frequent_itemsets
    if len(cur_frequent_itemsets) == 0:
        print(f"Did not find any frequent itemsets of length {k}.")
        sys.stdout.flush()
        break
    else:
        print(f"I found {len(cur_frequent_itemsets)} frequent itemsets of length {k}.")
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets

I found 352 frequent itemsets of length 2.
I found 1720 frequent itemsets of length 3.
I found 4555 frequent itemsets of length 4.
I found 7153 frequent itemsets of length 5.
I found 6508 frequent itemsets of length 6.
I found 3347 frequent itemsets of length 7.
I found 922 frequent itemsets of length 8.
I found 115 frequent itemsets of length 9.
I found 4 frequent itemsets of length 10.
Did not find any frequent itemsets of length 11.


In [12]:
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))

In [13]:
correct_counts = defaultdict()
incorrect_counts =  defaultdict()

correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [14]:
rule_confidence = {candidate_rule: correct_counts[candidate_rule]/float
(correct_counts[candidate_rule] + incorrect_counts[candidate_rule]) for 
candidate_rule in candidate_rules} 

In [15]:
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), 
reverse=True)

for index in range(5):
    print(f"Rule #{index+1}")
    (premise, conclusion) = sorted_confidence[index][0]
    print(f"Rule: If a person recommends {premise} they will also recommend {conclusion}.")
    print(f"- Confidence: {rule_confidence[(premise, conclusion)]}")
    print("")

Rule #1
Rule: If a person recommends frozenset({102.0}) they will also recommend 4.0.
- Confidence: 1.0

Rule #2
Rule: If a person recommends frozenset({115.0, 78.0}) they will also recommend 27.0.
- Confidence: 1.0

Rule #3
Rule: If a person recommends frozenset({27.0, 94.0}) they will also recommend 115.0.
- Confidence: 1.0

Rule #4
Rule: If a person recommends frozenset({34.0, 115.0}) they will also recommend 27.0.
- Confidence: 1.0

Rule #5
Rule: If a person recommends frozenset({74.0, 108.0}) they will also recommend 101.0.
- Confidence: 1.0



In [16]:
from all_names import all_author_and_student_names

In [17]:
names = all_author_and_student_names
type(names)


dict

In [18]:
#names_df = pd.DataFrame.from_dict(names, orient='index', columns=['id', 'text'])
#names_df = pd.DataFrame.from_dict(names, orient="index")
#names_df = pd.DataFrame.from_records(names, columns=['id', 'text'])
#names_df.head()
#print(names_df.tail())

In [22]:
def get_story_name(story_number):
    story_title = names[story_number]
    return(' '.join(story_title))

In [30]:
test_title = get_story_name(114)
print(test_title)

Steve Duffy Duffy In the Lion's Den


In [32]:
for index in range(100):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_story_name(idx) for idx in premise)
    conclusion_names = get_story_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_names))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: If a person recommends Jeff VanderMeer VanderMeer The Cage they will also recommend Jeff VanderMeer Ann VanderMeer Introduction
 - Confidence: 1.000

Rule #2
Rule: If a person recommends Stephen Graham Jones Graham Jones Little Lambs, Elizabeth Hand Elizabeth Hand The Boy in the Tree they will also recommend Charles Lamb Lamb
 - Confidence: 1.000

Rule #3
Rule: If a person recommends Charles Lamb Lamb, Stepan Chapman Stepan Chapman The Stiff and the Stile they will also recommend Stephen Graham Jones Graham Jones Little Lambs
 - Confidence: 1.000

Rule #4
Rule: If a person recommends Leonora Carrington Leonora Carrington White Rabbits, Stephen Graham Jones Graham Jones Little Lambs they will also recommend Charles Lamb Lamb
 - Confidence: 1.000

Rule #5
Rule: If a person recommends Garry Kilworth Garry Kilworth Hogfoot Right and Bird-hands, Margo Lanagan Margo Lanagan Singing My Sister Down they will also recommend Neil Gaiman Neil Gaiman Feeders and Eaters
 - Confidence: