In [88]:
import pandas as pd
from pathlib import Path

In [89]:
file_path = 'gephi_edges.csv'

In [90]:
all_ratings = pd.read_csv(file_path, header=0)

In [91]:
all_ratings['Favorable'] = ~all_ratings['target'].isnull()
#print(all_ratings['source'].max())
ratings = all_ratings[all_ratings['source'].isin(range(396))]

In [92]:
favorable_ratings = all_ratings[all_ratings['Favorable']]

In [93]:
favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby('source')['target'])

In [94]:
num_favorable_by_story = all_ratings[['target', 'Favorable']].groupby('target').sum()

In [95]:
num_favorable_by_story.sort_values('Favorable', ascending=False).head()

Unnamed: 0_level_0,Favorable
target,Unnamed: 1_level_1
4.0,181
34.0,154
21.0,130
22.0,123
36.0,119


In [96]:
frequent_itemsets = dict()
min_support = 50
frequent_itemsets[1] = dict((frozenset((target,)), row['Favorable']) for target, row in num_favorable_by_story.iterrows() if row['Favorable'] > min_support)

In [97]:
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [98]:
import sys
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1], min_support)
    frequent_itemsets[k] = cur_frequent_itemsets
    if len(cur_frequent_itemsets) == 0:
        print(f"Did not find any frequent itemsets of length {k}.")
        sys.stdout.flush()
        break
    else:
        print(f"I found {len(cur_frequent_itemsets)} frequent itemsets of length {k}.")
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets

I found 413 frequent itemsets of length 2.
I found 2208 frequent itemsets of length 3.
I found 6426 frequent itemsets of length 4.
I found 11402 frequent itemsets of length 5.
I found 12617 frequent itemsets of length 6.
I found 8799 frequent itemsets of length 7.
I found 3709 frequent itemsets of length 8.
I found 844 frequent itemsets of length 9.
I found 85 frequent itemsets of length 10.
I found 2 frequent itemsets of length 11.
Did not find any frequent itemsets of length 12.


In [99]:
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))

In [100]:
correct_counts = defaultdict()
incorrect_counts =  defaultdict()

correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [101]:
rule_confidence = {candidate_rule: correct_counts[candidate_rule]/float
(correct_counts[candidate_rule] + incorrect_counts[candidate_rule]) for 
candidate_rule in candidate_rules} 

In [102]:
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), 
reverse=True)

for index in range(5):
    print(f"Rule #{index+1}")
    (premise, conclusion) = sorted_confidence[index][0]
    print(f"Rule: If a person recommends {premise} they will also recommend {conclusion}.")
    print(f"- Confidence: {rule_confidence[(premise, conclusion)]}")
    print("")

Rule #1
Rule: If a person recommends frozenset({102.0}) they will also recommend 4.0.
- Confidence: 1.0

Rule #2
Rule: If a person recommends frozenset({115.0, 78.0}) they will also recommend 27.0.
- Confidence: 1.0

Rule #3
Rule: If a person recommends frozenset({27.0, 94.0}) they will also recommend 115.0.
- Confidence: 1.0

Rule #4
Rule: If a person recommends frozenset({34.0, 115.0}) they will also recommend 27.0.
- Confidence: 1.0

Rule #5
Rule: If a person recommends frozenset({74.0, 108.0}) they will also recommend 101.0.
- Confidence: 1.0



In [103]:
from all_names import all_author_and_student_names

In [104]:
names = all_author_and_student_names
names_df = pd.DataFrame.from_dict(names, orient='index')
names_df.index.names = ['id']
names_df.columns = ['text/student_name']
names_df


Unnamed: 0_level_0,text/student_name
id,Unnamed: 1_level_1
125,JoleneA
126,ChristianB
127,EmmaB
128,JordanC
129,ClaytonC
...,...
392,RosaiselaC
393,SavannahJ
394,StephanieC
395,WilliamA


In [105]:
def get_story_name(story_number):
    story_name = names_df[names_df.index == names_df]['text/student_name']
    title = story_name.values[0]
    return(title)

In [106]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_story_name(idx) for idx in premise)
    conclusion_name = get_story_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print("")

Rule #1


ValueError: Unable to coerce to Series, length must be 1: given 272