In [1]:
# Resource: https://stackabuse.com/association-rule-mining-via-apriori-algorithm-in-python

## Import packages

In [2]:
import pandas as pd
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
from apyori import apriori
# from mlxtend.frequent_patterns import apriori, association_rules

## Import data

In [3]:
df = pd.read_json('goodreads_reviews_children.json', lines= True)

## Filter for Reviews of Top 10 Books

In [4]:
book_selection = df.groupby('book_id').size().reset_index()
book_selection = book_selection.rename(columns = {0:'value_count'}).sort_values('value_count', ascending = False)
print(book_selection.head(30))
book_selection = book_selection['book_id'][:10]

         book_id  value_count
88          3636         6156
0              5         4696
74564   11387515         4499
4678      157993         3114
2403       78411         1999
70142    9673436         1947
323         8127         1721
9794      370493         1697
75083   11594337         1575
54          2998         1524
1341       38709         1450
818        24178         1449
1399       39988         1387
104579  23302416         1351
10257     389627         1320
89750   17349203         1288
6662      236093         1184
14           378         1115
43          2839         1065
2562       83369         1058
1658       47281          974
480        13023          960
831        24213          940
1247       37186          923
1055       30119          915
190         6319          908
106992  24612624          907
1251       37190          899
695        19543          893
86257   16052012          869


In [5]:
df = df[df['book_id'].isin(book_selection)]
df

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
46,7b2e5fe9fd353fecf3eeebb4850b88d3,11594337,0cef00e7ac5b37c459c94083475413f7,5,"great story, heartwarming and very current wit...",Sat Jan 05 17:30:28 -0800 2013,Sun Jan 27 19:35:53 -0800 2013,,,0,0
54,0ef32090550901ead25cb0ea21c4d36b,157993,f379671d83939b72314ef2439d44e372,4,I read this in French during high school (Fren...,Wed Nov 21 16:45:09 -0800 2012,Wed Nov 21 16:45:46 -0800 2012,,,0,0
63,d37b46b2190ed7c518259f29b47a9b36,3636,ac7aae56c65adfde94bb8a9e653f67d3,5,A quick but engrossing dystopian novel. The re...,Thu Mar 30 11:33:22 -0700 2017,Thu Mar 30 18:10:44 -0700 2017,Thu Mar 30 00:00:00 -0700 2017,Wed Mar 29 00:00:00 -0700 2017,4,0
70,f4c6fe33ef61c38f7f4aeb5224c259a5,11387515,13f03d0906f27f5202865787efdf1bbc,4,I love how simple the story was told but the m...,Sun Nov 04 00:40:58 -0700 2012,Mon Nov 12 04:14:02 -0800 2012,Mon Nov 12 04:14:02 -0800 2012,Sun Nov 04 00:00:00 -0700 2012,0,0
71,f4c6fe33ef61c38f7f4aeb5224c259a5,157993,45c234cd783da040e9da8c3e81df0297,5,We should see with our hearts not with our eye...,Fri Nov 02 07:37:50 -0700 2012,Fri Apr 26 06:56:47 -0700 2013,,,1,0
...,...,...,...,...,...,...,...,...,...,...,...
734590,8970fe60796274ca4b58eeabfe6fb78e,5,a6a0628171a2dea1c09f87032ffb02d8,5,Love the books,Thu Nov 24 02:51:32 -0800 2016,Thu Nov 24 02:52:39 -0800 2016,Fri Jan 01 00:00:00 -0800 2016,Fri Jan 01 00:00:00 -0800 2016,0,0
734606,31a2c453c1742edd9732dff77ffc8a50,11387515,a5761512de5a6f1af1b60065e88924c6,5,Awesome book about the imperfections of humani...,Fri Mar 30 07:42:41 -0700 2012,Fri Mar 30 07:47:29 -0700 2012,Tue Mar 27 00:00:00 -0700 2012,,0,0
734613,ce27420f5dbfcecc92057a4000345027,2998,261b1729ad5c4331ca5ea490f4a0dea0,3,I was great to read an old classic that i reme...,Thu May 31 12:00:25 -0700 2012,Thu May 31 12:01:19 -0700 2012,,,0,0
734624,354ffdcbf956c820137b85bf8b957b9e,157993,84f15a500176f5c26d799614001bcc75,4,It's quite difficult for a layman like me to r...,Sat Jun 02 11:21:30 -0700 2012,Tue Jun 05 11:44:33 -0700 2012,Mon Jun 04 00:00:00 -0700 2012,Sat Jun 02 00:00:00 -0700 2012,0,0


In [6]:
df['dummy'] = 0

# This joins all the review_text for each book.
df_grouped = df.groupby(['dummy'])['review_text'].apply(' '.join).reset_index()
df_grouped

Unnamed: 0,dummy,review_text
0,0,"great story, heartwarming and very current wit..."


## Prepare Data for Apriori Algorithm

In [7]:
# This maps nltk sent_tokenize to review_text.
df_grouped['sent_tokens'] = df_grouped['review_text'].map(nltk.sent_tokenize)
df_grouped['n_sentences'] = df_grouped['sent_tokens'].map(len)
df_grouped

Unnamed: 0,dummy,review_text,sent_tokens,n_sentences
0,0,"great story, heartwarming and very current wit...","[great story, heartwarming and very current wi...",138392


In [8]:
# This function iterates through a list of sentences and word tokenizes each sentence.
def word_tokenize_helper(sent_tokens):
    
    word_tokens = []
    
    for sentence in sent_tokens:
        word_tokens.append(word_tokenize(sentence))
        
    return word_tokens

In [9]:
def len2(list_of_lists):
    
    count = 0
    
    for lst in list_of_lists:
        count = len(lst) + count
        
    return count

In [10]:
df_grouped['word_tokens'] = df_grouped['sent_tokens'].map(word_tokenize_helper)
df_grouped['n_words'] = df_grouped['word_tokens'].map(len2)
df_grouped

Unnamed: 0,dummy,review_text,sent_tokens,n_sentences,word_tokens,n_words
0,0,"great story, heartwarming and very current wit...","[great story, heartwarming and very current wi...",138392,"[[great, story, ,, heartwarming, and, very, cu...",2467455


In [11]:
# This function iterates through a list of lists of word tokens and POS tags each word.
def pos_tag_helper(word_tokens):
    
    tagged_words = []
    
    for word in word_tokens:
        tagged_words.append(nltk.pos_tag(word))
        
    return tagged_words

In [12]:
df_grouped['tagged_words'] = df_grouped['word_tokens'].map(pos_tag_helper)
df_grouped['n_tagged_words'] = df_grouped['tagged_words'].map(len2)
df_grouped

Unnamed: 0,dummy,review_text,sent_tokens,n_sentences,word_tokens,n_words,tagged_words,n_tagged_words
0,0,"great story, heartwarming and very current wit...","[great story, heartwarming and very current wi...",138392,"[[great, story, ,, heartwarming, and, very, cu...",2467455,"[[(great, JJ), (story, NN), (,, ,), (heartwarm...",2467455


In [13]:
stop_list = stopwords.words('english')

stop_list.append('book')

In [14]:
def clean_text(tagged_words):
    
    cleaned_text = []
    
    for sentence in tagged_words:
        lst = []
        
        for (word, tag) in sentence:
            # Check if word contains only alphabet characters and is tagged as NN
            if re.search('^[a-z]+$', word) and word not in stop_list and tag == 'NN':
                lst.append((word, tag))

        if len(lst) > 0:
            cleaned_text.append(lst)
        
    return cleaned_text

In [15]:
df_grouped['cleaned_words'] = df_grouped['tagged_words'].map(clean_text)
df_grouped['n_cleaned_words'] = df_grouped['cleaned_words'].map(len2)
df_grouped

Unnamed: 0,dummy,review_text,sent_tokens,n_sentences,word_tokens,n_words,tagged_words,n_tagged_words,cleaned_words,n_cleaned_words
0,0,"great story, heartwarming and very current wit...","[great story, heartwarming and very current wi...",138392,"[[great, story, ,, heartwarming, and, very, cu...",2467455,"[[(great, JJ), (story, NN), (,, ,), (heartwarm...",2467455,"[[(story, NN), (circle, NN), (title, NN), (rea...",302541


In [16]:
cleaned_words = df_grouped['cleaned_words'][0]
cleaned_words

[[('story', 'NN'),
  ('circle', 'NN'),
  ('title', 'NN'),
  ('reader', 'NN'),
  ('girth', 'NN'),
  ('spacing', 'NN'),
  ('school', 'NN'),
  ('class', 'NN'),
  ('read', 'NN')],
 [('novel', 'NN')],
 [('reader', 'NN'), ('world', 'NN'), ('weather', 'NN'), ('turmoil', 'NN')],
 [('man', 'NN'), ('world', 'NN'), ('technicolor', 'NN')],
 [('literature', 'NN')],
 [('story', 'NN'), ('meaning', 'NN')],
 [('epitome', 'NN'), ('courage', 'NN'), ('strength', 'NN'), ('wisdom', 'NN')],
 [('chapter', 'NN'), ('perspective', 'NN')],
 [('thing', 'NN'), ('love', 'NN'), ('concern', 'NN')],
 [('graduation', 'NN'), ('part', 'NN')],
 [('heart', 'NN'), ('scene', 'NN'), ('back', 'NN'), ('disease', 'NN')],
 [('lesson', 'NN')],
 [('professor', 'NN')],
 [('bound', 'NN')],
 [('surprise', 'NN')],
 [('couple', 'NN')],
 [('lot', 'NN')],
 [('beginning', 'NN'), ('kind', 'NN')],
 [('povs', 'NN')],
 [('protagonist', 'NN'), ('character', 'NN')],
 [('part', 'NN')],
 [('addition', 'NN'), ('povs', 'NN'), ('insight', 'NN')],
 [('

## Association Mining Using Apriori

In [17]:
# Apply apriori to NN and NNPs
# min_support set to 0.001 because 0.01 return too few associations
association_rules = apriori(cleaned_words, min_support = 0.001, min_confidence = 0, min_lift = 0)
association_results = list(association_rules)

In [19]:
association_results

[RelationRecord(items=frozenset({('ability', 'NN')}), support=0.002002002002002002, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({('ability', 'NN')}), confidence=0.002002002002002002, lift=1.0)]),
 RelationRecord(items=frozenset({('acceptance', 'NN')}), support=0.0013378763378763378, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({('acceptance', 'NN')}), confidence=0.0013378763378763378, lift=1.0)]),
 RelationRecord(items=frozenset({('action', 'NN')}), support=0.0025602525602525603, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({('action', 'NN')}), confidence=0.0025602525602525603, lift=1.0)]),
 RelationRecord(items=frozenset({('adaptation', 'NN')}), support=0.0015207515207515208, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({('adaptation', 'NN')}), confidence=0.0015207515207515208, lift=1.0)]),
 RelationRecord(items=frozenset({('addition', 'NN')}

In [21]:
# Filter associations of minimum length 2
association_results = list(filter(lambda x: len(x.items) > 1, association_results))

In [22]:
print(len(association_results))
#print(association_results[0])

# print confidence of the 2nd item
#print(association_results[0][2][0][2])

# print lift of the 2nd item
#print(association_results[0][2][0][3])

95


In [25]:
# Print results of apriori
for item in association_results:
    
    # first index of the inner list
    # Contains base item and add item
    pair = item[0] 
    items = [x for x in pair]
    print("Rule: " + str(items[0]) + " -> " + str(items[1]))

    #second index of the inner list
    print("Support: " + str(item[1]))

    #third index of the list located at 0th
    #of the third index of the inner list

#     print("Confidence: " + str(item[2][0][2]))
#     print("Lift: " + str(item[2][0][3]))
    print("=====================================")

Rule: ('adult', 'NN') -> ('child', 'NN')
Support: 0.0019346269346269347
Rule: ('story', 'NN') -> ('adult', 'NN')
Support: 0.0011068761068761068
Rule: ('story', 'NN') -> ('age', 'NN')
Support: 0.0012993762993762994
Rule: ('spoiler', 'NN') -> ('alert', 'NN')
Support: 0.0039751289751289755
Rule: ('author', 'NN') -> ('story', 'NN')
Support: 0.0020693770693770695
Rule: ('way', 'NN') -> ('author', 'NN')
Support: 0.0010683760683760685
Rule: ('elephant', 'NN') -> ('baby', 'NN')
Support: 0.00154000154000154
Rule: ('story', 'NN') -> ('bit', 'NN')
Support: 0.0014437514437514439
Rule: ('boy', 'NN') -> ('face', 'NN')
Support: 0.0010491260491260492
Rule: ('life', 'NN') -> ('boy', 'NN')
Support: 0.001414876414876415
Rule: ('boy', 'NN') -> ('school', 'NN')
Support: 0.0014341264341264342
Rule: ('boy', 'NN') -> ('story', 'NN')
Support: 0.003984753984753985
Rule: ('boy', 'NN') -> ('time', 'NN')
Support: 0.0012608762608762609
Rule: ('boy', 'NN') -> ('tree', 'NN')
Support: 0.003667128667128667
Rule: ('year

In [24]:
# Extract association mining results into a pandas df

results = []
for item in association_results:
    # first index of the inner list
    # Contains base item and add item
    pair = item[0] 
    items = [x for x in pair]
    
    value0 = str(items[0])
    value1 = str(items[1])
    
    #second value of the inner list
    value2 = str(item[1])[:7]
    
    #third index of the list located at 0th
    #of the third index of the inner list
    value3 = str(item[2][0][2])[:7]
    value4 = str(item[2][0][3])[:7]
    
    rows = (value0, value1, value2, value3, value4)
    results.append(rows)
    
labels = ['Word1','Word2','Support','Confidence','Lift']
df_assoc = pd.DataFrame.from_records(results, columns = labels).sort_values(by=['Support','Confidence'], ascending=False)

print(df_assoc)

                Word1             Word2  Support Confidence Lift
86      ('way', 'NN')   ('story', 'NN')  0.00406    0.00406  1.0
11      ('boy', 'NN')   ('story', 'NN')  0.00398    0.00398  1.0
3   ('spoiler', 'NN')   ('alert', 'NN')  0.00397    0.00397  1.0
49     ('life', 'NN')   ('story', 'NN')  0.00380    0.00380  1.0
13      ('boy', 'NN')    ('tree', 'NN')  0.00366    0.00366  1.0
..                ...               ...      ...        ...  ...
55      ('que', 'NN')     ('los', 'NN')  0.00102    0.00102  1.0
65    ('story', 'NN')  ('person', 'NN')  0.00102    0.00102  1.0
92    ('world', 'NN')    ('time', 'NN')  0.00102    0.00102  1.0
93     ('year', 'NN')    ('time', 'NN')  0.00101    0.00101  1.0
33     ('life', 'NN')  ('family', 'NN')  0.00100    0.00100  1.0

[95 rows x 5 columns]


In [None]:
# Export association mining results into Excel
# determining the name of the file
file_name = 'Association_mining_results.xlsx'
  
# saving the excel
df_assoc.to_excel(file_name)
print('DataFrame is written to Excel File successfully.')