In [2]:
#Function to generate association rules from frequent itemsets
from mlxtend.frequent_patterns import association_rules

In [3]:
#Import Books dataset for analysis
import pandas as pd
import numpy as np

# Location of the raw Books CSV, then load it into a DataFrame.
filepath = (
    'https://raw.githubusercontent.com/'
    'jshumway0475/Predictive-Analytics/main/Books.csv'
)
books = pd.read_csv(filepath_or_buffer = filepath)

In [4]:
#Review data
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own; wrapping it in print() adds a stray "None" line.
books.info()
print()
print(books.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1878 entries, 0 to 1877
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ChildBks  1878 non-null   object
 1   YouthBks  1878 non-null   object
 2   CookBks   1878 non-null   object
 3   DoItYBks  1878 non-null   object
 4   RefBks    1878 non-null   object
 5   ArtBks    1878 non-null   object
 6   GeogBks   1878 non-null   object
dtypes: object(7)
memory usage: 102.8+ KB
None

  ChildBks YouthBks CookBks DoItYBks RefBks ArtBks GeogBks
0      yes       no     yes       no    yes    yes      no
1       no      yes     yes      yes    yes    yes     yes
2      yes      yes     yes      yes    yes    yes     yes
3       no       no      no      yes     no    yes      no
4      yes      yes      no      yes    yes    yes      no


In [5]:
#Prepare dataframe for modeling
# Map both labels in a single replace() pass instead of two chained calls.
books = books.replace({'no': False, 'yes': True})
# Fail loudly if anything other than the two expected labels was present;
# otherwise the cast below would silently turn stray strings into True.
assert books.isin([True, False]).all().all(), "Books contains values other than 'yes'/'no'"
# Cast explicitly rather than relying on replace() to downcast the object
# columns to bool on its own (that implicit downcast is deprecated in recent
# pandas), so the frame handed to apriori() has a real boolean dtype.
books = books.astype(bool)
books.head()

Unnamed: 0,ChildBks,YouthBks,CookBks,DoItYBks,RefBks,ArtBks,GeogBks
0,True,False,True,False,True,True,False
1,False,True,True,True,True,True,True
2,True,True,True,True,True,True,True
3,False,False,False,True,False,True,False
4,True,True,False,True,True,True,False


In [6]:
#Create association rules model
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Step 1: mine itemsets using a minimum support of 0.6, labelled by column name.
frequent_itemsets_books = apriori(
    books,
    min_support = 0.6,
    use_colnames = True,
)
# Step 2: derive rules from those itemsets, thresholding confidence at 0.7.
books_ar = association_rules(
    frequent_itemsets_books,
    metric = 'confidence',
    min_threshold = 0.7,
)
books_ar.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(YouthBks),(RefBks),0.752929,0.78115,0.63099,0.838048,1.072839,0.04284,1.351326
1,(RefBks),(YouthBks),0.78115,0.752929,0.63099,0.807771,1.072839,0.04284,1.285297
2,(YouthBks),(ArtBks),0.752929,0.758253,0.612354,0.813296,1.072591,0.041443,1.294809
3,(ArtBks),(YouthBks),0.758253,0.752929,0.612354,0.807584,1.072591,0.041443,1.28405
4,(YouthBks),(GeogBks),0.752929,0.728435,0.600106,0.79703,1.094168,0.051647,1.337957


**Sample calculations to verify the output (using row 1, antecedent: RefBks, consequent: YouthBks)**  

*Number of Antecedent Instances = antecedent support * Count(Transactions)*  
1. Number of Antecedent Instances (row 1) = 0.781150 * 1878 = 1467 which is correct!  
  
*Number of Consequent Instances = consequent support * Count(Transactions)*  
1. Number of Consequent Instances (row 1) = 0.752929 * 1878 = 1414 which is correct!  
  
*Rule Support = Count(A ∪ C) / Count(Transactions), where Count(A ∪ C) is the number of transactions containing both the antecedent A and the consequent C*  
1. Rule Support (row 1) = 0.630990 = Count(A ∪ C) / 1878  
2. Count(A ∪ C) = 0.630990 * 1878 = 1185 which is correct!  
  
*Lift = Confidence / Benchmark Confidence*  
*Confidence = Count(A ∪ C) / Number of Antecedent Instances*  
*Benchmark Confidence = Number of Consequent Instances / Count(Transactions)*   
1. Confidence (row 1) = 1185 / 1467 = 0.807771 which is correct!  
2. Benchmark Confidence (row 1) = 1414 / 1878 = 0.752929 which is correct!  
3. Lift (row 1) = 0.807771 / 0.752929 = 1.072839 which is correct!

**Sample calculations to verify the output (using row 0, antecedent: YouthBks, consequent: RefBks)**  

*Number of Antecedent Instances = antecedent support * Count(Transactions)*  
1. Number of Antecedent Instances (row 0) = 0.752929 * 1878 = 1414 which is correct!  
  
*Number of Consequent Instances = consequent support * Count(Transactions)*  
1. Number of Consequent Instances (row 0) = 0.781150 * 1878 = 1467 which is correct!  
  
*Rule Support = Count(A ∪ C) / Count(Transactions), where Count(A ∪ C) is the number of transactions containing both the antecedent A and the consequent C*  
1. Rule Support (row 0) = 0.630990 = Count(A ∪ C) / 1878  
2. Count(A ∪ C) = 0.630990 * 1878 = 1185 which is correct!  
  
*Lift = Confidence / Benchmark Confidence*  
*Confidence = Count(A ∪ C) / Number of Antecedent Instances*  
*Benchmark Confidence = Number of Consequent Instances / Count(Transactions)*   
1. Confidence (row 0) = 1185 / 1414 = 0.838048 which is correct!  
2. Benchmark Confidence (row 0) = 1467 / 1878 = 0.781150 which is correct!  
3. Lift (row 0) = 0.838048 / 0.781150 = 1.072839 which is correct!

In [7]:
#Filter rules to the ArtBks consequent
fname = 'ArtBks'
# Each consequent is a set of item names, so test membership directly.
# Matching on the stringified set would treat fname as a regex and also hit
# any item name that merely contains it as a substring.
# astype(bool) keeps the mask boolean even when there are no rules at all.
books_ar_filtered = books_ar[
    books_ar['consequents'].map(lambda items: fname in items).astype(bool)
]
books_ar_filtered.sort_values('lift', ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
14,(GeogBks),(ArtBks),0.728435,0.758253,0.613951,0.842836,1.11155,0.061613,1.538183
9,(DoItYBks),(ArtBks),0.717785,0.758253,0.600639,0.836795,1.103583,0.056376,1.481247
2,(YouthBks),(ArtBks),0.752929,0.758253,0.612354,0.813296,1.072591,0.041443,1.294809
10,(RefBks),(ArtBks),0.78115,0.758253,0.630458,0.807089,1.064406,0.038148,1.253153


In [8]:
#Import StatisticsComCourses dataset for analysis
# Location of the raw course-enrolment CSV, then load it into a DataFrame.
filepath2 = (
    'https://raw.githubusercontent.com/'
    'jshumway0475/Predictive-Analytics/main/StatisticsComCourses.csv'
)
StatisticsComCourses = pd.read_csv(filepath_or_buffer = filepath2)

In [9]:
#Review data
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own; wrapping it in print() adds a stray "None" line.
StatisticsComCourses.info()
print()
print(StatisticsComCourses.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Intro       365 non-null    object
 1   DataMining  365 non-null    object
 2   Survey      365 non-null    object
 3   Cat Data    365 non-null    object
 4   Regression  365 non-null    object
 5   Forecast    365 non-null    object
 6   DOE         365 non-null    object
 7   SW          365 non-null    object
dtypes: object(8)
memory usage: 22.9+ KB
None

  Intro DataMining Survey Cat Data Regression Forecast  DOE   SW
0    no         no    yes      yes        yes      yes  yes  yes
1   yes        yes     no      yes        yes      yes  yes  yes
2   yes         no    yes       no         no      yes  yes   no
3    no        yes    yes      yes        yes      yes  yes  yes
4    no         no    yes      yes        yes      yes  yes  yes


In [10]:
#Prepare dataframe for modeling
# Map both labels in a single replace() pass instead of two chained calls.
StatisticsComCourses = StatisticsComCourses.replace({'no': False, 'yes': True})
# Fail loudly if anything other than the two expected labels was present;
# otherwise the cast below would silently turn stray strings into True.
assert StatisticsComCourses.isin([True, False]).all().all(), \
    "StatisticsComCourses contains values other than 'yes'/'no'"
# Cast explicitly rather than relying on replace() to downcast the object
# columns to bool on its own (that implicit downcast is deprecated in recent
# pandas), so the frame handed to apriori() has a real boolean dtype.
StatisticsComCourses = StatisticsComCourses.astype(bool)
StatisticsComCourses.head()

Unnamed: 0,Intro,DataMining,Survey,Cat Data,Regression,Forecast,DOE,SW
0,False,False,True,True,True,True,True,True
1,True,True,False,True,True,True,True,True
2,True,False,True,False,False,True,True,False
3,False,True,True,True,True,True,True,True
4,False,False,True,True,True,True,True,True


In [11]:
#Create association rules model
# Step 1: mine itemsets using a minimum support of 0.6, labelled by column name.
frequent_itemsets_StatisticsComCourses = apriori(
    StatisticsComCourses,
    min_support = 0.6,
    use_colnames = True,
)
# Step 2: derive rules from those itemsets, thresholding confidence at 0.8.
StatisticsComCourses_ar = association_rules(
    frequent_itemsets_StatisticsComCourses,
    metric = 'confidence',
    min_threshold = 0.8,
)
StatisticsComCourses_ar.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(DataMining),(Survey),0.821918,0.813699,0.665753,0.81,0.995455,-0.00304,0.980534
1,(Survey),(DataMining),0.813699,0.821918,0.665753,0.818182,0.995455,-0.00304,0.979452
2,(Cat Data),(DataMining),0.791781,0.821918,0.663014,0.83737,1.0188,0.012235,1.095016
3,(DataMining),(Cat Data),0.821918,0.791781,0.663014,0.806667,1.0188,0.012235,1.076996
4,(DataMining),(Regression),0.821918,0.791781,0.657534,0.8,1.010381,0.006755,1.041096


**Sample calculations to verify the output (using row 1, antecedent: Survey, consequent: DataMining)**  

*Number of Antecedent Instances = antecedent support * Count(Transactions)*  
1. Number of Antecedent Instances (row 1) = 0.813699 * 365 = 297 which is correct!  
  
*Number of Consequent Instances = consequent support * Count(Transactions)*  
1. Number of Consequent Instances (row 1) = 0.821918 * 365 = 300 which is correct!  
  
*Rule Support = Count(A ∪ C) / Count(Transactions), where Count(A ∪ C) is the number of transactions containing both the antecedent A and the consequent C*  
1. Rule Support (row 1) = 0.665753 = Count(A ∪ C) / 365  
2. Count(A ∪ C) = 0.665753 * 365 = 243 which is correct!  
  
*Lift = Confidence / Benchmark Confidence*  
*Confidence = Count(A ∪ C) / Number of Antecedent Instances*  
*Benchmark Confidence = Number of Consequent Instances / Count(Transactions)*   
1. Confidence (row 1) = 243 / 297 = 0.818182 which is correct!  
2. Benchmark Confidence (row 1) = 300 / 365 = 0.821918 which is correct!  
3. Lift (row 1) = 0.818182 / 0.821918 = 0.995455 which is correct!

In [12]:
#Filter rules to the DataMining consequent
fname = 'DataMining'
# Each consequent is a set of item names, so test membership directly.
# Matching on the stringified set would treat fname as a regex and also hit
# any item name that merely contains it as a substring (e.g. 'Data' would
# match both 'DataMining' and 'Cat Data').
# astype(bool) keeps the mask boolean even when there are no rules at all.
StatisticsComCourses_ar_filtered = StatisticsComCourses_ar[
    StatisticsComCourses_ar['consequents'].map(lambda items: fname in items).astype(bool)
]
StatisticsComCourses_ar_filtered.sort_values('lift', ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
6,(Forecast),(DataMining),0.860274,0.821918,0.723288,0.840764,1.02293,0.016213,1.118356
2,(Cat Data),(DataMining),0.791781,0.821918,0.663014,0.83737,1.0188,0.012235,1.095016
5,(Regression),(DataMining),0.791781,0.821918,0.657534,0.83045,1.010381,0.006755,1.050321
10,(SW),(DataMining),0.778082,0.821918,0.638356,0.820423,0.998181,-0.001163,0.991673
1,(Survey),(DataMining),0.813699,0.821918,0.665753,0.818182,0.995455,-0.00304,0.979452
8,(DOE),(DataMining),0.827397,0.821918,0.673973,0.81457,0.99106,-0.00608,0.960372
