<a href="https://colab.research.google.com/github/harini9804/big-data-assignment/blob/master/Big_Data_Assignment_Rules_and_Patterns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Rules and Patterns:

Using the plants dataset processed in Part A, in this notebook we shall implement Apriori algorithm. Further, we shall validate the "interesting-ness" of the rules and petterns generated using various measures.

In [0]:
import pandas as pd
import numpy as np
!pip install apyori
from apyori import apriori


Collecting apyori
  Downloading https://files.pythonhosted.org/packages/5e/62/5ffde5c473ea4b033490617ec5caa80d59804875ad3c3c57c0976533a21a/apyori-1.1.2.tar.gz
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25l[?25hdone
  Created wheel for apyori: filename=apyori-1.1.2-cp36-none-any.whl size=5975 sha256=aec14b66a68359befbfda9f3684a934935751b985e738feacce0a15a3d0a8edb
  Stored in directory: /root/.cache/pip/wheels/5d/92/bb/474bbadbc8c0062b9eb168f69982a0443263f8ab1711a8cad0
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


In [0]:
dataset = pd.read_csv('/content/plants_data.csv')
dataset.head()

Unnamed: 0,Symbol,National Common Name,Author,Family
0,0,0,1,0
1,1,1,0,0
2,1,2,1,0
3,2,2,2,0
4,3,3,0,0


In [0]:
# Check for the null values in the dataset and if present, remove them.
transactions = []
for index, data in dataset.iterrows():
    transaction = pd.Series.tolist(data[~pd.isnull(data)])
    transactions.append(set(transaction))

In [0]:
_association_rules = apriori(transactions, min_support=0.0025, min_confidence=0.7, min_lift=0, min_length=2)

association_rules = []

# Print the association rule
print("Association Rules: Min Support = 0.25%, Min Confidence = 70%", end='\n\n')
for association_rule in _association_rules:
    
    itemset = set([item for item in association_rule[0]])
    support = association_rule[1]
    
    precedent = set([item for item in association_rule[2][0][0]])
    antecedent = set([item for item in association_rule[2][0][1]])
    
    confidence = association_rule[2][0][2]
    lift = association_rule[2][0][3]
    
    association_rules.append((precedent, antecedent))
    
    print("{} => {}".format(precedent, antecedent))
    print("Support = {}, Confidence = {}, Lift = {}".format(support, confidence, lift), end='\n\n')

Association Rules: Min Support = 0.25%, Min Confidence = 70%

{633} => {16}
Support = 0.002679006332196785, Confidence = 1.0, Lift = 6.1169459962756045

{1101} => {16}
Support = 0.002618119824646858, Confidence = 1.0, Lift = 6.1169459962756045

{4880} => {117}
Support = 0.002800779347296639, Confidence = 0.851851851851852, Lift = 7.21176021382207

{6035} => {135}
Support = 0.002679006332196785, Confidence = 0.9777777777777777, Lift = 36.58091622374082

{6386} => {146}
Support = 0.004262055528494886, Confidence = 0.9859154929577465, Lift = 29.387796835459216



In [0]:
class ARMInterestMeasures:
    
    def __init__(self, transactions, antecedent, consequent):
        ''' Compute necessary parameters involving antecedent, consequent from transactions '''
        
        self.transactions = transactions
        self.antecedent = antecedent
        self.consequent = consequent
        
        self.n_transactions = len(transactions) # Number of transactions in the database
        
        self.n_antecedent_present_trans = 0 # Number of transactions that contain antecedent
        self.n_consequent_present_trans = 0 # Number of transactions thar contain consequent
        self.n_consequent_absent_trans = 0 # Number of transactions that oppose the consequent
        self.n_support_trans = 0 # Number of transactions that support rule (A ^ B)
        self.n_oppose_trans = 0 # Number of transactions that oppose the rule (A ^ !B)
        
        for transaction in transactions:
            
            antecedent_present = self.antecedent <= transaction # Check if antecedent is subset of transaction
            consequent_present = self.consequent <= transaction # Check if consequent is subset of transaction
            
            if antecedent_present:
                self.n_antecedent_present_trans += 1
                
            if consequent_present:
                self.n_consequent_present_trans += 1
            else:
                self.n_consequent_absent_trans += 1
                
            if antecedent_present and consequent_present:
                self.n_support_trans += 1
                
            if antecedent_present and not consequent_present:
                self.n_oppose_trans += 1
                
                
    def computeSupport(self):
        ''' Compute the Support of an association rule A -> B
        
        Formula: n{A U B}/n
        Range: [0, 1]
        Intreprtation: Measure of popularity of the itemset, as measured by the proportion of transactions in which the 
        itemset {AUB} appears.
        '''
            
        return self.n_support_trans/self.n_transactions
    
    def computeConfidence(self):
        ''' Compute the confidence of an association rule A->B
        
        Formula: n{A U B}/n{A}
        Range: [0, 1]
        Intrepretation: How likey B is purchased, when A is purchased, as measured by the proportion of transactions with 
        items A in which items B also appears.
        '''
        
        return self.n_support_trans/self.n_antecedent_present_trans
    
    def computeLift(self):
        ''' Compute the lift (or interest) of an association rule A->B
        
        Formula: support{A U B}/(support{A}.support{B})
        Intrepretation: How likely item B is purchased, when item A is purchased, while controlling for how popular B 
        already is.
            
            Lift = 1 => No Association b/w items
            Lift > 1 => Item B is likely to be brought when item A is brought
            Lift < 1 => Item B is unlikely to be brought when item A is brought
            
        Drawbacks:
            - Rules that hold 100% of time, may not have highest possible lift.
            - Lift is symmetric, i.e., Lift(A->B) = Lift(B->A)
        '''
        
        return (self.n_support_trans/self.n_transactions)/((self.n_antecedent_present_trans/self.n_transactions)*(self.n_consequent_present_trans/self.n_transactions))
    
    def computeConviction(self):
        ''' Compute the conviction of the rule A->B or !(A & !B)
        
        Formula: ( support(A) . support(!B) )/support(A and !B)
        Range: [0, inf)
        Intrepretation: Measure of Implication
        '''
        div = (self.n_oppose_trans/self.n_transactions)

        if div == 0:
          result = 0
        else:
          result = ((self.n_antecedent_present_trans/self.n_transactions)*(self.n_consequent_absent_trans/self.n_transactions))/ div
        return result
    
    def computeLeverage(self):
        ''' Compute the leverage (or Piatetsky-Shapiro) of the rule A->B
        
        Formula: Support(A,B) - Support(A).Support(B)
        Intrepretation: Is the 'proportion of additional elements' covered by both the premise and consequence 'above the 
        expected' if indepedent.
        '''
        
        return (self.n_support_trans/self.n_transactions) - (self.n_antecedent_present_trans/self.n_transactions)*(self.n_consequent_present_trans/self.n_transactions)
    
    def computeCoverage(self):
        ''' Compute the coverage of the rule  A->B
        
        Formula: support(A)
        Range: [0, 1]
        '''
        
        return self.n_antecedent_present_trans/self.n_transactions
    
    def computeCosineSimilarity(self):
        ''' Compute the cosine similarity of the rule A->B 
        
        Formula: Support(A,B)/sqrt(Support(A).Support(B))
        '''
        
        return (self.n_support_trans/self.n_transactions)/np.sqrt((self.n_antecedent_present_trans/self.n_transactions)*(self.n_consequent_present_trans/self.n_transactions))

In [0]:
for precedent, antecedent in association_rules:
    
    print("{} => {}".format(precedent, antecedent))
    
    arm_interest_measures = ARMInterestMeasures(transactions, precedent, antecedent)
    print("Support = {}".format(arm_interest_measures.computeSupport()))
    print("Confidence = {}".format(arm_interest_measures.computeConfidence()))
    print("Lift = {}".format(arm_interest_measures.computeLift()))
    print("Conviction = {}".format(arm_interest_measures.computeConviction()))
    print("Leverage = {}".format(arm_interest_measures.computeLeverage()))
    print("Coverage = {}".format(arm_interest_measures.computeCoverage()))
    print("Cosine Similarity = {}".format(arm_interest_measures.computeCosineSimilarity()))
    
    print()

{633} => {16}
Support = 0.002679006332196785
Confidence = 1.0
Lift = 6.1169459962756045
Conviction = 0
Leverage = 0.0022410416462525346
Coverage = 0.002679006332196785
Cosine Similarity = 0.12801303471806344

{1101} => {16}
Support = 0.002618119824646858
Confidence = 1.0
Lift = 6.116945996275605
Conviction = 0
Leverage = 0.002190108881564977
Coverage = 0.002618119824646858
Cosine Similarity = 0.1265499805576571

{4880} => {117}
Support = 0.002800779347296639
Confidence = 0.8518518518518519
Lift = 7.211760213822069
Conviction = 5.952691183633706
Leverage = 0.002412416553158162
Coverage = 0.0032878714076960544
Cosine Similarity = 0.14212159957067907

{6035} => {135}
Support = 0.002679006332196785
Confidence = 0.9777777777777777
Lift = 36.58091622374082
Conviction = 43.797187043351194
Leverage = 0.0026057712520306375
Coverage = 0.002739892839746712
Cosine Similarity = 0.31305032534875565

{6386} => {146}
Support = 0.004262055528494886
Confidence = 0.9859154929577465
Lift = 29.387796835459