In [1]:
import pandas as pd
import numpy as np
import math

In [4]:
transaction_df = pd.read_csv('GroceryStoreDataSet.csv')
transaction_df

Unnamed: 0,"MILK,BREAD,BISCUIT"
0,"BREAD,MILK,BISCUIT,CORNFLAKES"
1,"BREAD,TEA,BOURNVITA"
2,"JAM,MAGGI,BREAD,MILK"
3,"MAGGI,TEA,BISCUIT"
4,"BREAD,TEA,BOURNVITA"
5,"MAGGI,TEA,CORNFLAKES"
6,"MAGGI,BREAD,TEA,BISCUIT"
7,"JAM,MAGGI,BREAD,TEA"
8,"BREAD,MILK"
9,"COFFEE,COKE,BISCUIT,CORNFLAKES"


In [5]:
transaction_df.index.rename('TID', inplace=True)
transaction_df.rename(columns={'MILK,BREAD,BISCUIT' : 'item_list'}, inplace=True)

In [6]:
trans_df = transaction_df.item_list.str.split(',')
trans_df

TID
0      [BREAD, MILK, BISCUIT, CORNFLAKES]
1                 [BREAD, TEA, BOURNVITA]
2               [JAM, MAGGI, BREAD, MILK]
3                   [MAGGI, TEA, BISCUIT]
4                 [BREAD, TEA, BOURNVITA]
5                [MAGGI, TEA, CORNFLAKES]
6            [MAGGI, BREAD, TEA, BISCUIT]
7                [JAM, MAGGI, BREAD, TEA]
8                           [BREAD, MILK]
9     [COFFEE, COKE, BISCUIT, CORNFLAKES]
10    [COFFEE, COKE, BISCUIT, CORNFLAKES]
11             [COFFEE, SUGER, BOURNVITA]
12                  [BREAD, COFFEE, COKE]
13                [BREAD, SUGER, BISCUIT]
14            [COFFEE, SUGER, CORNFLAKES]
15              [BREAD, SUGER, BOURNVITA]
16                 [BREAD, COFFEE, SUGER]
17                 [BREAD, COFFEE, SUGER]
18        [TEA, MILK, COFFEE, CORNFLAKES]
Name: item_list, dtype: object

In [7]:
def prune(data,supp):
    
    df = data[data.supp_count >= supp] 
    return df
    
def count_itemset(transaction_df, itemsets):
    
    count_item = {}
    for item_set in itemsets:
        set_A = set(item_set)
        for row in trans_df:
            set_B = set(row)
        
            if set_B.intersection(set_A) == set_A: 
                if item_set in count_item.keys():
                    count_item[item_set] += 1
                
                else:
                    count_item[item_set] = 1
                
    data = pd.DataFrame()
    data['item_sets'] = count_item.keys()
    data['supp_count'] = count_item.values()
    
    return data

def count_item(trans_items):
    
    count_ind_item = {}
    for row in trans_items:
        for i in range(len(row)):
            if row[i] in count_ind_item.keys():
                count_ind_item[row[i]] += 1
            else:
                count_ind_item[row[i]] = 1
    
    data = pd.DataFrame()
    data['item_sets'] = count_ind_item.keys()
    data['supp_count'] = count_ind_item.values()
    data = data.sort_values('item_sets')
    return data


def join(list_of_items):
    itemsets = []
    i = 1
    for entry in list_of_items:
        proceding_items = list_of_items[i:]
        for item in proceding_items:
            if(type(item) is str):
                if entry != item:
                    tuples = (entry, item)
                    itemsets.append(tuples)
            else:
                if entry[0:-1] == item[0:-1]:
                    tuples = entry+item[1:]
                    itemsets.append(tuples)
        i = i+1
    if(len(itemsets) == 0):
        return None
    return itemsets


In [8]:

def apriori(trans_data,supp=3, con=0.5):
    freq = pd.DataFrame()
    
    df = count_item(trans_data)
   
    while(len(df) != 0):
        
        df = prune(df, supp)
    
        if len(df) > 1 or (len(df) == 1 and int(df.supp_count >= supp)):
            freq = df
        
        itemsets = join(df.item_sets)
    
        if(itemsets is None):
            return freq
    
        df = count_itemset(trans_data, itemsets)
    return df

In [10]:
freq_item_sets = apriori(trans_df, 3)
freq_item_sets

Unnamed: 0,item_sets,supp_count
0,"(BISCUIT, BREAD)",3
3,"(BISCUIT, CORNFLAKES)",3
8,"(BOURNVITA, BREAD)",3
12,"(BREAD, COFFEE)",3
15,"(BREAD, MAGGI)",3
16,"(BREAD, MILK)",3
17,"(BREAD, SUGER)",4
18,"(BREAD, TEA)",4
19,"(COFFEE, COKE)",3
20,"(COFFEE, CORNFLAKES)",4


In [11]:
def calculate_conf(value1, value2):
    return round(int(value1)/int(value2) * 100, 2)

In [9]:
def strong_rules(freq_item_sets, threshold):

    confidences = {}
    for row in freq_item_sets.item_sets:
        for i in range(len(row)):
            for j in range(len(row)):
                 if i != j:
                    tuples = (row[i], row[j])
                    conf = calculate_conf(freq_item_sets[freq_item_sets.item_sets == row].supp_count, count_item(trans_df)[count_item(trans_df).item_sets == row[i]].supp_count)
                    confidences[tuples] = conf

        
    conf_df = pd.DataFrame()
    conf_df['item_set'] = confidences.keys()
    conf_df['confidence'] = confidences.values()

    return conf_df[conf_df.confidence >= threshold]

    

In [10]:
strong_rules(freq_item_sets, 50.0)

Unnamed: 0,item_set,confidence
1,"(SUGER, BREAD)",66.67
3,"(TEA, BREAD)",57.14
4,"(COFFEE, CORNFLAKES)",50.0
5,"(CORNFLAKES, COFFEE)",66.67
6,"(COFFEE, SUGER)",50.0
7,"(SUGER, COFFEE)",66.67
8,"(MAGGI, TEA)",80.0
9,"(TEA, MAGGI)",57.14


##### Rules with confidence level >= 50.0%

In [11]:
from functools import reduce # Valid in Python 2.6+, required in Python 3
import operator

def interesting_rules(freq_item_sets):
    
    lifts = {}
    prob_of_items = []
  
    for row in freq_item_sets.item_sets:
        num_of_items = len(row)
        
        prob_all = freq_item_sets[freq_item_sets.item_sets == row].supp_count / 18
        for i in range(num_of_items):
            prob_of_items.append(count_item(trans_df)[count_item(trans_df).item_sets == row[i]].supp_count / 18)
        
        lifts[row] = round(float(prob_all / reduce(operator.mul, (np.array(prob_of_items)), 1)), 2)
        
        prob_of_items = []
        
    
    lifts_df = pd.DataFrame()
    lifts_df['Rules'] = lifts.keys()
    lifts_df['lift'] = lifts.values()
    
    return lifts_df
    

In [12]:
int_rules = interesting_rules(freq_item_sets)
int_rules

Unnamed: 0,Rules,lift
0,"(BREAD, SUGER)",1.0
1,"(BREAD, TEA)",0.86
2,"(COFFEE, CORNFLAKES)",1.5
3,"(COFFEE, SUGER)",1.5
4,"(MAGGI, TEA)",2.06


##### (BREAD, TEA) is the most selling combination of items where if a person buys TEA then he/she will most likely to buy BREAD with it