In [15]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import networkx as nx
import warnings
from itertools import permutations

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [16]:
class Columns:
    """Snake_case column names used to address the retail invoice frame."""

    INVOICE_NO, STOCK_CODE, DESCRIPTION = 'invoice_no', 'stock_code', 'description'

class MetricType:
    """Names of the rule metrics passed as ``metric`` to ``association_rules``."""

    SUPPORT, CONFIDENCE, LIFT = 'support', 'confidence', 'lift'

class RunResults(list):
    """Container for the outcome of a batch of Apriori runs.

    Attributes:
        statistics: a ``Statistics`` table that receives one summary row per run.
        items: per-run result objects, in the order they were appended.

    The class derives from ``list``, but the code in this notebook stores the
    per-run results in ``items`` rather than in the list itself.
    """

    def __init__(self):
        # list.__init__ accepts an optional iterable to copy from; handing it
        # the (still empty) instance itself did nothing useful, so call it bare.
        super().__init__()
        self.statistics = Statistics()
        self.items = []
            
    
class SingleRunResults:
    """Result slots for one apriori run; both start empty (``None``)."""

    def __init__(self):
        # Filled in by the runner: the frequent itemsets first, then the rules.
        self.frequent_itemsets = self.rules = None

class StatisticRecord:
    """One summary row describing a single apriori / association-rules run.

    Every numeric field starts at ``-1`` and every label at ``''`` until the
    runner overwrites it.
    """

    def __init__(self):
        # Settings the run was started with.
        self.apriori_min_support = self.apriori_max_len = -1
        self.rule_metric = ''
        self.rule_min_threshold = -1
        # Values measured on the produced rules.
        self.max_antecedent_support = self.min_consequent_support = -1
        self.zhang = -1
        # Counts of frequent itemsets and of rules.
        self.frequent_datasets = self.rules = -1
        # Id of the option that produced this row.
        self.configuration_id = ''
        
class Statistics:
    """Accumulates one summary row per run in a pandas DataFrame."""

    # Column order must match the value order built in add_statistic.
    _column_names = ['apr_min_support', 'apr_max_len', 'rule_metric',
                     'rule_min_threshold', 'max_support_A',
                     'min_support_C', 'max_zhang', 'freq_ds', 'rules', 'configuration_id']

    def __init__(self):
        # 'configuration_id' stays an ordinary column: add_statistic writes one
        # value per name in _column_names. (The former set_index(...) call
        # discarded its result, so it never changed the frame.)
        self._df = pd.DataFrame(columns=Statistics._column_names)

    def add_statistic(self, record):
        """Append ``record`` as a new row.

        Args:
            record: object exposing the attributes of ``StatisticRecord``.

        Failures are reported on stdout instead of being raised, so one bad
        record does not abort a batch of runs.
        """
        # Bound before the try block so the error message below can always be
        # formatted, even when reading an attribute of ``record`` fails.
        values = None
        try:
            values = [record.apriori_min_support, record.apriori_max_len, record.rule_metric,
                      record.rule_min_threshold, record.max_antecedent_support,
                      record.min_consequent_support, record.zhang, record.frequent_datasets,
                      record.rules, record.configuration_id]

            # The next integer label equals the current row count.
            self._df.loc[len(self._df.index)] = values
        except Exception as ex:
            print(f'Unable to add a statistic {values}')
            print(f'Error args are {ex.args}\n')

    def get(self):
        """Return the underlying DataFrame (the live object, not a copy)."""
        return self._df
    
class Apriori:
    """Runs ``apriori`` and ``association_rules`` once per configured option.

    Attributes:
        configuration: object whose ``options`` attribute is iterated; each
            option supplies ``min_support``, ``max_len``, ``association_rule``,
            ``filter`` and ``id``.
        df: frame handed unchanged to ``apriori``.
    """

    def __init__(self, configuration, df):
        self.df = df
        self.configuration = configuration

    def zhangs_rule(self, rules):
        """Compute Zhang's metric for each row of ``rules``.

        Reads the 'support', 'antecedent support' and 'consequent support'
        columns and returns ``(P(AB) - P(A)P(B)) / max(P(AB)(1 - P(A)),
        P(A)(P(B) - P(AB)))`` as a Series aligned with ``rules``.
        """
        joint = rules['support']
        antecedent = rules['antecedent support']
        consequent = rules['consequent support']

        numerator = joint - antecedent * consequent
        # Element-wise larger of the two candidate normalisers.
        first_bound = (joint * (1 - antecedent)).to_numpy()
        second_bound = (antecedent * (consequent - joint)).to_numpy()
        return numerator / np.maximum(first_bound, second_bound)

    @staticmethod
    def _apply_filter(rules, rule_filter):
        """Narrow ``rules`` by every threshold of ``rule_filter`` that is set."""
        if rule_filter is None:
            return rules

        if rule_filter.antecedent_support is not None:
            rules = rules[rules['antecedent support'] > rule_filter.antecedent_support]

        if rule_filter.consequent_support is not None:
            rules = rules[rules['consequent support'] < rule_filter.consequent_support]

        if rule_filter.zhang is not None:
            rules = rules[rules['zhang'] > rule_filter.zhang]

        return rules

    def run(self):
        """Execute every option and collect rules plus one statistics row each.

        Returns:
            A ``RunResults`` whose ``items`` holds one ``SingleRunResults`` per
            option (in option order) and whose ``statistics`` received one
            record per option.
        """
        outcome = RunResults()

        for option in self.configuration.options:
            single_run = SingleRunResults()

            min_support, max_len = option.min_support, option.max_len
            single_run.frequent_itemsets = apriori(
                self.df,
                min_support=min_support,
                max_len=max_len,
                use_colnames=True,
            )

            metric = option.association_rule.metric
            min_threshold = option.association_rule.min_threshold
            rules = association_rules(
                single_run.frequent_itemsets,
                metric=metric,
                min_threshold=min_threshold,
            )

            # These two extremes are taken before any filtering is applied.
            widest_antecedent = rules['antecedent support'].max()
            narrowest_consequent = rules['consequent support'].min()
            rules['zhang'] = self.zhangs_rule(rules)

            rules = self._apply_filter(rules, option.filter)

            single_run.rules = rules
            outcome.items.append(single_run)

            record = StatisticRecord()
            record.apriori_min_support = min_support
            record.apriori_max_len = max_len
            record.rule_metric = metric
            record.rule_min_threshold = min_threshold
            record.max_antecedent_support = widest_antecedent
            record.min_consequent_support = narrowest_consequent
            # Unlike the two extremes above, this maximum is taken after filtering.
            record.zhang = rules['zhang'].max()
            record.frequent_datasets = len(single_run.frequent_itemsets)
            record.rules = len(single_run.rules)
            record.configuration_id = option.id

            outcome.statistics.add_statistic(record)

        return outcome
        
class AprioriConfiguration:
    """Ordered collection of run options."""

    def __init__(self):
        self.options = []

    def get_option(self, option_id):
        """Return the first option whose ``id`` equals ``option_id``, else ``None``."""
        matches = (candidate for candidate in self.options if candidate.id == option_id)
        return next(matches, None)
    
class AprioriOption:
    """Settings for one apriori run plus its rule-generation and filter options.

    Attributes:
        min_support: passed to ``apriori`` as ``min_support``.
        max_len: passed to ``apriori`` as ``max_len``.
        association_rule: object with ``metric`` and ``min_threshold``, or ``None``.
        filter: object with ``antecedent_support``, ``consequent_support`` and
            ``zhang`` thresholds, or ``None``.
        id: number drawn from a class-wide counter when the option is created.
    """

    # Shared by all instances; it is never reset, so ids keep increasing for
    # as long as the class object lives (e.g. across notebook cell re-runs).
    _id_counter = 0

    def __init__(self, min_support, max_len):
        self.min_support = min_support
        self.max_len = max_len
        self.association_rule = None
        self.filter = None
        self.id = AprioriOption.next_id()

    @staticmethod
    def next_id():
        """Advance the class-wide counter and return the new value.

        Declared static so it can be called on the class or on an instance;
        without the decorator an instance call failed with a TypeError.
        """
        AprioriOption._id_counter += 1
        return AprioriOption._id_counter

    def info(self):
        """Print this option's settings, including rule and filter settings when set."""
        print("Configuration Info:\n")
        print(f"configuration_id = {self.id}")
        print(f"min_support = {self.min_support}")
        print(f"max_len = {self.max_len}")

        if self.association_rule is not None:
            print(f"association_rule_metric = {self.association_rule.metric}")
            print(f"association_rule_min_threshold = {self.association_rule.min_threshold}")

        if self.filter is not None:
            print(f"filter_antecedent_support = {self.filter.antecedent_support}")
            print(f"filter_consequent_support = {self.filter.consequent_support}")
            print(f"filter_zhang = {self.filter.zhang}")
    
class FilterOption:
    """Optional thresholds applied to generated rules; ``None`` means not applied.

    The runner keeps rules with antecedent support above
    ``antecedent_support``, consequent support below ``consequent_support``
    and zhang above ``zhang``.
    """

    def __init__(self):
        self.antecedent_support = self.consequent_support = self.zhang = None
        
class RAOption:
    """Rule-generation settings: which metric to threshold and the minimum value."""

    def __init__(self, metric, min_threshold):
        self.metric, self.min_threshold = metric, min_threshold
    
        
class Aggregate:
    """Groups one-hot item columns into a category by a word in the column name."""

    def __init__(self, df):
        self.df = df

    def apply(self, item):
        """Flag the rows that contain at least one product of category ``item``.

        A column belongs to the category when ``item`` equals one of the
        space-separated words of its name. The comparison is case-insensitive:
        column names are lower-cased, and so is ``item`` (previously an
        upper-case ``item`` silently matched nothing).

        Args:
            item: the category word, e.g. ``'bag'``.

        Returns:
            Boolean Series, one entry per row of ``self.df``; all ``False``
            when no column matches.
        """
        wanted = str(item).lower()

        # Column headers whose name contains the wanted word.
        headers = [
            column for column in self.df.columns
            if wanted in self.__word_list(str(column).lower())
        ]

        # A row is in the category when any selected column is set.
        return self.df[headers].sum(axis=1) >= 1.0

    def __word_list(self, value):
        """Split ``value`` on single spaces."""
        return list(value.split(' '))
    

In [17]:
# Load the raw invoice lines and rename the columns to the snake_case names
# held in Columns, so the mapping and the constants cannot drift apart.
gifts_df = pd.read_csv('../../datasets/market_basket/online_retail.csv').rename(
    columns={
        'InvoiceNo': Columns.INVOICE_NO,
        'StockCode': Columns.STOCK_CODE,
        'Description': Columns.DESCRIPTION,
    }
)

gifts_df.head()

Unnamed: 0,invoice_no,stock_code,description
0,562583,35637A,IVORY STRING CURTAIN WITH POLE
1,562583,35638A,PINK AND BLACK STRING CURTAIN
2,562583,84927F,PSYCHEDELIC TILE HOOK
3,562583,22425,ENAMEL COLANDER CREAM
4,562583,16008,SMALL FOLDING SCISSOR(POINTED EDGE)


In [18]:
# Inspect column dtypes and non-null counts of the loaded frame.
gifts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227760 entries, 0 to 227759
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   invoice_no   227760 non-null  object
 1   stock_code   227760 non-null  object
 2   description  227404 non-null  object
dtypes: object(3)
memory usage: 5.2+ MB


In [19]:
# Remove leading and trailing whitespace from the description column
# (str.strip() with no argument strips whitespace only).
gifts_df[Columns.DESCRIPTION] = gifts_df[Columns.DESCRIPTION].str.strip()
gifts_df.head()

Unnamed: 0,invoice_no,stock_code,description
0,562583,35637A,IVORY STRING CURTAIN WITH POLE
1,562583,35638A,PINK AND BLACK STRING CURTAIN
2,562583,84927F,PSYCHEDELIC TILE HOOK
3,562583,22425,ENAMEL COLANDER CREAM
4,562583,16008,SMALL FOLDING SCISSOR(POINTED EDGE)


In [20]:
# Drop the rows that have no invoice number, then cast the column to str.
row_count = len(gifts_df)

gifts_df.dropna(subset=[Columns.INVOICE_NO], inplace=True)
gifts_df[Columns.INVOICE_NO] = gifts_df[Columns.INVOICE_NO].astype('str')

# Report how many rows the dropna removed.
print(f'Row count dropped from {row_count} to {len(gifts_df)}')

gifts_df.dtypes

Row count dropped from 227760 to 227760


invoice_no     object
stock_code     object
description    object
dtype: object

In [21]:
# Drop every row whose invoice number contains the letter 'C' (matched
# anywhere in the string, not only as a prefix).
# NOTE(review): this was described as removing transactions done on credit;
# in the UCI Online Retail data a leading 'C' presumably marks a cancelled
# invoice - confirm which meaning is intended.
row_count = len(gifts_df)

filt = ~gifts_df[Columns.INVOICE_NO].str.contains('C')
gifts_df = gifts_df[filt]

print(f'Row count dropped from {row_count} to {len(gifts_df)}')

Row count dropped from 227760 to 224372


In [22]:
# Headline counts: distinct invoice numbers and distinct descriptions.
# dropna=False keeps a missing description counted as one value, exactly as
# len(series.unique()) does.
n_transactions = gifts_df[Columns.INVOICE_NO].nunique(dropna=False)
n_items = gifts_df[Columns.DESCRIPTION].nunique(dropna=False)

print(f'Number of transactions is {n_transactions}')
print(f'Number of items is {n_items}')

Number of transactions is 8410
Number of items is 3447


In [23]:
# Collect the distinct invoice numbers; extract_transactions iterates over
# this module-level array.
invoice_numbers = gifts_df[Columns.INVOICE_NO].unique()
print(f'{len(invoice_numbers)} unique invoice numbers was found')

8410 unique invoice numbers was found


In [24]:
def extract_transactions(df, invoice_col=None, description_col=None):
    """Build one basket (list of item descriptions) per invoice in ``df``.

    The frame passed in is the one that is used; earlier the argument was
    ignored in favour of the notebook globals ``gifts_df`` and
    ``invoice_numbers``. All rows are grouped in a single pass instead of
    re-scanning the whole frame once per invoice, so no progress output is
    printed any more.

    Args:
        df: frame with one row per invoice line.
        invoice_col: name of the invoice-number column; defaults to
            ``Columns.INVOICE_NO``.
        description_col: name of the item-description column; defaults to
            ``Columns.DESCRIPTION``.

    Returns:
        List of baskets. Baskets follow the order in which invoice numbers
        first appear in ``df``; items inside a basket keep row order and are
        converted to ``str``.
    """
    if invoice_col is None:
        invoice_col = Columns.INVOICE_NO
    if description_col is None:
        description_col = Columns.DESCRIPTION

    descriptions = df[description_col].astype(str)
    # sort=False keeps groups in order of first appearance, matching the
    # order produced by Series.unique().
    baskets = descriptions.groupby(df[invoice_col], sort=False)
    return [list(items) for _, items in baskets]

In [25]:
# Location of the cached one-hot item map, and whether it is already on disk.
item_map_file_path = '../../datasets/outputs/market_basket/gits_item_map.csv'
is_item_map_file_exists = Path(item_map_file_path).is_file()
is_item_map_file_exists

True

In [26]:
# Load the cached one-hot item map when it exists; otherwise build it from the
# invoice lines and cache it for the next run.
if is_item_map_file_exists:
    item_map_df = pd.read_csv(item_map_file_path)
    print('item_map_df already exists and was loaded')
else:
    print('Extracting transactions can take a few minutes\n')
    transactions = extract_transactions(gifts_df)

    # One-hot encode the baskets: one row per transaction, one column per item.
    encoder = TransactionEncoder()
    item_map = encoder.fit(transactions).transform(transactions)

    # Missing descriptions were cast to the string 'nan' while building the
    # baskets; drop that pseudo-item, tolerating data where it never occurs.
    item_map_df = pd.DataFrame(item_map, columns=encoder.columns_).drop(
        columns='nan', errors='ignore')

    # (A stray to_csv(index=False) call used to sit here; it rendered the whole
    # frame to a string that was thrown away, so it has been removed.)
    filepath = Path(item_map_file_path)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    item_map_df.to_csv(filepath)

    print('\nStored item_map_df as a file on disk')

    # Read the file back so both branches hand on the same layout: the CSV
    # round-trip adds the row index as an 'Unnamed: 0' column, which a later
    # cell removes.
    item_map_df = pd.read_csv(filepath)

# Print onehot header.
item_map_df.head()

item_map_df already exists and was loaded


Unnamed: 0.1,Unnamed: 0,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,...,wet boxes,wet pallet,wet rusty,wet?,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804
0,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,False,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [27]:
# Aggregate class examples: for three item words, flag the transactions that
# contain at least one product whose name includes that word, then report the
# fraction of flagged transactions.
aggregate = Aggregate(item_map_df)

bags = aggregate.apply('bag')
boxes = aggregate.apply('box')
candles = aggregate.apply('candle')

# The mean of a boolean Series is the share of True rows.
print('Share of Bags: %.2f' % bags.mean())
print('Share of Boxes: %.2f' % boxes.mean())
print('Share of Candles: %.2f' % candles.mean())

Share of Bags: 0.41
Share of Boxes: 0.39
Share of Candles: 0.11


In [28]:
# Remove the row-index column that a CSV round-trip leaves behind.
# errors='ignore' keeps the cell re-runnable and lets it accept a frame that
# never had the column, instead of raising KeyError.
item_map_df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Quick look at the itemsets found with a single fixed setting.
frequent_itemsets = apriori(item_map_df, min_support=0.05, max_len=3, use_colnames=True)
frequent_itemsets.head()

Unnamed: 0,support,itemsets
0,0.054697,(60 CAKE CASES VINTAGE CHRISTMAS)
1,0.054459,(ALARM CLOCK BAKELIKE GREEN)
2,0.050535,(ALARM CLOCK BAKELIKE RED)
3,0.069203,(ASSORTED COLOUR BIRD ORNAMENT)
4,0.053983,(BAKING SET 9 PIECE RETROSPOT)


## Configuration

In [29]:
# Build several apriori / association-rule configurations so that the Apriori
# runner can be exercised with different settings.

configuration = AprioriConfiguration()

# First two options: rules thresholded on support, no post-filter.
option = AprioriOption(min_support=0.04, max_len=3)
option.association_rule = RAOption(metric=MetricType.SUPPORT, min_threshold=0.001)
configuration.options.append(option)

option = AprioriOption(min_support=0.05, max_len=3)
option.association_rule = RAOption(metric=MetricType.SUPPORT, min_threshold=0.002)
configuration.options.append(option)

# Third option: rules thresholded on lift, no post-filter.
option = AprioriOption(min_support=0.03, max_len=2)
option.association_rule = RAOption(metric=MetricType.LIFT, min_threshold=1.0)
configuration.options.append(option)

# Fourth and fifth options share one confidence-based RAOption object; only
# the fifth also gets the antecedent/consequent support filter.
filter_option = FilterOption()
filter_option.antecedent_support = 0.05
filter_option.consequent_support = 0.05
ar_option = RAOption(metric=MetricType.CONFIDENCE, min_threshold=0.4)

option = AprioriOption(min_support=0.03, max_len=2)
option.association_rule = ar_option
configuration.options.append(option)

option = AprioriOption(min_support=0.03, max_len=2)
option.association_rule = ar_option
option.filter = filter_option
configuration.options.append(option)

# Configuration intended as the best run: lift as the rule metric, plus a
# filter on zhang and on antecedent/consequent support.
filter_option = FilterOption()
filter_option.zhang = 0.95
filter_option.antecedent_support = 0.05
filter_option.consequent_support = 0.06
ar_option = RAOption(metric=MetricType.LIFT, min_threshold=1.00)

option = AprioriOption(min_support=0.03, max_len=2)
option.association_rule = ar_option
option.filter = filter_option
configuration.options.append(option)

## Algorithm run

In [30]:
# Run every configured option against the one-hot item map.
apriori_alg = Apriori(configuration, item_map_df)
results = apriori_alg.run()

# Per-run summary rows, ordered by the number of rules (ascending).
statistics = results.statistics.get().sort_values(by=['rules'])
statistics

Unnamed: 0,apr_min_support,apr_max_len,rule_metric,rule_min_threshold,max_support_A,min_support_C,max_zhang,freq_ds,rules,configuration_id
4,0.03,2,confidence,0.4,0.106421,0.049227,0.958912,172,1,5
1,0.05,3,support,0.002,0.106421,0.075386,0.945517,50,2,2
0,0.04,3,support,0.001,0.106421,0.052556,0.985016,87,6,1
5,0.03,2,lift,1.0,0.106421,0.048633,0.985016,172,8,6
3,0.03,2,confidence,0.4,0.106421,0.049227,0.985016,172,30,4
2,0.03,2,lift,1.0,0.106421,0.048633,0.985016,172,34,3


In [31]:
# Keep only the lift-based runs: support and confidence are treated here as
# weaker metrics than lift, so their results are discarded.
lift_statistics = statistics[statistics['rule_metric'] == MetricType.LIFT]
lift_statistics

Unnamed: 0,apr_min_support,apr_max_len,rule_metric,rule_min_threshold,max_support_A,min_support_C,max_zhang,freq_ds,rules,configuration_id
5,0.03,2,lift,1.0,0.106421,0.048633,0.985016,172,8,6
2,0.03,2,lift,1.0,0.106421,0.048633,0.985016,172,34,3


In [32]:
# Select the best lift-based run: the first row, which is the run with the
# fewest rules because the statistics were sorted ascending by 'rules'.
best_result = lift_statistics.head(1)
best_result


Unnamed: 0,apr_min_support,apr_max_len,rule_metric,rule_min_threshold,max_support_A,min_support_C,max_zhang,freq_ds,rules,configuration_id
5,0.03,2,lift,1.0,0.106421,0.048633,0.985016,172,8,6


In [33]:
# Show the rule set of the best run.
# best_result_index holds a configuration id, not a list position. Ids come
# from a class-wide counter that keeps growing when the configuration cell is
# re-run, so "id - 1" is not a reliable index. Apriori.run appends one result
# per option in configuration.options order, so locate the option's position.
best_result_index = best_result['configuration_id'].values[0]
option_ids = [opt.id for opt in configuration.options]
results.items[option_ids.index(best_result_index)].rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhang
0,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.050535,0.054459,0.036504,0.722353,13.264166,0.033752,3.40555,0.973821
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.054459,0.050535,0.036504,0.670306,13.264166,0.033752,2.879834,0.977862
7,(GARDENERS KNEELING PAD KEEP CALM),(GARDENERS KNEELING PAD CUP OF TEA),0.068609,0.057194,0.040071,0.584055,10.211864,0.036147,2.266663,0.968524
8,(HAND WARMER OWL DESIGN),(HAND WARMER BIRD DESIGN),0.06302,0.049227,0.030559,0.484906,9.850378,0.027457,1.845823,0.958912
11,(HAND WARMER OWL DESIGN),(HAND WARMER SCOTTY DOG DESIGN),0.06302,0.053864,0.033294,0.528302,9.807989,0.029899,2.005807,0.958444
14,(JUMBO BAG 50'S CHRISTMAS),(JUMBO BAG VINTAGE CHRISTMAS),0.073365,0.051486,0.035077,0.47812,9.286348,0.0313,1.817494,0.962963
32,(WOODEN STAR CHRISTMAS SCANDINAVIAN),(WOODEN HEART CHRISTMAS SCANDINAVIAN),0.052556,0.052556,0.041379,0.78733,14.980652,0.038617,4.455,0.985016
33,(WOODEN HEART CHRISTMAS SCANDINAVIAN),(WOODEN STAR CHRISTMAS SCANDINAVIAN),0.052556,0.052556,0.041379,0.78733,14.980652,0.038617,4.455,0.985016


In [34]:
# Look up the option that produced the best run by its configuration id and
# print its settings.
best_configuration = configuration.get_option(best_result_index)
best_configuration.info()

Configuration Info:

configuration_id = 6
min_support = 0.03
max_len = 2
association_rule_metric = lift
association_rule_min_threshold = 1.0
filter_antecedent_support = 0.05
filter_consequent_support = 0.06
filter_zhang = 0.95
