In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import networkx as nx
import warnings
from itertools import permutations

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [3]:
class Columns:
    INVOICE_NO = 'invoice_no'
    STOCK_CODE = 'stock_code'
    DESCRIPTION = 'description'

class MetricType:
    SUPPORT = 'support'
    CONFIDENCE = 'confidence'
    LIFT = 'lift'

class RunResults(list):
    
    def __init__(self):
        super().__init__(self)
        self.statistics = Statistics()
        self.items = []
            
    
class SingleRunResults:
    
    def __init__(self):
        self.frequent_itemsets = None
        self.rules = None

class StatisticRecord:
    
    def __init__(self):
        self.apriori_min_support = -1
        self.apriori_max_len = -1
        self.rule_metric = ''
        self.rule_min_threshold = -1
        self.max_antecedent_support  = -1
        self.min_consequent_support = -1
        self.zhang = -1
        self.frequent_datasets = -1
        self.rules = -1
        self.configuration_id = ''
        
class Statistics:
    
    _column_names = ['apr_min_support', 'apr_max_len', 'rule_metric', \
                     'rule_min_threshold', 'max_support_A', \
                     'min_support_C', 'max_zhang', 'freq_ds', 'rules', 'configuration_id']
    
    def __init__(self):
        
        self._df = pd.DataFrame(columns=Statistics._column_names)
        self._df.set_index('configuration_id')
    
    def add_statistic(self, record):
        
        try:
            values = [record.apriori_min_support, record.apriori_max_len, record.rule_metric, \
                  record.rule_min_threshold, record.max_antecedent_support, \
                  record.min_consequent_support, record.zhang, record.frequent_datasets, \
                  record.rules, record.configuration_id]
        
            self._df.loc[len(self._df.index)] = values
        except Exception as ex:
            print(f'Unable to add a statistic {values}')
            print(f'Error args are {ex.args}\n')
    
    def get(self):
        return self._df
    
class Apriori:
    
    def __init__(self, configuration, df):
        self.configuration = configuration
        self.df = df
       
    def zhangs_rule(self, rules):
        PAB = rules['support'].copy()
        PA = rules['antecedent support'].copy()
        PB = rules['consequent support'].copy()
        NUMERATOR = PAB - PA*PB
        DENOMINATOR = np.max((PAB*(1-PA).values,PA*(PB-PAB).values), axis = 0)
        return NUMERATOR / DENOMINATOR
    
    def run(self):
        results = RunResults() 
        
        for option in self.configuration.options:
            run_results = SingleRunResults()
            
            apriori_min_support = option.min_support
            apriori_max_len = option.max_len
            
            run_results.frequent_itemsets = apriori(self.df, \
                min_support = apriori_min_support, \
                max_len = apriori_max_len, use_colnames = True)
            
            rule_metric = option.association_rule.metric
            rule_min_threshold = option.association_rule.min_threshold
            
            rules =  association_rules( \
                run_results.frequent_itemsets, \
                metric = rule_metric, \
                min_threshold = rule_min_threshold)
        
            max_antecedent_support = rules['antecedent support'].max()
            min_consequent_support = rules['consequent support'].min()
            rules['zhang'] = self.zhangs_rule(rules)
        
            if not (option.filter is None):
                
                if not (option.filter.antecedent_support is None):
                    rules = rules[rules['antecedent support'] > option.filter.antecedent_support]
                    
                if not (option.filter.consequent_support is None):
                    rules = rules[rules['consequent support'] <  option.filter.consequent_support]
                
                if not (option.filter.zhang is None):
                    rules = rules[rules['zhang'] > option.filter.zhang]
                
            run_results.rules = rules
            results.items.append(run_results)
            
            statistic = StatisticRecord()
            statistic.apriori_min_support = apriori_min_support
            statistic.apriori_max_len = apriori_max_len
            statistic.rule_metric = rule_metric
            statistic.rule_min_threshold = rule_min_threshold
            statistic.max_antecedent_support  = max_antecedent_support
            statistic.min_consequent_support = min_consequent_support
            statistic.zhang = rules['zhang'].max()
            statistic.frequent_datasets = len(run_results.frequent_itemsets)
            statistic.rules = len(run_results.rules)
            statistic.configuration_id = option.id
            
            results.statistics.add_statistic(statistic)
            
        return results
        
class AprioriConfiguration:
    
    def __init__(self):
        self.options = []
        
    def get_option(self, option_id):
        
        for o in self.options:
            if o.id == option_id:
                return o
        return None
    
class AprioriOption:
    _id_counter = 0
    
    def __init__(self, min_support, max_len):
        self.min_support = min_support
        self.max_len = max_len
        self.association_rule = None
        self.filter = None
        self.id = AprioriOption.next_id()
    
    def next_id():
        next_id = AprioriOption._id_counter + 1
        AprioriOption._id_counter = next_id
        return next_id
    
    def info(self):
        print("Configuration Info\n")
        print(f"id = {self.id}\n")
        print(f"min_support = {self.min_support}\n")
        print(f"max_len = {self.max_len}\n")
        
        if(not (self.association_rule is None)):
            print(f"association_rule_metric = {self.association_rule.metric}\n")
            print(f"association_rule_min_threshold = {self.association_rule.min_threshold}\n")
        
         if(not (self.filter is None)):
            print(f"filter_antecedent_support = {self.filter.antecedent_support}\n")
            print(f"filter_consequent_support = {self.filter.consequent_support}\n")
            print(f"filter_zhang = {self.filter.zhang}\n")
    
class FilterOption:
    
    def __init__(self):
        self.antecedent_support = None
        self.consequent_support = None
        self.zhang = None
        
class RAOption:
    
    def __init__(self, metric, min_threshold):
        self.metric = metric
        self.min_threshold = min_threshold
    
        
class Aggregate:
    
    def __init__(self, df):
        self.df = df
    
    #Select the column headers for sign items
    def apply(self, item):
        headers = []
        
        for i in self.df.columns:
            word_list = self.__word_list(str(i).lower())
            if item in word_list:
                headers.append(i)
            
            
        # Select columns for this items
        item_columns = self.df[headers]
            
        # Return category of aggregated items
        return item_columns.sum(axis = 1) >= 1.0
            
    def __word_list(self, value):
        splited = list(value.split(' '))
        return splited
    

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 172)

In [None]:
gifts_df = pd.read_csv('../datasets/market_basket/online_retail.csv')

gifts_df.rename(columns={ \
    'InvoiceNo' : 'invoice_no', \
    'StockCode' : 'stock_code', \
    'Description' : 'description' \
    }, inplace=True)

gifts_df.head()

In [None]:
gifts_df.info()

In [None]:
#Remove leading and trailing characters in the Description column
gifts_df[Columns.DESCRIPTION] = gifts_df[Columns.DESCRIPTION].str.strip()
gifts_df.head()

In [None]:
#Dropping the rows without any invoice number
row_count = len(gifts_df)

gifts_df.dropna(subset=[Columns.INVOICE_NO], inplace=True)
gifts_df[Columns.INVOICE_NO] = gifts_df[Columns.INVOICE_NO].astype('str')

print(f'Row count dropped from {row_count} to {len(gifts_df)}')

gifts_df.dtypes

In [None]:
# Dropping all transactions which were done on credit
row_count = len(gifts_df)

filt = ~gifts_df[Columns.INVOICE_NO].str.contains('C')
gifts_df = gifts_df[filt]

print(f'Row count dropped from {row_count} to {len(gifts_df)}')

In [None]:
print(f'Number of transactions is {len(gifts_df[Columns.INVOICE_NO].unique())}')
print(f'Number of items is {len(gifts_df[Columns.DESCRIPTION].unique())}')

In [None]:
# Recover unique InvoiceNo's.
invoice_numbers = gifts_df[Columns.INVOICE_NO].unique()
print(f'{len(invoice_numbers)} unique invoice numbers was found')

In [None]:
def extract_transactions(df):
    #Create a basket of items for each transaction
    transactions = []

    i = 1
    for invoice_no in invoice_numbers:
        if i % 500 == 0:
            print(f'{i} invoice numbers were processed')
        
        filt = gifts_df[Columns.INVOICE_NO] == invoice_no
        transaction = list(gifts_df[filt].description.astype(str))
        transactions.append(transaction)
        i = i + 1
    return transactions

In [None]:
item_map_file_path = '../datasets/outputs/market_basket/gits_item_map.csv'
is_item_map_file_exists = os.path.isfile(item_map_file_path)
is_item_map_file_exists

In [None]:
item_map_df = None

if is_item_map_file_exists:
    item_map_df = pd.read_csv(item_map_file_path)
    print('item_map_df already exists and was loaded')
else:
    print('Extracting transactions can take a few minutes\n')
    transactions = extract_transactions(gifts_df)
    
    # Instantiate transaction encoder.
    encoder = TransactionEncoder()

    # One-hot encode transactions.
    item_map = encoder.fit(transactions).transform(transactions)

    # Use unique items as column headers.
    item_map_df = pd.DataFrame(item_map, columns = encoder.columns_).drop('nan', axis=1)

    item_map_df.to_csv(index=False)

    filepath = Path(item_map_file_path)  
    filepath.parent.mkdir(parents=True, exist_ok=True)
    item_map_df.to_csv(filepath)

    print('\nStored item_map_df as a file on disk')
        
# Print onehot header.
item_map_df.head()

In [None]:
#Aggregate class examples
aggregate = Aggregate(item_map_df)

bags = aggregate.apply('bag')
boxes = aggregate.apply('box')
candles = aggregate.apply('candle')

print('Share of Bags: %.2f' % bags.mean())
print('Share of Boxes: %.2f' % boxes.mean())
print('Share of Candles: %.2f' % candles.mean())

In [None]:
item_map_df.drop(columns=['Unnamed: 0'], inplace=True)
item_map_df.head()

frequent_itemsets = apriori(item_map_df, min_support=0.05, max_len = 3, use_colnames = True)
frequent_itemsets.head()

## Configuration

In [None]:
#Setting several apriori and association rules configurations. 
#In a such way, apriori running engine can be tested

configuration = AprioriConfiguration()

option = AprioriOption(min_support=0.04, max_len=3)
option.association_rule = RAOption(metric=MetricType.SUPPORT, min_threshold=0.001)
configuration.options.append(option)

option = AprioriOption(min_support=0.05, max_len=3)
option.association_rule = RAOption(metric=MetricType.SUPPORT, min_threshold=0.002)
configuration.options.append(option)

option = AprioriOption(min_support=0.03, max_len=2)
option.association_rule = RAOption(metric=MetricType.LIFT, min_threshold=1.0)
configuration.options.append(option)

filter_option = FilterOption()
filter_option.antecedent_support = 0.05
filter_option.consequent_support = 0.05
ar_option = RAOption(metric=MetricType.CONFIDENCE, min_threshold=0.4)

option = AprioriOption(min_support=0.03, max_len=2)
option.association_rule = ar_option
configuration.options.append(option)

option = AprioriOption(min_support=0.03, max_len=2)
option.association_rule = ar_option
option.filter = filter_option
configuration.options.append(option)

#Setting a configuration for the best run
#Including filtering and a lift metric as a initial metric
filter_option = FilterOption()
filter_option.zhang = 0.95
filter_option.antecedent_support = 0.05
filter_option.consequent_support = 0.06
ar_option = RAOption(metric=MetricType.LIFT, min_threshold=1.00)

option = AprioriOption(min_support=0.03, max_len=2)
option.association_rule = ar_option
option.filter = filter_option
configuration.options.append(option)

## Algorithm run

In [None]:
apriori_alg = Apriori(configuration, item_map_df)
results = apriori_alg.run()

statistics = results.statistics.get().sort_values(by=['rules'])
statistics

In [None]:
#Since both support and confidence metrics are less strong than a lift we will discard theirs results
lift_statistics = statistics[statistics['rule_metric'] == MetricType.LIFT]
lift_statistics

In [None]:
#Selecting best result from the lift results
best_result = lift_statistics.head(1)
best_result


In [None]:
#Printing the best result rule set
best_result_index = best_result.index.values[0]
results.items[best_result_index].rules

In [None]:
best_configuration = configuration.get_option(best_result_index)
best_configuration.info()