In [266]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import networkx as nx
import warnings
from itertools import permutations

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [267]:
class Columns:
    """Snake-case column names used for the invoice data frame."""

    INVOICE_NO = "invoice_no"
    STOCK_CODE = "stock_code"
    DESCRIPTION = "description"

class MetricType:
    """Metric names that can be stored in a rule-association option."""

    SUPPORT = "support"
    LIFT = "lift"

class RunResults(list):
    """Container for the outcome of every configured apriori run.

    Attributes:
        statistics: a `Statistics` table with one summary row per run.
        items: list of per-run result objects, in run order.

    NOTE(review): the class derives from `list`, but the code visible in this
    notebook appends to `items`, not to the list itself.
    """

    def __init__(self):
        # Initialise as an empty list. Passing `self` as the initial iterable
        # (as before) only worked because the list was still empty.
        super().__init__()
        self.statistics = Statistics()
        self.items = []
            
    
class SingleRunResults:
    """Holds the frequent itemsets and the rules produced by one run."""

    def __init__(self):
        # Both slots start empty and are assigned by the code that performs the run.
        self.frequent_itemsets, self.rules = None, None

class Statistics:
    """Tabular summary with one row per recorded run."""

    # Column order of the summary frame; `add_statistic` fills rows in this order.
    _COLUMNS = [
        'apriori_min_support',
        'apriori_max_len',
        'rule_metric',
        'rule_min_threshold',
        'frequent_datasets',
        'rules',
    ]

    def __init__(self):
        self._df = pd.DataFrame(columns=self._COLUMNS)

    def add_statistic(self, apriori_min_support, apriori_max_len,
                      rul_metric, rul_min_threshold, frequent_datasets, rules):
        """Append one summary row; values are stored in the column order above."""
        next_label = len(self._df.index)
        row = [
            apriori_min_support,
            apriori_max_len,
            rul_metric,
            rul_min_threshold,
            frequent_datasets,
            rules,
        ]
        # Assigning to a not-yet-existing label enlarges the frame by one row.
        self._df.loc[next_label] = row

    def get(self):
        """Return the underlying summary DataFrame (not a copy)."""
        return self._df
    
class AprioriRun:
    """Runs apriori and rule generation once per configured option."""

    def __init__(self, configuration, df):
        self.configuration = configuration
        self.df = df

    def run(self):
        """Execute every option in `self.configuration.options`.

        For each option, frequent itemsets are mined from `self.df` with
        `apriori`, rules are derived from them with `association_rules`, and
        a summary row (parameters plus itemset and rule counts) is recorded.

        Returns:
            RunResults: per-option results in `items`, summary in `statistics`.
        """
        results = RunResults()

        for option in self.configuration.options:
            single = SingleRunResults()

            min_support, max_len = option.min_support, option.max_len
            single.frequent_itemsets = apriori(
                self.df,
                min_support=min_support,
                max_len=max_len,
                use_colnames=True,
            )

            # The rule settings are read only after the itemsets were mined.
            rule_option = option.association_rule
            metric, min_threshold = rule_option.metric, rule_option.min_threshold
            single.rules = association_rules(
                single.frequent_itemsets,
                metric=metric,
                min_threshold=min_threshold,
            )

            results.items.append(single)
            results.statistics.add_statistic(
                min_support,
                max_len,
                metric,
                min_threshold,
                len(single.frequent_itemsets),
                len(single.rules),
            )

        return results
        
class AprioriConfiguration:
    """Ordered collection of options; `AprioriRun.run` iterates `options`."""

    def __init__(self):
        # Starts empty; callers append option objects to this list.
        self.options = list()
    
class AprioriConfigurationOption:
    """Parameters for one apriori run plus an optional rule-generation setting."""

    def __init__(self, min_support, max_len):
        # No rule setting is attached at construction; it is assigned afterwards.
        self.association_rule = None
        self.min_support, self.max_len = min_support, max_len

class RuleAssociationConfigurationOption:
    """Metric name and minimum threshold used when generating rules."""

    def __init__(self, metric, min_threshold):
        self.metric, self.min_threshold = metric, min_threshold
    
        
class Aggregate:
    """Collapses the columns of a frame that mention a word into one flag per row."""

    def __init__(self, df):
        self.df = df

    def apply(self, item):
        """Flag the rows in which any column mentioning `item` is set.

        A column is selected when `item` equals one of the space-separated
        words of its header. Both sides are lower-cased, so the match is
        case-insensitive (previously only the header was lower-cased, which
        made `apply('Bag')` silently match nothing).

        Args:
            item: the word to look for in the column headers.

        Returns:
            A boolean Series indexed like `self.df`: True where the selected
            columns sum to at least 1 in that row. If no header matches,
            every row is False.
        """
        wanted = str(item).lower()

        # Select the column headers that contain the word.
        headers = [
            column for column in self.df.columns
            if wanted in self.__word_list(str(column).lower())
        ]

        # A row belongs to the category when any selected column is set.
        return self.df[headers].sum(axis=1) >= 1.0

    def __word_list(self, value):
        """Split `value` on single spaces and return the pieces as a list."""
        return value.split(' ')
    

In [268]:
# Load the raw invoice lines and rename the columns to the snake_case names
# defined in `Columns`, so the names used by later cells have a single source.
gifts_df = (
    pd.read_csv('../datasets/market_basket/online_retail.csv')
    .rename(columns={
        'InvoiceNo': Columns.INVOICE_NO,
        'StockCode': Columns.STOCK_CODE,
        'Description': Columns.DESCRIPTION,
    })
)

gifts_df.head()

Unnamed: 0,invoice_no,stock_code,description
0,562583,35637A,IVORY STRING CURTAIN WITH POLE
1,562583,35638A,PINK AND BLACK STRING CURTAIN
2,562583,84927F,PSYCHEDELIC TILE HOOK
3,562583,22425,ENAMEL COLANDER CREAM
4,562583,16008,SMALL FOLDING SCISSOR(POINTED EDGE)


In [269]:
# Show the column dtypes and non-null counts to spot missing values.
gifts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227760 entries, 0 to 227759
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   invoice_no   227760 non-null  object
 1   stock_code   227760 non-null  object
 2   description  227404 non-null  object
dtypes: object(3)
memory usage: 5.2+ MB


In [270]:
# Strip leading and trailing whitespace from the description column
# (missing descriptions stay missing).
gifts_df[Columns.DESCRIPTION] = gifts_df[Columns.DESCRIPTION].str.strip()
gifts_df.head()

Unnamed: 0,invoice_no,stock_code,description
0,562583,35637A,IVORY STRING CURTAIN WITH POLE
1,562583,35638A,PINK AND BLACK STRING CURTAIN
2,562583,84927F,PSYCHEDELIC TILE HOOK
3,562583,22425,ENAMEL COLANDER CREAM
4,562583,16008,SMALL FOLDING SCISSOR(POINTED EDGE)


In [271]:
# Drop the rows without any invoice number, then store the invoice numbers
# as strings so the `.str` accessor can be used on them in later cells.
row_count = len(gifts_df)

# NOTE: both statements below modify `gifts_df` in place.
gifts_df.dropna(subset=[Columns.INVOICE_NO], inplace=True)
gifts_df[Columns.INVOICE_NO] = gifts_df[Columns.INVOICE_NO].astype('str')

# Report how many rows were removed.
print(f'Row count dropped from {row_count} to {len(gifts_df)}')

gifts_df.dtypes

Row count dropped from 227760 to 227760


invoice_no     object
stock_code     object
description    object
dtype: object

In [272]:
# Dropping all transactions which were done on credit:
# keep only the rows whose invoice number does not contain the letter 'C'.
# NOTE(review): presumably a 'C' in the invoice number marks such transactions
# in this dataset — confirm against the data source.
row_count = len(gifts_df)

filt = ~gifts_df[Columns.INVOICE_NO].str.contains('C')
gifts_df = gifts_df[filt]

# Report how many rows were removed by the filter.
print(f'Row count dropped from {row_count} to {len(gifts_df)}')

Row count dropped from 227760 to 224372


In [273]:
# Distinct invoice numbers and distinct descriptions; a missing description
# is counted as one distinct value.
print(f'Number of transactions is {gifts_df[Columns.INVOICE_NO].nunique(dropna=False)}')
print(f'Number of items is {gifts_df[Columns.DESCRIPTION].nunique(dropna=False)}')

Number of transactions is 8410
Number of items is 3447


In [274]:
# Recover unique InvoiceNo's (in order of first appearance); each one
# identifies a single transaction.
invoice_numbers = gifts_df[Columns.INVOICE_NO].unique()
print(f'{len(invoice_numbers)} unique invoice numbers was found')

8410 unique invoice numbers was found


In [275]:
def extract_transactions(df, invoice_col=None, description_col=None):
    """Build one basket (list of item descriptions) per invoice.

    The frame passed in is the one that is processed. The previous version
    ignored `df` and read the notebook globals `gifts_df` and
    `invoice_numbers`, and it re-scanned the whole frame once per invoice.
    This version makes a single pass over the rows, so it also no longer
    prints progress messages.

    Args:
        df: frame with one row per invoice line.
        invoice_col: name of the invoice-number column; defaults to
            `Columns.INVOICE_NO`.
        description_col: name of the description column; defaults to
            `Columns.DESCRIPTION`.

    Returns:
        A list of baskets, ordered by first appearance of the invoice number
        in `df`. Each basket lists the descriptions of that invoice in row
        order, converted with `astype(str)`.
    """
    if invoice_col is None:
        invoice_col = Columns.INVOICE_NO
    if description_col is None:
        description_col = Columns.DESCRIPTION

    # dicts keep insertion order, so baskets come out in first-seen invoice order.
    baskets = {}
    descriptions = df[description_col].astype(str)
    for invoice_no, description in zip(df[invoice_col], descriptions):
        baskets.setdefault(invoice_no, []).append(description)

    return list(baskets.values())

In [276]:
# Location of the cached item map and whether that cache file already exists.
item_map_file_path = '../datasets/outputs/market_basket/gits_item_map.csv'
is_item_map_file_exists = Path(item_map_file_path).is_file()
is_item_map_file_exists

True

In [277]:
# Load the one-hot item map from the cache file when it exists; otherwise
# build it from the invoice lines and write the cache.
item_map_df = None

if is_item_map_file_exists:
    item_map_df = pd.read_csv(item_map_file_path)
    print('item_map_df already exists and was loaded')
else:
    print('Extracting transactions can take a few minutes\n')
    transactions = extract_transactions(gifts_df)

    # Instantiate transaction encoder.
    encoder = TransactionEncoder()

    # One-hot encode transactions.
    item_map = encoder.fit(transactions).transform(transactions)

    # Use unique items as column headers. Descriptions are stringified during
    # extraction, so missing ones can show up as a 'nan' item; drop that
    # column if present instead of failing when it is absent.
    item_map_df = pd.DataFrame(item_map, columns=encoder.columns_) \
        .drop(columns='nan', errors='ignore')

    # The earlier path-less `to_csv(index=False)` call was removed: it
    # serialised the whole frame to a string that was then discarded.
    filepath = Path(item_map_file_path)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    item_map_df.to_csv(filepath)

    print('\nStored item_map_df as a file on disk')

    # The cache is written with its index, so a frame loaded from it carries
    # an extra 'Unnamed: 0' column that a later cell drops. Read the file back
    # so a fresh build yields exactly the same frame as a cached load.
    item_map_df = pd.read_csv(filepath)

# Print onehot header.
item_map_df.head()

item_map_df already exists and was loaded


Unnamed: 0.1,Unnamed: 0,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,...,wet boxes,wet pallet,wet rusty,wet?,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804
0,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,False,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [278]:
# Aggregate class examples: flag the rows that contain at least one item whose
# name includes the given word, then print the share of flagged rows.
aggregate = Aggregate(item_map_df)

bags, boxes, candles = (aggregate.apply(word) for word in ('bag', 'box', 'candle'))

print('Share of Bags: %.2f' % bags.mean())
print('Share of Boxes: %.2f' % boxes.mean())
print('Share of Candles: %.2f' % candles.mean())

Share of Bags: 0.41
Share of Boxes: 0.39
Share of Candles: 0.11


In [279]:
# Drop the index column that the CSV cache adds on load. `errors='ignore'`
# keeps the cell re-runnable and valid when the frame does not have the
# column (for example when it was built in memory rather than loaded).
item_map_df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Mine itemsets of at most 3 items that occur in at least 5% of the rows.
frequent_itemsets = apriori(item_map_df, min_support=0.05, max_len=3, use_colnames=True)
frequent_itemsets.head()

Unnamed: 0,support,itemsets
0,0.054697,(60 CAKE CASES VINTAGE CHRISTMAS)
1,0.054459,(ALARM CLOCK BAKELIKE GREEN)
2,0.050535,(ALARM CLOCK BAKELIKE RED)
3,0.069203,(ASSORTED COLOUR BIRD ORNAMENT)
4,0.053983,(BAKING SET 9 PIECE RETROSPOT)


## Configuration

In [280]:
# Build the list of runs to compare. Each option pairs the apriori parameters
# (min_support, max_len) with the metric and threshold used to generate rules.
configuration = AprioriConfiguration()

option = AprioriConfigurationOption(min_support=0.04, max_len=3)
option.association_rule = RuleAssociationConfigurationOption(metric=MetricType.SUPPORT, min_threshold=0.001)
configuration.options.append(option)

option = AprioriConfigurationOption(min_support=0.05, max_len=3)
option.association_rule = RuleAssociationConfigurationOption(metric=MetricType.SUPPORT, min_threshold=0.002)
configuration.options.append(option)

# Fixed class-name typo ("ApprioriConfigurationOption"), which raised a
# NameError on a fresh kernel.
option = AprioriConfigurationOption(min_support=0.03, max_len=2)
option.association_rule = RuleAssociationConfigurationOption(metric=MetricType.LIFT, min_threshold=1.0)
configuration.options.append(option)

## Algorithm run

In [281]:
# Run apriori and rule generation once for every configured option.
apriori_run = AprioriRun(configuration, item_map_df)
results = apriori_run.run()

# One summary row per option: the parameters used plus the number of
# frequent itemsets and rules found.
statistics = results.statistics.get()
statistics.head()

Unnamed: 0,apriori_min_support,apriori_max_len,rule_metric,rule_min_threshold,frequent_datasets,rules
0,0.04,3,support,0.001,87,6
1,0.05,3,support,0.002,50,2
2,0.03,2,lift,1.0,172,34
