# Association Rule (Apriori) and Sequential Rule (SPMF RuleGrowth) Mining on Sales Transactions

In [1]:
import pandas as pd
from apyori import apriori

In [2]:
# Read the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='data/D1.csv')

In [3]:
# Preview the first five rows of the loaded frame
data.head(n=5)

Unnamed: 0,Date,Customer_ID,Sales_ID,SKU_Category,SKU,Quantity,Sales_Amount
0,2/01/2016,2547,1,X52,0EM7L,1.0,3.13
1,2/01/2016,822,2,2ML,68BRQ,1.0,5.46
2,2/01/2016,3686,3,0H2,CZUZX,1.0,6.35
3,2/01/2016,3719,4,0H2,549KK,1.0,5.59
4,2/01/2016,9200,5,0H2,K8EHH,1.0,6.88


In [4]:
# Check for missing data: info() reports each column's non-null count and dtype
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131706 entries, 0 to 131705
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Date          131706 non-null  object 
 1   Customer_ID   131706 non-null  int64  
 2   Sales_ID      131706 non-null  int64  
 3   SKU_Category  131706 non-null  object 
 4   SKU           131706 non-null  object 
 5   Quantity      131706 non-null  float64
 6   Sales_Amount  131706 non-null  float64
dtypes: float64(2), int64(2), object(3)
memory usage: 7.0+ MB


In [5]:
# Parse the day/month/year strings in Date into datetime values
data['Date'] = pd.to_datetime(
    data.Date,
    format='%d/%m/%Y',
)

In [6]:
# Treat the identifier columns as labels rather than numbers: cast both to str
for id_column in ('Sales_ID', 'Customer_ID'):
    data[id_column] = data[id_column].astype(str)

In [7]:
# Build one basket per sale: the list of SKU categories that share a Sales_ID
transactions = (
    data
    .groupby('Sales_ID')['SKU_Category']
    .apply(list)
)
# Plain list-of-lists form that apriori consumes
transactions_list = transactions.tolist()

In [8]:
# Count how many rows fall in each SKU category
data['SKU_Category'].value_counts()

SKU_Category
N8U    10913
R6E     5099
LPF     5062
P42     4836
U5F     4570
       ...  
M8H        3
U3N        2
QON        1
2JO        1
OTK        1
Name: count, Length: 187, dtype: int64

In [9]:
# Run apriori over the per-sale baskets and collect every record it yields into a list.
# NOTE(review): 0.00534759 is approximately 1/187 (187 distinct SKU categories were
# counted above) - confirm that this is the intended support threshold.
results = [
    record
    for record in apriori(transactions_list, min_support=0.00534759)
]

In [10]:
def convert_apriori_results_to_pandas_df(results):
    """Flatten apriori result records into one DataFrame row per association rule.

    Parameters
    ----------
    results : iterable
        Records that expose ``support`` and ``ordered_statistics``. Every
        ordered statistic must expose ``items_base``, ``items_add``,
        ``confidence`` and ``lift``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Antecedent``, ``Consequent``, ``Support``, ``Confidence``
        and ``Lift`` (in that order). Item sets are rendered as
        comma-separated strings with the items in sorted order. When
        ``results`` holds no rules the frame is empty but still carries
        these five columns.
    """
    columns = ['Antecedent', 'Consequent', 'Support', 'Confidence', 'Lift']

    def _label(items):
        # Item collections may be sets, whose iteration order is not
        # reproducible between runs; sorting gives stable labels.
        return ', '.join(sorted(map(str, items)))

    rules_list = [
        {
            'Antecedent': _label(ordered_stat.items_base),
            'Consequent': _label(ordered_stat.items_add),
            # Support belongs to the whole itemset, so it is shared by every
            # rule derived from the same record.
            'Support': result.support,
            'Confidence': ordered_stat.confidence,
            'Lift': ordered_stat.lift,
        }
        for result in results
        for ordered_stat in result.ordered_statistics
    ]

    # Passing `columns` fixes both the column order and the schema of an empty
    # result; selecting columns from a column-less frame would raise KeyError.
    return pd.DataFrame(rules_list, columns=columns)



In [11]:
# Tabulate the apriori records as one rule per row
results_df = convert_apriori_results_to_pandas_df(results=results)

In [12]:
# Show the rules ordered from highest to lowest lift
results_df.sort_values('Lift', ascending=False)

Unnamed: 0,Antecedent,Consequent,Support,Confidence,Lift
305,"LPF, N8U",OXH,0.006973,0.413761,9.919540
304,OXH,"LPF, N8U",0.006973,0.167161,9.919540
298,"IEV, N8U",OXH,0.006076,0.396569,9.507370
297,OXH,"IEV, N8U",0.006076,0.145663,9.507370
137,9ZX,FU5,0.007220,0.380293,9.100304
...,...,...,...,...,...
104,0H2,N8U,0.009230,0.143544,0.937093
233,N8U,U5F,0.007838,0.051171,0.917867
234,U5F,N8U,0.007838,0.140599,0.917867
231,R6E,N8U,0.008874,0.124620,0.813552


In [13]:
# Build a combined "<SKU_Category>_<value>" label for every row.
# NOTE(review): the column is named SKU_Category_Quantity but the value appended
# here is Sales_Amount, not Quantity - confirm which one is intended.
data['SKU_Category_Quantity'] = data['SKU_Category'] + '_' + data['Sales_Amount'].astype(float).astype(str)

In [14]:
# Order rows by date, then by customer, so later per-customer lists follow date order
data = data.sort_values(['Date', 'Customer_ID'])

In [15]:
# Preview the first five rows after sorting
data.head(n=5)

Unnamed: 0,Date,Customer_ID,Sales_ID,SKU_Category,SKU,Quantity,Sales_Amount,SKU_Category_Quantity
73,2016-01-02,1098,47,FEW,JJ9FT,1.0,5.79,FEW_5.79
74,2016-01-02,1098,47,EMC,23Y9E,3.0,15.18,EMC_15.18
75,2016-01-02,1098,47,FEW,25CJ5,1.0,10.3,FEW_10.3
8,2016-01-02,1253,8,0H2,9STQJ,1.0,8.25,0H2_8.25
33,2016-01-02,1253,23,R6E,YIM6C,1.0,8.65,R6E_8.65


In [16]:
# Group the rows by customer.
# NOTE: this rebinds `transactions`, which earlier held the per-sale category lists.
transactions = data.groupby(by='Customer_ID')

In [17]:
# Collect each customer's SKU categories into a single list
transactions2 = transactions.SKU_Category.apply(list)

In [18]:
# Drop the Customer_ID index and keep only the per-customer lists
sequences = list(transactions2)

In [19]:
# Peek at the first five customer sequences
sequences[:5]

[['0H2', 'N8U'], ['SJS'], ['FEW', 'MU3'], ['LSD'], ['1VL', 'J4R', 'P42']]

In [20]:
from collections import defaultdict
import subprocess
import re
    

In [21]:

def get_association_rules(sequences, min_sup, min_conf, spmf_jar='spmf.jar'):
    """Use SPMF (RuleGrowth) to find sequential rules in the supplied sequences.

    Every distinct item is assigned an integer ID, the sequences are written
    to ``seq_rule_input.txt`` (each itemset terminated by ``-1``, each
    sequence by ``-2``), the SPMF jar is run through ``java``, and the rules
    in ``seq_rule_output.txt`` are translated back to the original items.

    Parameters
    ----------
    sequences : iterable
        One entry per sequence. Each element of a sequence is either a single
        item or a ``list`` of items that together form one itemset.
    min_sup, min_conf : float
        Minimum support and confidence as fractions (``0.01`` means 1%). They
        are handed to SPMF as percentage strings.
    spmf_jar : str, optional
        Path of the SPMF jar file. Defaults to ``'spmf.jar'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Left_rule`` and ``Right_rule`` (lists of items),
        ``Support`` (the rule's ``#SUP`` count divided by the number of
        sequences) and ``Confidence``. The frame is empty when the output
        file contains no rule lines.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``java`` process exits with a non-zero status.
    FileNotFoundError
        If ``java`` cannot be started or no output file was produced.
    """
    # step 1: create required input for SPMF
    item_ids = {}    # item -> integer ID written to the input file
    id_labels = {}   # str(ID) -> item, used to decode the rules SPMF returns

    def _encode(item):
        # Register unseen items in BOTH mappings so that items appearing only
        # inside multi-item itemsets can still be decoded afterwards.
        if item not in item_ids:
            new_id = len(item_ids) + 1
            item_ids[item] = new_id
            id_labels[str(new_id)] = item
        return str(item_ids[item])

    n_sequences = 0
    with open('seq_rule_input.txt', 'w') as f:
        for sequence in sequences:
            n_sequences += 1
            tokens = []
            for itemset in sequence:
                members = itemset if isinstance(itemset, list) else [itemset]
                tokens.extend(_encode(item) for item in members)
                tokens.append('-1')  # end of itemset
            tokens.append('-2')      # end of sequence
            f.write(' '.join(tokens) + '\n')

    # step 2: run SPMF with the supplied parameters
    # '{:g}' avoids the truncation of int(): 0.29 * 100 is 28.999999999999996,
    # which int() turns into 28, and any threshold below 1% became 0%.
    # NOTE(review): fractional percentages such as '0.5%' rely on SPMF parsing
    # decimals before the '%' sign - confirm against the installed SPMF version.
    supp_param = '{:g}%'.format(min_sup * 100)
    conf_param = '{:g}%'.format(min_conf * 100)
    # An argument list must not be combined with shell=True: on POSIX only the
    # first element ('java') would be executed and the rest silently dropped.
    # check=True stops a failed run from being followed by a read of stale output.
    subprocess.run(
        ['java', '-jar', spmf_jar, 'run', 'RuleGrowth',
         'seq_rule_input.txt', 'seq_rule_output.txt',
         supp_param, conf_param],
        check=True,
    )

    # step 3: read back the output rules
    rule_pattern = re.compile(
        r'([0-9\,]+) ==> ([0-9\,]+) #SUP: ([0-9]+) #CONF: ([0-9\.]+)')
    output_rules = []
    with open('seq_rule_output.txt', 'r') as f:
        for line in f:
            match = rule_pattern.search(line)
            if match is None:
                # Blank or non-rule line (e.g. an empty result file).
                continue
            left, right, sup, conf = match.groups()
            output_rules.append([
                [id_labels[x] for x in left.split(',')],
                [id_labels[x] for x in right.split(',')],
                int(sup) / n_sequences,
                float(conf),
            ])

    # return pandas DataFrame
    return pd.DataFrame(output_rules, columns=['Left_rule', 'Right_rule', 'Support', 'Confidence'])

In [22]:
# Mine sequential rules with at least 1% support and 50% confidence
get_association_rules(sequences, min_sup=0.01, min_conf=0.5)

Unnamed: 0,Left_rule,Right_rule,Support,Confidence
0,"[N8U, OXH]",[LPF],0.02179,0.542354
1,"[N8U, OXH, IEV]",[LPF],0.014453,0.560892
2,"[N8U, OXH, FU5]",[LPF],0.010166,0.580808
3,"[N8U, IEV, FU5]",[LPF],0.010564,0.537079
4,"[N8U, FU5]",[LPF],0.017282,0.506477
5,"[N8U, 01F]",[IEV],0.011536,0.509766
6,[OXH],[LPF],0.039425,0.522248
7,"[OXH, U5F]",[LPF],0.011624,0.513672
8,"[OXH, 01F]",[LPF],0.010696,0.564103
9,"[OXH, IEV]",[LPF],0.022055,0.519251
