# 

## Task 1 - association mining 

In [1]:
import pandas as pd
from apyori import apriori

In [2]:
data = pd.read_csv('data/D1.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131706 entries, 0 to 131705
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Date          131706 non-null  object 
 1   Customer_ID   131706 non-null  int64  
 2   Sales_ID      131706 non-null  int64  
 3   SKU_Category  131706 non-null  object 
 4   SKU           131706 non-null  object 
 5   Quantity      131706 non-null  float64
 6   Sales_Amount  131706 non-null  float64
dtypes: float64(2), int64(2), object(3)
memory usage: 7.0+ MB


In [3]:
# convert all columns to lower case
data.columns = data.columns.str.lower()

In [4]:
data

Unnamed: 0,date,customer_id,sales_id,sku_category,sku,quantity,sales_amount
0,2/01/2016,2547,1,X52,0EM7L,1.0,3.13
1,2/01/2016,822,2,2ML,68BRQ,1.0,5.46
2,2/01/2016,3686,3,0H2,CZUZX,1.0,6.35
3,2/01/2016,3719,4,0H2,549KK,1.0,5.59
4,2/01/2016,9200,5,0H2,K8EHH,1.0,6.88
...,...,...,...,...,...,...,...
131701,4/07/2016,20203,32900,IEV,FO112,3.0,6.46
131702,4/07/2016,20203,32900,N8U,I36F2,1.0,4.50
131703,4/07/2016,20203,32900,U5F,4X8P4,1.0,5.19
131704,4/07/2016,20203,32900,0H2,ZVTO4,1.0,4.57


In [5]:
# convert Date to a date object
data['date'] = pd.to_datetime(data['date'], format='%d/%m/%Y')

In [6]:
# Convert Sales_ID, and Customer_ID to a string type
data['sales_id'] = data['sales_id'].astype(str)
data['customer_id'] = data['customer_id'].astype(str)

In [7]:
data.head()

Unnamed: 0,date,customer_id,sales_id,sku_category,sku,quantity,sales_amount
0,2016-01-02,2547,1,X52,0EM7L,1.0,3.13
1,2016-01-02,822,2,2ML,68BRQ,1.0,5.46
2,2016-01-02,3686,3,0H2,CZUZX,1.0,6.35
3,2016-01-02,3719,4,0H2,549KK,1.0,5.59
4,2016-01-02,9200,5,0H2,K8EHH,1.0,6.88


In [8]:
# Define the Transactional Data
transactions = data.groupby('sales_id')['sku_category'].apply(list)
transactions_list = list(transactions)

In [9]:
data.groupby(['customer_id','date'])['sales_id'].count()

customer_id  date      
1            2016-01-22    2
10           2016-02-12    1
100          2016-01-03    2
1000         2016-02-15    1
10000        2016-05-11    2
                          ..
9996         2016-09-18    2
             2016-10-16    3
9997         2016-09-02    1
9998         2016-07-13    2
9999         2016-05-06    3
Name: sales_id, Length: 62727, dtype: int64

In [10]:
1 / len(data['sku_category'].value_counts())

0.0053475935828877

In [11]:
results = list(apriori(transactions_list, min_support=0.005, min_confidence=0.4))

In [12]:
def convert_apriori_results_to_pandas_df(results):
    rules = []
    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            # items_base = Left side of rules, items_add = Right side
            # support, confidence and lift for respoective rules
            rules.append([','.join(rule.items_base), ','.join(rule.items_add),
                          rule_set.support, rule.confidence, rule.lift])
            
            # Typecast it to pandas df
    return pd.DataFrame(rules, columns=['Left_rule', 'Right_rule', 'Support', 'Confidence', 'Lift'])


In [13]:
results_df = convert_apriori_results_to_pandas_df(results)

In [14]:
results_df.sort_values(by='Lift', ascending=False)

Unnamed: 0,Left_rule,Right_rule,Support,Confidence,Lift
28,"N8U,LPF",OXH,0.006973,0.413761,9.91954
29,"OXH,N8U",LPF,0.006973,0.626389,8.715011
22,"N8U,LPF",IEV,0.009338,0.554128,8.697436
23,"OXH,IEV",LPF,0.01093,0.619089,8.613451
27,"OXH,N8U",IEV,0.006076,0.545833,8.567239
24,"OXH,LPF",IEV,0.01093,0.544684,8.549201
20,"FU5,OXH",LPF,0.007313,0.610323,8.491479
21,"N8U,IEV",LPF,0.009338,0.609485,8.479831
15,"01F,LPF",IEV,0.006447,0.528517,8.295449
26,"U5F,LPF",IEV,0.005581,0.517934,8.129339


### Part 3 - top 5 for 01F specifc categories

In [15]:
results_01F = list(apriori(transactions_list, min_support=0.005, min_confidence=0.25))

In [16]:
results_df_01F = convert_apriori_results_to_pandas_df(results_01F)

In [17]:
# filter results_df for rules with 01F in the left_rule or the right rule
filtered_results = results_df_01F[results_df_01F['Left_rule'].str.contains('01F') | results_df_01F['Right_rule'].str.contains('01F')]

In [18]:
filtered_results.sort_values(by='Support', ascending=False)

Unnamed: 0,Left_rule,Right_rule,Support,Confidence,Lift
1,01F,IEV,0.012909,0.481268,7.553841
2,01F,LPF,0.012198,0.454755,6.327052
4,01F,OXH,0.008163,0.304323,7.295851
0,01F,FU5,0.007498,0.279539,6.689284
3,01F,N8U,0.007421,0.276657,1.806089
36,"01F,IEV",LPF,0.006447,0.499401,6.948219
37,"01F,LPF",IEV,0.006447,0.528517,8.295449


#### Part 4 - Association Rules using SPMF

In [30]:
data = data.sort_values(by=['customer_id', 'date'], ascending=True)

In [21]:
transactions= data.groupby('customer_id')['sku_category'].apply(list)

In [29]:
transactions.head()

customer_id
1                                           [0H2, N8U]
10                                               [SJS]
100                                         [FEW, MU3]
1000                                             [LSD]
10000                                  [1VL, J4R, P42]
                             ...                      
9995          [Q4N, N5F, JPI, 29A, Q4N, P42, 1TS, JKC]
9996     [F9B, H8O, I4Y, N8U, NVL, LGI, XG4, A0G, P42]
9997                                             [FZT]
9998                                        [N8U, 29A]
9999                                   [A38, U5F, H8O]
Name: sku_category, Length: 22625, dtype: object

In [22]:
sequences = transactions.values.tolist()

In [23]:
sequences[0:5]

[['0H2', 'N8U'], ['SJS'], ['FEW', 'MU3'], ['LSD'], ['1VL', 'J4R', 'P42']]

In [24]:
from collections import defaultdict
import subprocess
import re
    

In [25]:

''' Uses SPMF to find association rules in supplied transactions '''
def get_association_rules(sequences, min_sup, min_conf):
    # step 1: create required input for SPMF
    
    # prepare a dict to uniquely assign each item in the transactions to an int ID
    item_dict = defaultdict(int)
    output_dict = defaultdict(str)
    item_id = 1
    
    # write your sequences in SPMF format
    with open('seq_rule_input.txt', 'w+') as f:
        for sequence in sequences:
            z = []
            for itemset in sequence:
                # if there are multiple items in one itemset
                if isinstance(itemset, list):
                    for item in itemset:
                        if item not in item_dict:
                            item_dict[item] = item_id
                            item_id += 1

                        z.append(item_dict[item])
                else:
                    if itemset not in item_dict:
                        item_dict[itemset] = item_id
                        output_dict[str(item_id)] = itemset
                        item_id += 1
                    z.append(item_dict[itemset])
                    
                # end of itemset
                z.append(-1)
            
            # end of a sequence
            z.append(-2)
            f.write(' '.join([str(x) for x in z]))
            f.write('\n')
    
    # run SPMF with supplied parameters
    supp_param = '{}%'.format(int(min_sup * 100))
    conf_param = '{}%'.format(int(min_conf * 100))
    subprocess.call(['java', '-jar', 'spmf.jar', 'run', 'RuleGrowth', 
                     'seq_rule_input.txt', 'seq_rule_output.txt', 
                     supp_param, conf_param], shell=True)
    
    # read back the output rules
    outputs = open('seq_rule_output.txt', 'r').read().strip().split('\n')
    output_rules = []
    for rule in outputs:
        left, right, sup, conf = re.search(pattern=r'([0-9\,]+) ==> ([0-9\,]+) #SUP: ([0-9]+) #CONF: ([0-9\.]+)', string=rule).groups()
        sup = int(sup) / len(sequences)
        conf = float(conf)
        output_rules.append([[output_dict[x] for x in left.split(',')], [output_dict[x] for x in right.split(',')], sup, conf])
    
    # return pandas DataFrame
    return pd.DataFrame(output_rules, columns = ['Left_rule', 'Right_rule', 'Support', 'Confidence'])

In [26]:
seq_association_rules = get_association_rules(sequences, 0.01, 0.5)

In [27]:
seq_association_rules.sort_values(by='Confidence', ascending=False)

Unnamed: 0,Left_rule,Right_rule,Support,Confidence
18,[NTA],[IEV],0.014674,0.622889
16,[NTA],[LPF],0.01432,0.60788
10,"[OXH, IEV, FU5]",[LPF],0.01021,0.592308
12,"[OXH, FU5]",[LPF],0.01684,0.585253
2,"[N8U, OXH, FU5]",[LPF],0.010166,0.580808
11,"[OXH, 6BZ]",[LPF],0.011315,0.579186
8,"[OXH, 01F]",[LPF],0.010696,0.564103
1,"[N8U, OXH, IEV]",[LPF],0.014453,0.560892
0,"[N8U, OXH]",[LPF],0.02179,0.542354
3,"[N8U, IEV, FU5]",[LPF],0.010564,0.537079
