In [1]:
import pandas as pd

# Load the raw point-of-sale transaction export into a DataFrame
df = pd.read_csv(filepath_or_buffer='datasets/POS_TRANSACTIONS.csv')

In [2]:
#df.info()
#df.head(10)

In [3]:
# One row per (Transaction_Id, Product_Name) pair; every other column is summed
df2 = df.groupby(by=['Transaction_Id', 'Product_Name'], as_index=False).sum()
df2.head()

Unnamed: 0,Transaction_Id,Product_Name,Location,Quantity
0,12359,Yoghurt,2,1
1,12362,Jam,18,2
2,12365,Shampoo,5,1
3,12371,Bread,2,1
4,12380,Egg,6,1


In [4]:
# Drop the Location column; the following cells only read Transaction_Id and
# Product_Name. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: running it a second time no
# longer raises KeyError because the column is already gone.
df2 = df2.drop(columns=['Location'], errors='ignore')

In [5]:
# Gather the product names of each transaction into a single Python list
transactions = (
    df2.groupby(by=['Transaction_Id'])['Product_Name']
       .apply(list)
)

In [6]:
# Series.get_values() was deprecated in pandas 0.25 and removed in 1.0;
# to_numpy() is the documented replacement and yields the same object array
# of per-transaction lists.
transactions.to_numpy()

array([list(['Yoghurt']), list(['Jam']), list(['Shampoo']), ...,
       list(['Yoghurt']), list(['Egg', 'Shampoo', 'Yoghurt']),
       list(['Cereal'])], dtype=object)

In [7]:
from apyori import apriori

# Turn the pandas object into a regular list of baskets, then mine it with apriori
transaction_list = transactions.tolist()
results = [*apriori(transaction_list, min_support=0.05)]

print(results[:5])

[RelationRecord(items=frozenset({'Bread'}), support=0.054645, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Bread'}), confidence=0.054645, lift=1.0)]), RelationRecord(items=frozenset({'Butter'}), support=0.06735, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Butter'}), confidence=0.06735, lift=1.0)]), RelationRecord(items=frozenset({'Cereal'}), support=0.05848, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Cereal'}), confidence=0.05848, lift=1.0)]), RelationRecord(items=frozenset({'Conditioner'}), support=0.08996, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Conditioner'}), confidence=0.08996, lift=1.0)]), RelationRecord(items=frozenset({'Cordial'}), support=0.134925, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Cordial'}), confidence=0.134925, lift=1.0)])]


In [8]:
def convert_apriori_results_to_pandas_df(results):
    """Flatten apriori relation records into a DataFrame of rules.

    Parameters
    ----------
    results : iterable
        Records exposing ``support`` and ``ordered_statistics``; each ordered
        statistic must expose ``items_base``, ``items_add``, ``confidence``
        and ``lift``.

    Returns
    -------
    pandas.DataFrame
        One row per ordered statistic with the columns ``Left_side``,
        ``Right_side``, ``Support``, ``Confidence`` and ``Lift``. An empty
        ``results`` yields an empty frame that still has these columns.
    """
    columns = ['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift']

    rules = [
        [
            # items_base / items_add are frozensets, whose iteration order is
            # arbitrary and may differ between interpreter runs; sorting makes
            # the joined labels reproducible.
            ','.join(sorted(rule.items_base)),   # left side of the rule
            ','.join(sorted(rule.items_add)),    # right side of the rule
            rule_set.support,
            rule.confidence,
            rule.lift,
        ]
        for rule_set in results
        for rule in rule_set.ordered_statistics
    ]

    return pd.DataFrame(rules, columns=columns)

# Tabulate the mined rules and show the first 20 as plain text
result_df = convert_apriori_results_to_pandas_df(results)

print(result_df.head(n=20))

   Left_side     Right_side   Support  Confidence  Lift
0                     Bread  0.054645    0.054645   1.0
1                    Butter  0.067350    0.067350   1.0
2                    Cereal  0.058480    0.058480   1.0
3               Conditioner  0.089960    0.089960   1.0
4                   Cordial  0.134925    0.134925   1.0
5                       Egg  0.146885    0.146885   1.0
6             Peanut butter  0.143575    0.143575   1.0
7                   Shampoo  0.160425    0.160425   1.0
8                     Sugar  0.050990    0.050990   1.0
9                       Tea  0.241305    0.241305   1.0
10                  Yoghurt  0.171005    0.171005   1.0


In [9]:
# Order the rules from highest to lowest lift
result_df = result_df.sort_values('Lift', ascending=False)
result_df.head(n=1000)

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
0,,Bread,0.054645,0.054645,1.0
1,,Butter,0.06735,0.06735,1.0
2,,Cereal,0.05848,0.05848,1.0
3,,Conditioner,0.08996,0.08996,1.0
4,,Cordial,0.134925,0.134925,1.0
5,,Egg,0.146885,0.146885,1.0
6,,Peanut butter,0.143575,0.143575,1.0
7,,Shampoo,0.160425,0.160425,1.0
8,,Sugar,0.05099,0.05099,1.0
9,,Tea,0.241305,0.241305,1.0


In [12]:
# keep only the transactions that contain Bread, use parallel processing for speedup
from multiprocessing import Pool
import multiprocessing as mp
# Plain assignment: bread_transactions is a second name for the same object
# that transactions refers to, not a copy of it.
bread_transactions = transactions
def process(series):
    """Return the entries of ``series`` whose value contains ``'Bread'``.

    Parameters
    ----------
    series : pandas.Series
        Each value must support the ``in`` operator (e.g. a list of names).

    Returns
    -------
    pandas.Series
        The matching entries, with their original index labels and order.
        The input series is left unmodified.
    """
    # A single boolean mask replaces the previous row-by-row in-place drop,
    # which mutated the series while iterating over it and re-built it once
    # per removed row.
    keep = pd.Series(
        ['Bread' in value for value in series],
        index=series.index,
        dtype=bool,
    )
    return series[keep]

def multi(series):
    """Apply ``process`` to strided chunks of ``series`` in a worker pool.

    Parameters
    ----------
    series : pandas.Series
        Data to split into chunks and hand to ``process``.

    Returns
    -------
    pandas.Series
        The per-chunk results concatenated chunk after chunk, so entries are
        not necessarily in their original order.
    """
    # cpu_count() - 1 is 0 on a single-core machine, and Pool refuses to
    # start with fewer than one worker.
    num_cores = max(1, mp.cpu_count() - 1)
    chunks = [series[i::num_cores] for i in range(num_cores)]

    # The context manager shuts the worker processes down once map() has
    # returned; previously the pool was never closed.
    with Pool(processes=num_cores) as pool:
        result = pool.map(process, chunks)

    return pd.concat(result)

# Rebinds `transactions` to whatever multi() returns, so any cell executed
# after this one no longer sees the previous value under that name.
transactions = multi(bread_transactions)

In [None]:
# Convert the current transactions into a regular list and run apriori on it
transaction_list = transactions.tolist()
results = [*apriori(transaction_list, min_support=0.05)]

print(results[:5])



In [None]:
def convert_apriori_results_to_pandas_df(results):
    """Flatten apriori relation records into a DataFrame of rules.

    NOTE: an earlier cell defines a function with this same name; running
    this cell rebinds the name to the definition below.

    Parameters
    ----------
    results : iterable
        Records exposing ``support`` and ``ordered_statistics``; each ordered
        statistic must expose ``items_base``, ``items_add``, ``confidence``
        and ``lift``.

    Returns
    -------
    pandas.DataFrame
        One row per ordered statistic with the columns ``Left_side``,
        ``Right_side``, ``Support``, ``Confidence`` and ``Lift``.
    """
    rules = []

    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            # items_base = left side of the rule, items_add = right side.
            # Both are sets without a defined iteration order, so sort them
            # to make the joined labels reproducible from run to run.
            left_side = ','.join(sorted(rule.items_base))
            right_side = ','.join(sorted(rule.items_add))
            rules.append([left_side, right_side,
                          rule_set.support, rule.confidence, rule.lift])

    return pd.DataFrame(rules, columns=['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift'])

# Tabulate the rules mined from the current results and print the first 20
result_df = convert_apriori_results_to_pandas_df(results)

print(result_df.head(n=20))

In [None]:
# Rank the rules by lift, strongest first
result_df = result_df.sort_values('Lift', ascending=False)
result_df.head(n=1000)