In [1]:
import pandas as pd
import numpy as np
from itertools import permutations, combinations, chain
from functools import reduce

In [2]:
# The file has no header row, so pandas assigns positional column labels (0, 1, 2, ...).
# Per the preview below, each row lists the items of one transaction, padded with NaN.
dataset = pd.read_csv(filepath_or_buffer='Market_Basket_Optimisation.csv', header=None)
dataset.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [3]:
# Tunable parameters for the rule mining in the cells below.
dataset_length = dataset.shape[0]  # number of rows; turns min_support into an absolute count
# min_support is a fraction of dataset_length; min_lift == 0 disables the lift filter.
min_support, min_confidence, min_lift = 0.003, 0.2, 3
# Smallest and largest itemset sizes for which support is computed.
min_length, max_length = 2, 4
top = 20  # NOTE(review): not referenced by any visible cell

In [4]:
# Turn the frame into a 2-D numpy array of strings (missing cells become the
# literal 'nan'), keep the first `dataset_length` rows, and trim surrounding
# whitespace from every cell.
dataset_values = np.char.strip(dataset.to_numpy().astype(str)[:dataset_length])
dataset_values.shape

(7501, 20)

In [5]:
# Frequency table of single items.
# Support is the number of *transactions* containing an item, so an item that
# is repeated inside one basket must be counted once for that basket. Sorting
# each row puts repeats next to each other; only the first cell of every run
# is kept before counting.
sorted_rows = np.sort(dataset_values, axis=1)
first_in_row = np.ones(sorted_rows.shape, dtype=bool)
first_in_row[:, 1:] = sorted_rows[:, 1:] != sorted_rows[:, :-1]
items, count = np.unique(sorted_rows[first_in_row], return_counts=True)

df_support = pd.DataFrame({'Items': items, 'Count': count})
# 'nan' is the string form of the padding cells, not a real item; the second
# condition keeps only items that reach the minimum support. Index labels are
# deliberately not reset.
df_support = df_support[
    (df_support.Items != 'nan') & (df_support.Count >= min_support * dataset_length)
]
df_support

Unnamed: 0,Items,Count
0,almonds,153
1,antioxydant juice,67
2,asparagus,36
3,avocado,250
4,babies food,34
...,...,...
115,whole wheat pasta,221
116,whole wheat rice,439
117,yams,86
118,yogurt cake,205


In [7]:
# Basket/item indicator table: one row per transaction, one column per
# frequent item, 1 when the basket contains the item and 0 otherwise.
# The cells must be strict 0/1 flags: the itemset step combines columns with a
# bitwise AND, and an occurrence count such as 2 would give 2 & 1 == 0 and
# silently drop that basket.
df_binary = pd.DataFrame(
    {
        item: (dataset_values == item).any(axis=1).astype(np.int64)
        for item in df_support.Items
    },
    # Explicit index keeps one row per basket even if no item is frequent.
    index=pd.RangeIndex(len(dataset_values)),
)
df_binary  # used to count how often several items occur together in a basket

Unnamed: 0,almonds,antioxydant juice,asparagus,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,body spray,...,toothpaste,turkey,vegetables mix,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,1,1,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Clamp the requested itemset sizes: sizes below 2 are handled by the
# single-item table, and no itemset can be larger than the number of frequent
# items. len(df_support) is used rather than the global `items`, which a later
# cell rebinds, so re-running this cell gives the same result.
min_length = max(min_length, 2)
max_length = min(max(max_length, 2), len(df_support))
# NOTE(review): with min_length > 2 the tables for the skipped sizes are never
# built, and the rule cell looks up every subset's count - confirm intent.

# Absolute number of baskets an itemset needs to be kept.
min_count = min_support * dataset_length

# One support table per itemset size, starting with the single-item table.
dictionary_list = [df_support]
# Itemsets that were evaluated but fell below the support threshold.
nonchosen = []

# Items still eligible for the next round; starts with every frequent item and
# is narrowed to the members of the itemsets that survived the previous size.
new_selection_list = df_support['Items'].values

# Boolean membership column per item. Comparing with > 0 makes the AND below
# correct even if df_binary holds occurrence counts instead of 0/1 flags.
has_item = {col: df_binary[col].to_numpy() > 0 for col in df_binary.columns}

for i in range(min_length, max_length + 1):
    # Every size-i combination of the surviving items, in sorted item order.
    my_combination = np.array(list(combinations(np.unique(np.array(new_selection_list)), i)))
    print(my_combination.shape)
    list_of_new_frames = []
    new_selection_list = []
    for pairs in my_combination:
        # Number of baskets that contain all items of the candidate.
        sumval = np.logical_and.reduce([has_item[col] for col in pairs]).sum()
        if sumval >= min_count:
            list_of_new_frames.append({'Items': ', '.join(pairs), 'Count': sumval})
            new_selection_list.extend(pairs)
        else:
            # Same ', ' separator as the accepted itemsets so entries stay readable.
            nonchosen.append(', '.join(pairs))
    dictionary_list.append(pd.DataFrame.from_dict(list_of_new_frames))
dictionary_list

(6555, 2)
(88560, 3)
(101270, 4)


[                 Items  Count
 0              almonds    153
 1    antioxydant juice     67
 2            asparagus     36
 3              avocado    250
 4          babies food     34
 ..                 ...    ...
 115  whole wheat pasta    221
 116   whole wheat rice    439
 117               yams     86
 118        yogurt cake    205
 119           zucchini     71
 
 [115 rows x 2 columns],                           Items  Count
 0              almonds, burgers     39
 1                 almonds, cake     23
 2            almonds, chocolate     45
 3                 almonds, eggs     49
 4         almonds, french fries     33
 ..                          ...    ...
 781        tomato juice, turkey     24
 782            tomatoes, turkey     49
 783    tomatoes, vegetables mix     24
 784  tomatoes, whole wheat rice     45
 785    turkey, whole wheat rice     53
 
 [786 rows x 2 columns],                                      Items  Count
 0        avocado, chocolate, mineral water  

In [9]:
# Stack the per-size support tables into one frame, highest count first.
# The per-table index labels are kept as they are, so labels may repeat.
df_support_final = pd.concat(dictionary_list).sort_values(
    by='Count', ascending=False, kind='quicksort'
)
df_support_final

Unnamed: 0,Items,Count
71,mineral water,1788
36,eggs,1348
100,spaghetti,1306
42,french fries,1282
24,chocolate,1230
...,...,...
91,"cereals, ground beef, spaghetti",23
94,"chicken, chocolate, eggs",23
108,"chicken, frozen vegetables, spaghetti",23
116,"chicken, milk, pancakes",23


In [10]:
# Parallel arrays: the itemset label (items joined by ', ') and its basket count.
items = df_support_final['Items'].to_numpy()
counts = df_support_final['Count'].to_numpy()

# Placeholders overwritten inside the rule loop.
confidence = lift = 0
# Collects one record per accepted rule; becomes the rules dataframe.
confidence_list = []


def powerset(iterable):
    """Iterate over the non-empty proper subsets of ``iterable``.

    Despite the name, neither the empty subset nor the full set is produced:
    subset sizes run from 1 to ``len - 1``. Subsets come out as tuples,
    grouped by increasing size, in the order ``itertools.combinations``
    yields them. Inputs with fewer than two elements produce nothing.
    """
    pool = tuple(iterable)
    sizes = range(1, len(pool))
    return chain.from_iterable(map(lambda size: combinations(pool, size), sizes))

# Derive association rules "antecedent => consequent" from every itemset.
# Itemset labels are item names joined by ', ', so an item name that itself
# contains ', ' would be split incorrectly.

# Label -> basket count. Replaces a full dataframe scan (and an int(Series)
# conversion) for every single lookup.
support_count = dict(zip(items, counts))

for item, count in zip(items, counts):
    members = item.split(', ')
    # Every non-empty proper subset of the itemset is tried as the antecedent;
    # single-item itemsets have none and therefore yield no rules.
    for antecedent in powerset(members):
        subset = ', '.join(antecedent)
        # confidence(A => B) = count(A and B) / count(A)
        confidence = count / support_count[subset]
        if confidence < min_confidence:
            continue

        # The consequent is whatever is left of the itemset, in original order.
        secondset = [member for member in members if member not in antecedent]
        consequent = ', '.join(secondset)
        rule = {"Rules": subset + ' => ' + consequent,
                'Support': count / dataset_length,
                'Confidence': confidence}

        # min_lift == 0 switches the lift filter (and the Lift column) off.
        if min_lift != 0:
            # lift(A => B) = confidence(A => B) / support(B)
            lift = confidence * dataset_length / support_count[consequent]
            if lift < min_lift:
                continue
            rule['Lift'] = lift

        confidence_list.append(rule)

df_rules = pd.DataFrame.from_dict(confidence_list)
if len(confidence_list) > 1:
    # One multi-key sort: Lift first when it is computed, Confidence as the
    # tie-breaker. Two successive non-stable sorts would lose the first order.
    sort_keys = ['Lift', 'Confidence'] if min_lift != 0 else ['Confidence']
    df_rules = df_rules.sort_values(by=sort_keys, ascending=False).reset_index(drop=True)
df_rules

Unnamed: 0,Rules,Support,Confidence,Lift
0,"frozen vegetables, soup => milk, mineral water",0.003066,0.383333,7.987176
1,"frozen vegetables, olive oil => milk, mineral ...",0.003333,0.294118,6.128268
2,"mineral water, whole wheat pasta => olive oil",0.003866,0.402778,6.115863
3,"milk, soup => frozen vegetables, mineral water",0.003066,0.201754,5.646864
4,"tomato sauce => ground beef, spaghetti",0.003066,0.216981,5.535971
...,...,...,...,...
125,"chocolate, eggs, mineral water => ground beef",0.003999,0.297030,3.023093
126,"milk, mineral water, spaghetti => frozen veget...",0.004533,0.288136,3.022804
127,"frozen vegetables, spaghetti => shrimp",0.005999,0.215311,3.013149
128,"ground beef, shrimp => spaghetti",0.005999,0.523256,3.005315
