In [1]:
import pandas as pd
import ast
from apyori import apriori

### Load data to extract carts

In [2]:
file_path = '../data/tesco/tesco_carts.csv'


def parse_cart_column(cart):
    """Turn the textual list stored in the ``cart`` column into a list of ints.

    Parameters
    ----------
    cart : str
        Python-literal representation of a list of product ids,
        e.g. ``"[58175124, 50502269]"``.

    Returns
    -------
    list of int
        The parsed product ids. An empty list is returned when the value
        cannot be parsed as a literal or is not an iterable of int-like items.
    """
    try:
        return [int(item) for item in ast.literal_eval(cart)]
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers literals that parse but are not iterable (e.g. "5")
        # or that contain items int() rejects (e.g. None).
        return []


try:
    df = pd.read_csv(
        file_path,
        sep='\t',
        quotechar='"',
    )
except pd.errors.ParserError as e:
    print(f"ParserError: {e}")
    # Re-raise: if the file cannot be parsed `df` is never created, and every
    # later cell would otherwise fail with a misleading NameError.
    raise

# Replace the raw string column with real Python lists of product ids.
df['cart'] = df['cart'].apply(parse_cart_column)


In [3]:
# Preview the first rows of the carts frame after parsing the cart column.
df.head()

Unnamed: 0,cart_id,cart
0,0,[73314923]
1,1,"[58175124, 50502269, 70943424, 57346955, 56036..."
2,2,"[50962501, 50503297, 50507984, 50169663, 51965..."
3,3,"[72680530, 62284801, 58098273]"
4,4,"[56373768, 67336474, 52844621, 54921285]"


In [4]:
# Count missing values in the parsed cart column.
# NOTE(review): parse_cart_column returns [] for values it cannot parse, so
# unparseable carts show up as empty lists and are not counted here.
df['cart'].isnull().sum()

0

In [5]:
# Load the inventory table; it is used further down to look up product
# descriptions by product_id.
inventory_path = '../data/tesco/tesco_inventory_clean.csv'
df_inventory = pd.read_csv(inventory_path)
df_inventory.head()

Unnamed: 0,product_id,category,description,ingredients,energy,fat,saturates,salt,sugars,protein,carbohydrate,fibre,avg_price
0,68238698,fruit_veg,Tesco Baby Corn 190G (M),no_ingredients,28.0,0.4,0.1,0.3,1.9,2.5,2.7,2.0,1.645
1,53426251,fruit_veg,Tesco Cranberries 100G,Pineapple Juice from Concentrate Cranberries S...,336.0,1.6,0.2,0.1,65.0,0.3,77.4,5.5,1.495
2,59445495,fruit_veg,Tesco Crispy Slices 350G,Potato (78%) Batter Rapeseed Oil Batter contai...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5
3,74533881,sweets,Kelloggs Pop Tarts Frosted S'mores 416G,"Enriched Flour (Wheat Flour, Niacin, Reduced I...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
4,54198489,grains,Osem Bamba Peanut Snack 25G,Peanuts (50%) Corn Palm Oil Salt,534.0,34.0,6.6,1.0,3.2,0.0,0.0,0.0,0.4


In [6]:
# Number of rows in the inventory table.
len(df_inventory)

22596

In [7]:
# Number of rows (carts) in the carts table.
len(df)

1582801

In [8]:
# apriori() is fed a plain Python list of transactions (one list of product
# ids per cart).
carts_list = df['cart'].tolist()
# Intentionally not displayed: the list has one entry per row of df.

### Association rule mining

### Task 1: Triplets

For this task I set the minimum lift to 2.5, because the task requires the association to be at least 2.5 times more likely than chance. I consider items to be frequently bought together if they appear together in roughly 3000 carts (the dataset is quite big, consisting of about 1.58 million carts and more than 22k products), so the support would be 3000 / 1.58 million ≈ 0.0019, which I round down to a minimum support of 0.0018. I chose a minimum confidence of 0.3, which keeps only rules where the consequent item is found in at least 30% of the carts that contain the antecedent items. I chose this percentage to capture meaningful associations without being too strict, leaving room for new associations as well.

In [9]:
# Relative support that corresponds to 3000 carts out of all loaded carts.
3000/len(df)

0.0018953740868245597

In [10]:
# Mine the rules used for the triplet task. apriori() is wrapped in list()
# so the rules can be iterated more than once.
association_rules = list(
    apriori(
        carts_list,
        min_support=0.0018,
        min_confidence=0.3,
        min_lift=2.5,
    )
)

In [11]:
#association_rules

In [12]:
def get_product_name(product_id):
    """Return the inventory description for ``product_id``.

    Looks the id up in ``df_inventory['product_id']`` and returns the
    ``description`` of the first matching row.

    Raises
    ------
    IndexError
        If no inventory row has this ``product_id``.
    """
    return df_inventory[df_inventory['product_id'] == product_id]['description'].values[0]


# Collect one record per qualifying rule and build the frame once at the end.
# Growing a DataFrame with pd.concat inside the loop copies it on every
# iteration and, in recent pandas, warns when concatenating onto an empty frame.
triplet_records = []

for rule in association_rules:
    for ordered_statistic in rule.ordered_statistics:
        antecedent = list(ordered_statistic.items_base)
        consequent = list(ordered_statistic.items_add)

        # Keep only "two items -> one item" rules, i.e. triplets.
        if len(antecedent) != 2 or len(consequent) != 1:
            continue

        lift = ordered_statistic.lift
        if lift < 2.5:
            continue

        antecedent_products = [get_product_name(a) for a in antecedent]
        triplet_records.append({
            'Antecedent': ', '.join(antecedent_products),
            'Consequent': get_product_name(consequent[0]),
            # Support belongs to the whole itemset, so it is read from the rule.
            'Support': rule.support,
            'Confidence': ordered_statistic.confidence,
            'Lift': lift,
        })

# Passing `columns` keeps the expected schema even when no rule qualifies.
resultsT_df = pd.DataFrame(
    triplet_records,
    columns=['Antecedent', 'Consequent', 'Support', 'Confidence', 'Lift'],
)

In [13]:
# Display the triplet rules in the order they were collected.
resultsT_df

Unnamed: 0,Antecedent,Consequent,Support,Confidence,Lift
0,"Tesco White Potatoes 2.5Kg, Tesco Bananas Loose",Tesco Whole Cucumber Each,0.003285,0.337399,6.192082
1,"Tesco White Potatoes 2.5Kg, Tesco Whole Cucumb...",Tesco Bananas Loose,0.003285,0.38317,4.320964
2,"Tesco Cauliflower Each, Tesco Bananas Loose",Tesco Whole Cucumber Each,0.002051,0.328476,6.028317
3,"Tesco Cauliflower Each, Tesco Whole Cucumber Each",Tesco Bananas Loose,0.002051,0.407022,4.58994
4,"Tesco Bananas Loose, Tesco Iceberg Lettuce Each",Tesco Whole Cucumber Each,0.003808,0.538022,9.873987
5,"Tesco Whole Cucumber Each, Tesco Iceberg Lettu...",Tesco Bananas Loose,0.003808,0.330084,3.722323
6,"Tesco Mixed Peppers 3 Pack, Tesco Bananas Loose",Tesco Whole Cucumber Each,0.003093,0.439644,8.06852
7,"Tesco Mixed Peppers 3 Pack, Tesco Whole Cucumb...",Tesco Bananas Loose,0.003093,0.359477,4.053782
8,Tesco British Semi Skimmed Milk 2.272Ltr 4 Pin...,Tesco Bananas Loose,0.002872,0.410215,4.625945
9,"Tesco Broccoli 335G, Tesco Bananas Loose",Tesco Whole Cucumber Each,0.002813,0.330979,6.074254


In [24]:
# Rank the triplet rules from strongest to weakest lift and display them.
resultsT_df = resultsT_df.sort_values('Lift', ascending=False)
resultsT_df

Unnamed: 0,Antecedent,Consequent,Support,Confidence,Lift
20,"Tesco Mixed Peppers 3 Pack, Tesco Iceberg Lett...",Tesco Whole Cucumber Each,0.002361,0.625754,11.484066
22,"Tesco Iceberg Lettuce Each, Tesco Cherry Tomat...",Tesco Whole Cucumber Each,0.00181,0.622015,11.415451
18,"Tesco White Potatoes 2.5Kg, Tesco Iceberg Lett...",Tesco Whole Cucumber Each,0.002493,0.577238,10.593694
21,Tesco British Semi Skimmed Milk 2.272Ltr 4 Pin...,Tesco Whole Cucumber Each,0.001817,0.554998,10.185535
4,"Tesco Bananas Loose, Tesco Iceberg Lettuce Each",Tesco Whole Cucumber Each,0.003808,0.538022,9.873987
19,"Tesco White Potatoes 2.5Kg, Tesco Mixed Pepper...",Tesco Whole Cucumber Each,0.001874,0.478619,8.783792
6,"Tesco Mixed Peppers 3 Pack, Tesco Bananas Loose",Tesco Whole Cucumber Each,0.003093,0.439644,8.06852
13,"Tesco Bananas Loose, Tesco Cherry Tomatoes 330G",Tesco Whole Cucumber Each,0.002492,0.434361,7.971562
11,"Tesco Bunched Spring Onions 100G, Tesco Banana...",Tesco Whole Cucumber Each,0.002053,0.432756,7.942108
0,"Tesco White Potatoes 2.5Kg, Tesco Bananas Loose",Tesco Whole Cucumber Each,0.003285,0.337399,6.192082


### Task 2: Pairs

As the exercise requires the pairs to be present in at least 0.7% of the carts, I set the minimum support to 0.007. I kept the minimum confidence at 0.3 and, to require a strong association with the consequent item, I set the minimum lift to 3, so that the consequent item is at least 3 times more likely to be bought together with the antecedent item than it would be by chance (i.e. with no association at all).

In [14]:
# Mine the rules used for the pair task. apriori() is wrapped in list()
# so the rules can be iterated more than once.
association_rules2 = list(
    apriori(
        carts_list,
        min_support=0.007,
        min_confidence=0.3,
        min_lift=3,
    )
)

In [15]:
#association_rules2

In [16]:
# Build the table of pair rules (one antecedent item -> one consequent item).
results_list = []

# enumerate() supplies the index used in the diagnostic message below; the
# name `idx` was previously never defined, so that branch raised NameError.
for idx, rule in enumerate(association_rules2):
    if len(rule.items) != 2:
        print(f'The rule at index {idx} is not a pair.')
        continue

    # A two-item set can qualify in both directions (A -> B and B -> A), so
    # every ordered statistic is inspected rather than only the first one.
    for ordered_statistic in rule.ordered_statistics:
        antecedent = list(ordered_statistic.items_base)
        consequent = list(ordered_statistic.items_add)

        if len(antecedent) != 1 or len(consequent) != 1:
            continue

        results_list.append({
            'Antecedent': get_product_name(antecedent[0]),
            'Consequent': get_product_name(consequent[0]),
            # Support belongs to the whole itemset, so it is read from the rule.
            'Support': rule.support,
            'Confidence': ordered_statistic.confidence,
            'Lift': ordered_statistic.lift
        })

# Passing `columns` keeps the expected schema (including 'Lift') even when
# no pair qualifies.
results_df = pd.DataFrame(
    results_list,
    columns=['Antecedent', 'Consequent', 'Support', 'Confidence', 'Lift'],
)


In [17]:
# Rank the pair rules from strongest to weakest lift and display them.
results_df = results_df.sort_values('Lift', ascending=False)
results_df

Unnamed: 0,Antecedent,Consequent,Support,Confidence,Lift
1,Tesco Iceberg Lettuce Each,Tesco Whole Cucumber Each,0.011538,0.465464,8.542365
2,Tesco Cherry Tomatoes 330G,Tesco Whole Cucumber Each,0.00718,0.358995,6.58842
0,Tesco Whole Cucumber Each,Tesco Bananas Loose,0.016375,0.300528,3.389015
