# Pattern Analysis
https://web.ist.utl.pt/rmch/dash/guides/AssociationRuleMining%20in%20Python.html

Association Pattern Analysis

- Which aisles are most often purchased together?
- Which aisles are most often reordered together?
- Which aisles are most often ordered on their own?

ARM Resources
- https://www.kaggle.com/code/datatheque/association-rules-mining-market-basket-analysis/notebook
- https://comum.rcaap.pt/bitstream/10400.26/37552/1/Tese_JoanaOliveira.pdf

# File and libraries

In [1]:
import pandas as pd
import numpy as np


# Location of the pre-processed Instacart dataset, relative to the notebook.
# A forward slash is used as the separator because it resolves on both
# Windows and POSIX systems, whereas a backslash only works on Windows.
filepath = 'data/instacart_pre_proc.csv'

# Label for this dataset
file_tag = "Instacart Market Basket"


## DSLabs

In [2]:
%run "scripts/dslabs_functions.py"

# data functions

In [3]:
%run "scripts/data_functions.py"

data_functions lodaded


# Load

In [4]:
# Run mode switch: True selects the smaller sample used for quick test runs;
# set to False to use the larger sample instead.
test_data = True


# Define the sampling function
def sample_user_orders(data, fraction=0.1, random_state=None):
    """Return the rows belonging to a random subset of (user_id, order_id) pairs.

    Sampling is performed on the distinct (user_id, order_id) pairs rather
    than on individual rows, so every sampled order keeps all of its rows.

    Args:
        data: DataFrame with at least ``user_id`` and ``order_id`` columns.
        fraction: Share of the distinct (user_id, order_id) pairs to keep;
            forwarded to ``DataFrame.sample(frac=...)``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            same sample can be drawn again. The default ``None`` keeps the
            original unseeded behaviour.

    Returns:
        The rows of ``data`` whose (user_id, order_id) pair was sampled, with
        their original index and row order.
    """
    key_cols = ['user_id', 'order_id']

    # Get unique user_ids and order_ids, then draw the sample of pairs
    unique_user_orders = data[key_cols].drop_duplicates()
    sampled_user_orders = unique_user_orders.sample(
        frac=fraction, random_state=random_state
    )

    # Filter the dataset to include only the sampled users and orders.
    # MultiIndex membership compares the key pairs in a vectorised way, which
    # avoids building one Python tuple per row with apply(tuple, axis=1).
    row_keys = pd.MultiIndex.from_frame(data[key_cols])
    sampled_keys = pd.MultiIndex.from_frame(sampled_user_orders)
    sampled_data = data[row_keys.isin(sampled_keys)]
    return sampled_data



# Both run modes read the same file; they differ only in the sample size.
data = pd.read_csv(filepath)

# Fraction of distinct (user_id, order_id) pairs to keep:
# 1% for quick test runs, 5% otherwise.
sample = 0.01 if test_data else 0.05
data = sample_user_orders(data, fraction=sample)

# enrich_instacart_df is not defined in this notebook; presumably it comes
# from one of the scripts loaded with %run above. TODO confirm which one.
data = enrich_instacart_df(data)

# Summarise columns, dtypes and non-null counts of the prepared frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31950 entries, 0 to 31949
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   order_id                 31950 non-null  int64  
 1   product_id               31950 non-null  int64  
 2   add_to_cart_order        31950 non-null  int64  
 3   reordered                31950 non-null  int64  
 4   user_id                  31950 non-null  int64  
 5   order_number             31950 non-null  int64  
 6   order_dow                31950 non-null  int64  
 7   order_hour_of_day        31950 non-null  int64  
 8   days_since_prior_order   29887 non-null  float64
 9   order_time_of_day        31950 non-null  object 
 10  is_weekend               31950 non-null  int64  
 11  weeks_since_prior_order  29887 non-null  float64
 12  order_time_of_day_enc    31950 non-null  float64
 13  order_hour_of_day_sin    31950 non-null  float64
 14  order_hour_of_day_cos 

## drop nulls

- Dropping nulls removes every first-time order from the dataframe, because those rows have no prior order and therefore a null `days_since_prior_order`.

In [5]:
# Keep only the rows that have a value in every column
data = data.dropna(axis=0, how='any')

# Aisle Order ID Mining
- Group the rows by order ID and one-hot encode (dummify) the aisles

In [6]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score


# One row per order, one column per aisle. The per-cell product count is
# collapsed to a 0/1 flag marking whether the order contains that aisle.
aisle_df = (
    data.pivot_table(
        index='order_id',
        columns='aisle',
        values='product_id',
        aggfunc='count',
        fill_value=0,
    )
    .gt(0)
    .astype(int)
)
aisle_df


aisle,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,baking ingredients,baking supplies decor,beauty,beers coolers,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1869,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
5345,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5507,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3417091,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3417095,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3419500,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3419974,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Top Aisles Association Pattern Mining

In [7]:
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Generate frequent itemsets using fpgrowth (itemsets present in at least 1% of orders).
# mlxtend recommends a boolean one-hot frame and warns that non-bool input is
# slower and may stop being supported, so the 0/1 frame is cast at the call
# site; aisle_df itself keeps its integer dtype.
frequent_itemsets = fpgrowth(aisle_df.astype(bool), min_support=0.01, use_colnames=True)

# Generate association rules, keeping those with a lift of at least 1.0
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Display the main metrics
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage', 'conviction']]



Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage,conviction
0,(fresh vegetables),(fresh fruits),0.320598,0.719612,1.286243,0.071346,1.571151
1,(fresh fruits),(fresh vegetables),0.320598,0.573040,1.286243,0.071346,1.298683
2,(fresh vegetables),(refrigerated),0.064120,0.143922,1.099509,0.005803,1.015215
3,(refrigerated),(fresh vegetables),0.064120,0.489848,1.099509,0.005803,1.086901
4,(refrigerated),(fresh fruits),0.083056,0.634518,1.134144,0.009824,1.205343
...,...,...,...,...,...,...,...
22219,(milk),"(frozen pizza, fresh fruits)",0.010963,0.044898,1.571429,0.003987,1.017094
22220,(frozen pizza),"(milk, fresh fruits)",0.010963,0.257812,1.583705,0.004041,1.128029
22221,(fresh fruits),"(milk, frozen pizza)",0.010963,0.019596,1.340558,0.002785,1.005078
22222,(missing),(fresh fruits),0.010963,0.660000,1.179691,0.001670,1.295681


# Top Products Orders Mining

## Dataset Prep

In [11]:
# Number of most-purchased products to keep for the product-level mining
n_top_products = 1000

# Group by order_id; the highest add_to_cart_order in an order is used as
# that order's product count
multi_product_order_agg = data.groupby('order_id').agg({
    'add_to_cart_order': 'max',
}).rename(columns={'add_to_cart_order': 'num_products'})

# Filter to orders where num_products > 1
multi_product_order_agg = multi_product_order_agg[multi_product_order_agg['num_products'] > 1]

# Merge with orders data; the inner join keeps only rows of multi-product orders
data_multi = data.merge(multi_product_order_agg, on='order_id', how='inner')

# Group by product_name and get the top n_top_products products purchased
# (ranked by the count of unique order ids)
top_products = (
    data_multi.groupby('product_name')['order_id']
    .nunique()
    .nlargest(n_top_products)
    .index.tolist()
)

# Filter the dataframe to include only those top products
data_multi = data_multi[data_multi['product_name'].isin(top_products)]

In [14]:
# One row per order, one column per product. The per-cell count is collapsed
# to a 0/1 flag marking whether the order contains that product.
product_crosstab = (
    data_multi.pivot_table(
        index='order_id',
        columns='product_name',
        values='product_id',
        aggfunc='count',
        fill_value=0,
    )
    .gt(0)
    .astype(int)
)
product_crosstab

product_name,0% Fat Free Organic Milk,0% Greek Strained Yogurt,1% Low Fat Milk,1% Lowfat Milk,100 Calorie Per Bag Popcorn,100% Apple Juice,100% Natural Spring Water,100% Raw Coconut Water,100% Recycled Bath Tissue Rolls,100% Recycled Paper Towels,...,YoKids Blueberry & Strawberry/Vanilla Yogurt,YoKids Squeeze! Organic Strawberry Flavor Yogurt,"YoKids Squeezers Organic Low-Fat Yogurt, Strawberry",YoKids Strawberry Banana/Strawberry Yogurt,Yobaby Organic Plain Yogurt,"Yogurt, Strained Low-Fat, Coconut",Yotoddler Organic Pear Spinach Mango Yogurt,Yukon Gold Potatoes 5lb Bag,ZBar Organic Chocolate Brownie Energy Snack,Zero Calorie Cola
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1869,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5345,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5507,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3416036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3417091,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3417095,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3419974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Top Products Association Pattern Mining

In [15]:
# Generate frequent itemsets using fpgrowth (itemsets present in at least 1% of orders).
# mlxtend recommends a boolean one-hot frame and warns that non-bool input is
# slower and may stop being supported, so the 0/1 frame is cast at the call
# site; product_crosstab itself keeps its integer dtype.
frequent_products = fpgrowth(product_crosstab.astype(bool), min_support=0.01, use_colnames=True)

# Generate association rules, keeping those with a lift of at least 1.0
rules_product = association_rules(frequent_products, metric="lift", min_threshold=1.0)

# Display the full rules table with every metric column. The earlier
# column-subset expression was removed: it was not the last expression of
# the cell, so its result was discarded and never displayed.
rules_product



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Banana),(Large Lemon),0.1705,0.046002,0.013509,0.079229,1.72229,1.0,0.005665,1.036086,0.505579,0.066547,0.034829,0.18644
1,(Large Lemon),(Banana),0.046002,0.1705,0.013509,0.293651,1.72229,1.0,0.005665,1.174348,0.4396,0.066547,0.148464,0.18644
2,(Organic Raspberries),(Organic Strawberries),0.047097,0.083607,0.011683,0.248062,2.966995,1.0,0.007745,1.218708,0.695726,0.09816,0.179459,0.1939
3,(Organic Strawberries),(Organic Raspberries),0.083607,0.047097,0.011683,0.139738,2.966995,1.0,0.007745,1.107689,0.723444,0.09816,0.097219,0.1939
4,(Banana),(Organic Baby Spinach),0.1705,0.087623,0.023001,0.134904,1.539588,1.0,0.008061,1.054653,0.422514,0.097826,0.051821,0.198702
5,(Organic Baby Spinach),(Banana),0.087623,0.1705,0.023001,0.2625,1.539588,1.0,0.008061,1.124746,0.384135,0.097826,0.11091,0.198702
6,(Bag of Organic Bananas),(Organic Baby Spinach),0.124133,0.087623,0.017525,0.141176,1.611176,1.0,0.006648,1.062357,0.433097,0.090226,0.058696,0.170588
7,(Organic Baby Spinach),(Bag of Organic Bananas),0.087623,0.124133,0.017525,0.2,1.611176,1.0,0.006648,1.094834,0.415766,0.090226,0.086619,0.170588
8,(Banana),(Cucumber Kirby),0.1705,0.032859,0.012778,0.074946,2.280871,1.0,0.007176,1.045498,0.676999,0.06705,0.043518,0.231918
9,(Cucumber Kirby),(Banana),0.032859,0.1705,0.012778,0.388889,2.280871,1.0,0.007176,1.357363,0.58065,0.06705,0.263278,0.231918
