# Pattern Analysis
https://web.ist.utl.pt/rmch/dash/guides/AssociationRuleMining%20in%20Python.html

Association Pattern Analysis

- What are the aisles most purchased together?
- Which aisles are most reordered together?
- Which aisles are most often ordered on their own?

ARM Resources
- https://www.kaggle.com/code/datatheque/association-rules-mining-market-basket-analysis/notebook
- https://comum.rcaap.pt/bitstream/10400.26/37552/1/Tese_JoanaOliveira.pdf

# File and libraries

In [11]:
import pandas as pd
import numpy as np


# Location of the full pre-processed Instacart CSV, relative to the notebook.
# Forward slashes are used (as in the %run lines of this notebook) because a
# backslash is only a path separator on Windows, while "/" resolves everywhere.
filepath = 'data/instacart_pre_proc.csv'

# Label for this dataset. NOTE(review): not referenced in the visible cells;
# presumably consumed by helper/plotting code elsewhere — confirm.
file_tag = "Instacart Market Basket"


## DSLabs

In [12]:
# Execute the DSLabs helper script with IPython's %run magic; once it finishes,
# the names it defined are loaded into the notebook's namespace.
%run "scripts/dslabs_functions.py"

# data functions

In [13]:
# Execute the project's data helper script with IPython's %run magic.
# NOTE(review): enrich_instacart_df, called in the Load section, is not defined
# in this notebook — presumably it comes from this script; confirm.
%run "scripts/data_functions.py"

data_functions lodaded


# Load

In [14]:
# Quick-experiment switch read by the loading logic further down this cell:
# True  -> the loaded rows are sub-sampled by order before enrichment,
# False -> every loaded row is kept.
test_data = True


# Define the sampling function
def sample_user_orders(data, fraction=0.1, random_state=None):
    """Return the rows of ``data`` that belong to a random subset of orders.

    Sampling is performed on the distinct ``(user_id, order_id)`` pairs rather
    than on individual rows, so every sampled order keeps all of its rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the ``user_id`` and ``order_id`` columns.
    fraction : float, default 0.1
        Share of the distinct (user_id, order_id) pairs to keep; forwarded to
        ``DataFrame.sample(frac=...)``.
    random_state : int, numpy RandomState/Generator or None, default None
        Forwarded to ``DataFrame.sample``. Pass a fixed value to obtain the
        same sample on every run; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` whose key pair was sampled, with their original
        index labels and in their original order.
    """
    key_columns = ['user_id', 'order_id']

    # Draw the sample over distinct orders, not over product rows
    unique_user_orders = data[key_columns].drop_duplicates()
    sampled_user_orders = unique_user_orders.sample(frac=fraction, random_state=random_state)

    # Vectorised membership test on the key pair. This replaces building one
    # Python tuple per row with apply(axis=1), which is slow on large frames
    # and does not yield a usable mask when the frame is empty.
    row_keys = pd.MultiIndex.from_frame(data[key_columns])
    sampled_keys = pd.MultiIndex.from_frame(sampled_user_orders)
    keep_mask = row_keys.isin(sampled_keys)

    return data[keep_mask]



# NOTE(review): both branches read the same "*_sample.csv" file, and `filepath`
# (the non-sample CSV configured at the top of the notebook) is never read.
# Confirm whether the else branch was meant to load `filepath` instead.
if test_data:
    # Quick mode: load the sample CSV, then keep a random 10% of its orders
    data = pd.read_csv('data/instacart_pre_proc_sample.csv')

    # Seed NumPy's global RNG so the sub-sample (and every rule mined from it)
    # is identical on each Restart & Run All. DataFrame.sample draws from this
    # global state when it is not given an explicit random_state.
    np.random.seed(42)

    sample = 0.1
    data = sample_user_orders(data, fraction=sample)

else:
    # Full mode: load the sample CSV and keep every row (no further sub-sampling)
    data = pd.read_csv('data/instacart_pre_proc_sample.csv')


# enrich_instacart_df is not defined in this notebook.
# NOTE(review): presumably provided by scripts/data_functions.py — confirm.
data = enrich_instacart_df(data)


# Summarise columns, dtypes and non-null counts of the enriched frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 648168 entries, 0 to 648167
Data columns (total 24 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   order_id                 648168 non-null  int64  
 1   product_id               648168 non-null  int64  
 2   add_to_cart_order        648168 non-null  int64  
 3   reordered                648168 non-null  int64  
 4   user_id                  648168 non-null  int64  
 5   order_number             648168 non-null  int64  
 6   order_dow                648168 non-null  int64  
 7   order_hour_of_day        648168 non-null  int64  
 8   days_since_prior_order   606287 non-null  float64
 9   order_time_of_day        648168 non-null  object 
 10  is_weekend               648168 non-null  int64  
 11  weeks_since_prior_order  606287 non-null  float64
 12  order_time_of_day_enc    648168 non-null  float64
 13  order_hour_of_day_sin    648168 non-null  float64
 14  orde

## drop nulls

- This drops every first order from the dataframe: first orders have no `days_since_prior_order` value, so their rows contain nulls.

In [5]:
# Discard every row that has a missing value in at least one column
data = data.dropna(axis=0, how='any')

# Aisle Order ID Mining
- Group rows by order ID and dummify the aisles (one boolean column per aisle)

In [8]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score


# Count the product rows for each (order, aisle) pair; pairs that never occur get 0
aisle_counts = pd.pivot_table(
    data,
    index='order_id',
    columns='aisle',
    values='product_id',
    aggfunc='count',
    fill_value=0,
)

# Collapse the counts to presence flags: True when the order has at least one
# product from the aisle (boolean input is what the itemset mining below receives)
aisle_df = aisle_counts.astype(bool)


## Top Aisles Association Pattern Mining

In [9]:
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Mine frequent aisle itemsets with FP-Growth, using a minimum support of 0.01
# and keeping the aisle names (not column positions) in the itemsets
frequent_itemsets = fpgrowth(aisle_df, use_colnames=True, min_support=0.01)

# Turn the itemsets into association rules, filtered on lift with threshold 1.0
rules = association_rules(frequent_itemsets, min_threshold=1.0, metric="lift")

# Show only the headline metrics of each rule
main_metrics = ['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage', 'conviction']
rules[main_metrics]

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage,conviction
0,(fresh fruits),(packaged vegetables fruits),0.267542,0.481067,1.324864,0.065603,1.227313
1,(packaged vegetables fruits),(fresh fruits),0.267542,0.736816,1.324864,0.065603,1.686482
2,(fresh vegetables),(packaged vegetables fruits),0.231071,0.526848,1.450946,0.071816,1.346065
3,(packaged vegetables fruits),(fresh vegetables),0.231071,0.636372,1.450946,0.071816,1.543910
4,"(fresh fruits, fresh vegetables)",(packaged vegetables fruits),0.183089,0.586508,1.615250,0.069739,1.540279
...,...,...,...,...,...,...,...
21761,"(fresh fruits, fresh vegetables)",(tea),0.017887,0.057299,1.030790,0.000534,1.001816
21762,"(tea, fresh vegetables)",(fresh fruits),0.017887,0.750000,1.348571,0.004623,1.775423
21763,(fresh fruits),"(tea, fresh vegetables)",0.017887,0.032163,1.348571,0.004623,1.008589
21764,(tea),"(fresh fruits, fresh vegetables)",0.017887,0.321781,1.030790,0.000534,1.014172


# Top Products Orders Mining

## Dataset Prep

In [15]:
# Number of best-selling products kept for the product-level mining.
# (Earlier comments said "top 300", but the code has always kept 1000.)
TOP_N_PRODUCTS = 1000

# Per order, take the highest add_to_cart_order value as the order's product count
multi_product_order_agg = (
    data.groupby('order_id')
    .agg({'add_to_cart_order': 'max'})
    .rename(columns={'add_to_cart_order': 'num_products'})
)

# Keep only orders with more than one product
multi_product_order_agg = multi_product_order_agg[multi_product_order_agg['num_products'] > 1]

# Inner merge back onto the row-level data: drops single-product orders and
# adds the num_products column to the remaining rows
data_multi = data.merge(multi_product_order_agg, on='order_id', how='inner')

# Rank products by the number of distinct orders they appear in and keep the top N names
top_products = (
    data_multi.groupby('product_name')['order_id']
    .nunique()
    .nlargest(TOP_N_PRODUCTS)
    .index.tolist()
)

# Restrict the rows to those top-N products
data_multi = data_multi[data_multi['product_name'].isin(top_products)]

In [16]:
# Count the rows for each (order, product) pair; pairs that never occur get 0
product_counts = pd.pivot_table(
    data_multi,
    index='order_id',
    columns='product_name',
    values='product_id',
    aggfunc='count',
    fill_value=0,
)

# Collapse the counts to presence flags: True when the product is in the order
product_crosstab = product_counts.astype(bool)


## Top Products Association Pattern Mining

In [17]:
# Mine frequent product itemsets with FP-Growth, using a minimum support of 0.01
frequent_products = fpgrowth(product_crosstab, min_support=0.01, use_colnames=True)

# Turn the itemsets into association rules, filtered on lift with threshold 1.0
rules_product = association_rules(frequent_products, metric="lift", min_threshold=1.0)

# Display the full rules table with every metric column. The cell previously
# also evaluated a column subset just before this line, but a notebook only
# renders the last expression of a cell, so that statement was computed and
# discarded; it has been removed without changing what is shown.
rules_product

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Organic Hass Avocado),(Organic Strawberries),0.072581,0.091279,0.013823,0.190453,2.086487,1.0,0.007198,1.122505,0.561478,0.092133,0.109136,0.170947
1,(Organic Strawberries),(Organic Hass Avocado),0.091279,0.072581,0.013823,0.15144,2.086487,1.0,0.007198,1.092932,0.573031,0.092133,0.08503,0.170947
2,(Organic Hass Avocado),(Bag of Organic Bananas),0.072581,0.130138,0.020474,0.282082,2.16756,1.0,0.011028,1.211645,0.580808,0.112342,0.174676,0.219703
3,(Bag of Organic Bananas),(Organic Hass Avocado),0.130138,0.072581,0.020474,0.157324,2.16756,1.0,0.011028,1.100564,0.619238,0.112342,0.091375,0.219703
4,(Organic Hass Avocado),(Organic Baby Spinach),0.072581,0.082488,0.011995,0.165267,2.003545,1.0,0.006008,1.099169,0.540085,0.08384,0.090222,0.155344
5,(Organic Baby Spinach),(Organic Hass Avocado),0.082488,0.072581,0.011995,0.14542,2.003545,1.0,0.006008,1.085233,0.545916,0.08384,0.078539,0.155344
6,(Bag of Organic Bananas),(Organic Raspberries),0.130138,0.046989,0.014032,0.107826,2.29471,1.0,0.007917,1.06819,0.648626,0.086038,0.063837,0.203228
7,(Organic Raspberries),(Bag of Organic Bananas),0.046989,0.130138,0.014032,0.298629,2.29471,1.0,0.007917,1.240231,0.592034,0.086038,0.193699,0.203228
8,(Organic Strawberries),(Organic Raspberries),0.091279,0.046989,0.011699,0.128171,2.72768,1.0,0.00741,1.093117,0.697011,0.092435,0.085185,0.188576
9,(Organic Raspberries),(Organic Strawberries),0.046989,0.091279,0.011699,0.248981,2.72768,1.0,0.00741,1.209984,0.664618,0.092435,0.173543,0.188576
