# Pattern Analysis
https://web.ist.utl.pt/rmch/dash/guides/AssociationRuleMining%20in%20Python.html

Association Pattern Analysis

- What are the aisles most purchased together?
- Which aisles are most reordered together?
- Which aisles are most often ordered in isolation (on their own)?

ARM Resources
- https://www.kaggle.com/code/datatheque/association-rules-mining-market-basket-analysis/notebook
- https://comum.rcaap.pt/bitstream/10400.26/37552/1/Tese_JoanaOliveira.pdf

# File and libraries

In [1]:
import pandas as pd
import numpy as np


# Relative location of the pre-processed Instacart CSV (Windows-style separator).
filepath = r'data\instacart_pre_proc.csv'

# Human-readable label for the dataset.
file_tag = "Instacart Market Basket"


## DSLabs

In [2]:
%run "scripts/dslabs_functions.py"

# data functions

In [3]:
%run "scripts/data_functions.py"

data_functions lodaded


# Load

In [4]:
# True  -> load the small pre-sampled CSV for quick iteration.
# False -> load the full CSV and sub-sample it by (user_id, order_id).
test_data = True
# test_data = False


# Define the sampling function
def sample_user_orders(data, fraction=0.1, random_state=None):
    """Return the rows belonging to a random sample of (user_id, order_id) pairs.

    Sampling is performed on the distinct (user_id, order_id) pairs rather
    than on individual rows, so every selected order keeps all of its rows.

    Args:
        data: DataFrame containing at least ``user_id`` and ``order_id``
            columns.
        fraction: Share of the distinct pairs to keep; forwarded to
            ``DataFrame.sample`` as ``frac``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            sample can be reproduced. ``None`` (the default) keeps the
            previous unseeded behaviour.

    Returns:
        The subset of ``data`` whose (user_id, order_id) pair was sampled,
        with the original index and row order preserved.
    """
    key_columns = ['user_id', 'order_id']

    # Draw the sample over distinct pairs, not over rows.
    unique_user_orders = data[key_columns].drop_duplicates()
    sampled_user_orders = unique_user_orders.sample(
        frac=fraction, random_state=random_state
    )

    # Vectorised membership test on the key pair; this avoids building a
    # Python tuple per row with apply(tuple, axis=1).
    row_keys = pd.MultiIndex.from_frame(data[key_columns])
    sampled_keys = pd.MultiIndex.from_frame(sampled_user_orders)
    return data[row_keys.isin(sampled_keys)]



if test_data:
    # Quick-iteration mode: load the pre-built sample file as-is.
    data = pd.read_csv(r'data\instacart_pre_proc_sample.csv')

    # Optional: shrink the sample further (keeps 10% of the distinct
    # (user_id, order_id) pairs).
    # data = sample_user_orders(data, fraction=0.1)
else:
    # Full mode: load the complete pre-processed file ...
    data = pd.read_csv(r'data\instacart_pre_proc.csv')

    # ... and keep a random 5% of the distinct (user_id, order_id) pairs.
    sample = 0.05
    data = sample_user_orders(data, fraction=sample)

# enrich_instacart_df is not defined in this notebook; presumably it comes
# from one of the scripts loaded with %run above - TODO confirm.
data = enrich_instacart_df(data)

# Print the column/dtype summary of the enriched frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6488743 entries, 0 to 6488742
Data columns (total 25 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   order_id                 int64  
 1   product_id               int64  
 2   add_to_cart_order        int64  
 3   reordered                int64  
 4   user_id                  int64  
 5   order_number             int64  
 6   order_dow                int64  
 7   order_hour_of_day        int64  
 8   days_since_prior_order   float64
 9   order_time_of_day        object 
 10  is_weekend               int64  
 11  is_peak_time_of_day      int64  
 12  weeks_since_prior_order  float64
 13  order_time_of_day_enc    float64
 14  order_hour_of_day_sin    float64
 15  order_hour_of_day_cos    float64
 16  order_dow_sin            float64
 17  order_dow_cos            float64
 18  num_products             int64  
 19  reorder_rate             float64
 20  product_name             object 
 21  aisle_id

## drop nulls

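- Dropping every row that contains a null value removes all first-time orders from the dataframe (presumably because a first order has no `days_since_prior_order` value).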

In [5]:
# Discard every row that has a null in at least one column.
data = data.dropna(axis=0, how='any')

# Aisle Order ID Mining
- Group the rows by order ID and one-hot encode (dummify) the aisles, so each order becomes one row of aisle flags

In [6]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score


# One row per order, one column per aisle. The pivot counts the products
# bought from each aisle; casting to bool reduces that to "was this aisle
# present in the order?".
aisle_df = (
    data
    .pivot_table(
        index='order_id',
        columns='aisle',
        values='product_id',
        aggfunc='count',
        fill_value=0,
    )
    .astype(bool)
)


## Top Aisles Association Pattern Mining

In [7]:
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Mine the aisle combinations present in at least 1% of the orders.
frequent_itemsets = fpgrowth(
    aisle_df,
    min_support=0.01,
    use_colnames=True,
)

# Turn the frequent itemsets into rules, filtered on lift with a 1.0 threshold.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.0,
)

# Show only the headline metrics of each rule.
rules[[
    'antecedents',
    'consequents',
    'support',
    'confidence',
    'lift',
    'leverage',
    'conviction',
]]

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage,conviction
0,(packaged cheese),(refrigerated),0.037192,0.162491,1.217826,0.006652,1.034703
1,(refrigerated),(packaged cheese),0.037192,0.278746,1.217826,0.006652,1.069127
2,(refrigerated),(fresh fruits),0.086638,0.649328,1.162179,0.012090,1.258396
3,(fresh fruits),(refrigerated),0.086638,0.155066,1.162179,0.012090,1.025610
4,(yogurt),(refrigerated),0.047040,0.178045,1.334400,0.011788,1.054283
...,...,...,...,...,...,...,...
22285,(fresh fruits),"(dish detergents, fresh vegetables)",0.010436,0.018679,1.363320,0.002781,1.005073
22286,(missing),(fresh fruits),0.012543,0.656772,1.175503,0.001873,1.285690
22287,(fresh fruits),(missing),0.012543,0.022449,1.175503,0.001873,1.003429
22288,(missing),(fresh vegetables),0.011007,0.576341,1.296436,0.002517,1.311059


# Top Products Orders Mining

## Dataset Prep

In [8]:
# How many of the most-ordered products to keep for product-level mining.
top_n_products = 1000

# Per-order basket size, taken as the highest add_to_cart_order position
# seen in each order.
multi_product_order_agg = (
    data.groupby('order_id')
    .agg({'add_to_cart_order': 'max'})
    .rename(columns={'add_to_cart_order': 'num_products'})
)

# Keep only the orders that contain more than one product.
multi_product_order_agg = multi_product_order_agg[multi_product_order_agg['num_products'] > 1]

# The inner merge restricts the rows to those multi-product orders.
# NOTE(review): the data.info() output lists an existing `num_products`
# column, so this merge presumably yields suffixed `num_products_x` /
# `num_products_y` columns - confirm nothing downstream expects the plain name.
data_multi = data.merge(multi_product_order_agg, on='order_id', how='inner')

# Rank products by the number of distinct orders they appear in and keep the
# top `top_n_products` names.
top_products = (
    data_multi.groupby('product_name')['order_id']
    .nunique()
    .nlargest(top_n_products)
    .index.tolist()
)

# Restrict the rows to those top products.
data_multi = data_multi[data_multi['product_name'].isin(top_products)]

In [9]:
# One row per order, one column per product name. The pivot counts the rows
# for each product; casting to bool keeps only presence/absence.
product_crosstab = (
    data_multi
    .pivot_table(
        index='order_id',
        columns='product_name',
        values='product_id',
        aggfunc='count',
        fill_value=0,
    )
    .astype(bool)
)


## top products association pattern mining

In [10]:
# Mine the product combinations present in at least 1% of the orders.
frequent_products = fpgrowth(product_crosstab, min_support=0.01, use_colnames=True)

# Turn the frequent itemsets into rules, filtered on lift with a 1.0 threshold.
rules_product = association_rules(frequent_products, metric="lift", min_threshold=1.0)

# Display every rule metric. The earlier column-subset expression was removed:
# its result was discarded because only the final expression of a cell is
# rendered, so the full table was what actually got shown.
rules_product

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Banana),(Organic Avocado),0.163928,0.059773,0.018223,0.111163,1.859736,1.0,0.008424,1.057816,0.55293,0.088684,0.054656,0.208013
1,(Organic Avocado),(Banana),0.059773,0.163928,0.018223,0.304863,1.859736,1.0,0.008424,1.202744,0.491679,0.088684,0.168568,0.208013
2,(Organic Baby Spinach),(Organic Avocado),0.083255,0.059773,0.010509,0.126228,2.111783,1.0,0.005533,1.076055,0.574278,0.079303,0.07068,0.151023
3,(Organic Avocado),(Organic Baby Spinach),0.059773,0.083255,0.010509,0.175817,2.111783,1.0,0.005533,1.112307,0.559936,0.079303,0.100968,0.151023
4,(Banana),(Organic Baby Spinach),0.163928,0.083255,0.017672,0.107802,1.294833,1.0,0.004024,1.027512,0.272344,0.076997,0.026776,0.160031
5,(Organic Baby Spinach),(Banana),0.083255,0.163928,0.017672,0.21226,1.294833,1.0,0.004024,1.061354,0.248378,0.076997,0.057808,0.160031
6,(Organic Baby Spinach),(Bag of Organic Bananas),0.083255,0.132449,0.017393,0.208917,1.577342,1.0,0.006366,1.096663,0.399263,0.087708,0.088143,0.17012
7,(Bag of Organic Bananas),(Organic Baby Spinach),0.132449,0.083255,0.017393,0.131322,1.577342,1.0,0.006366,1.055333,0.421903,0.087708,0.052432,0.17012
8,(Organic Strawberries),(Organic Baby Spinach),0.091822,0.083255,0.013049,0.14211,1.706915,1.0,0.005404,1.068604,0.45602,0.080534,0.064199,0.149421
9,(Organic Baby Spinach),(Organic Strawberries),0.083255,0.091822,0.013049,0.156733,1.706915,1.0,0.005404,1.076975,0.451759,0.080534,0.071473,0.149421
