In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
events = pd.read_csv('dataset/events.csv')
# Load both parts
items_part1 = pd.read_csv('dataset/item_properties_part1.csv')
items_part2 = pd.read_csv('dataset/item_properties_part2.csv')

# Concatenate them vertically (assuming same columns)
items = pd.concat([items_part1, items_part2], ignore_index=True)

# Create sessions (flows)
# Group events by visitor within time windows (e.g., 30 min inactivity = new session)
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')
events = events.sort_values(['visitorid', 'timestamp'])

def create_sessions(df, timeout_minutes=30):
    df['time_diff'] = df.groupby('visitorid')['timestamp'].diff()
    df['new_session'] = (df['time_diff'] > pd.Timedelta(minutes=timeout_minutes)) | df['time_diff'].isna()
    df['sessionid'] = df.groupby('visitorid')['new_session'].cumsum()
    return df

events = create_sessions(events)

In [4]:
#feature engineering
def extract_flow_features(events_df):
    """
    Extract features for each session (flow)
    Similar to flow statistics in network traffic
    """
    flow_features = events_df.groupby(['visitorid', 'sessionid']).agg({
        'timestamp': ['min', 'max', 'count'],  # Session start, end, event count
        'itemid': 'nunique',  # Unique items viewed
        'event': lambda x: (x == 'view').sum(),  # View count
    }).reset_index()
    
    # Flatten column names
    flow_features.columns = ['visitorid', 'sessionid', 'start_time', 'end_time', 
                              'event_count', 'unique_items', 'view_count']
    
    # Calculate derived features
    flow_features['duration_seconds'] = (
        flow_features['end_time'] - flow_features['start_time']
    ).dt.total_seconds()
    
    # Add behavioral features
    addtocart_counts = events_df[events_df['event'] == 'addtocart'].groupby(
        ['visitorid', 'sessionid']
    ).size().reset_index(name='addtocart_count')
    
    transaction_counts = events_df[events_df['event'] == 'transaction'].groupby(
        ['visitorid', 'sessionid']
    ).size().reset_index(name='transaction_count')
    
    flow_features = flow_features.merge(addtocart_counts, on=['visitorid', 'sessionid'], how='left')
    flow_features = flow_features.merge(transaction_counts, on=['visitorid', 'sessionid'], how='left')
    flow_features = flow_features.fillna(0)
    
    # Calculate ratios
    flow_features['view_to_cart_ratio'] = flow_features['addtocart_count'] / (flow_features['view_count'] + 1)
    flow_features['cart_to_purchase_ratio'] = flow_features['transaction_count'] / (flow_features['addtocart_count'] + 1)
    flow_features['events_per_minute'] = flow_features['event_count'] / (flow_features['duration_seconds'] / 60 + 1)
    
    # Time-based features
    flow_features['hour_of_day'] = flow_features['start_time'].dt.hour
    flow_features['day_of_week'] = flow_features['start_time'].dt.dayofweek
    
    return flow_features

flow_data = extract_flow_features(events)
flow_data.to_csv("flow_features.csv", index=False)  # Add this for output like the first notebook

In [5]:
import pandas as pd
import numpy as np


# ============================================================================
# PHASE 4: ASSOCIATION RULE MINING
# ============================================================================

print("="*80)
print("PHASE 4: ASSOCIATION RULE MINING")
print("="*80)

# Load the clustered flow data from R
flow_features_clustered = pd.read_csv('flow_features_clustered.csv')
print(f"\nLoaded clustered flow data: {flow_features_clustered.shape}")
print(f"Clusters found: {flow_features_clustered['cluster'].nunique()}")
print(f"\nCluster distribution:\n{flow_features_clustered['cluster'].value_counts().sort_index()}")

# Also load original events data for transaction-level rules
events = pd.read_csv('events.csv')
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')
print(f"\nLoaded events data: {events.shape}")

# ----------------------------------------------------------------------------
# 4.1: Product-Level Association Rules (Market Basket Analysis)
# ----------------------------------------------------------------------------
print("\n" + "-"*80)
print("4.1: PRODUCT-LEVEL ASSOCIATION RULES (MARKET BASKET ANALYSIS)")
print("-"*80)

# Get transactions only
transactions = events[events['event'] == 'transaction'].copy()
print(f"\nTotal transactions: {transactions['transactionid'].nunique()}")

# Group items by transaction
transaction_lists = transactions.groupby('transactionid')['itemid'].apply(list).values

# Filter transactions with at least 2 items
transaction_lists = [t for t in transaction_lists if len(t) >= 2]
print(f"Transactions with 2+ items: {len(transaction_lists)}")

if len(transaction_lists) > 0:
    # Encode transactions
    te = TransactionEncoder()
    te_ary = te.fit(transaction_lists).transform(transaction_lists)
    df_transactions_encoded = pd.DataFrame(te_ary, columns=te.columns_)
    
    # Apply Apriori algorithm
    print("\nMining frequent itemsets...")
    frequent_itemsets_products = apriori(df_transactions_encoded, 
                                         min_support=0.005,  # Adjust based on your data
                                         use_colnames=True,
                                         max_len=3)
    
    print(f"Frequent itemsets found: {len(frequent_itemsets_products)}")
    
    if len(frequent_itemsets_products) > 0:
        # Generate association rules
        print("Generating association rules...")
        product_rules = association_rules(frequent_itemsets_products, 
                                         metric="confidence", 
                                         min_threshold=0.3)
        
        # Add lift and conviction metrics
        product_rules['lift'] = product_rules['lift']
        product_rules = product_rules.sort_values('lift', ascending=False)
        
        print(f"\nProduct Association Rules Generated: {len(product_rules)}")
        print("\nTop 10 Product Rules by Lift:")
        print(product_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10))
        
        # Save product rules
        product_rules.to_csv('product_association_rules.csv', index=False)
        print("\n✓ Saved to 'product_association_rules.csv'")
    else:
        print("No rules generated - support threshold may be too high")
        product_rules = pd.DataFrame()
else:
    print("Not enough multi-item transactions for product association rules")
    product_rules = pd.DataFrame()

# ----------------------------------------------------------------------------
# 4.2: Category-Level Association Rules (if item properties available)
# ----------------------------------------------------------------------------
print("\n" + "-"*80)
print("4.2: CATEGORY-LEVEL ASSOCIATION RULES")
print("-"*80)

try:
    item_properties = pd.read_csv('item_properties.csv')
    
    # Get category information for items
    categories = item_properties[item_properties['property'] == 'categoryid'].copy()
    categories = categories[['itemid', 'value']].rename(columns={'value': 'categoryid'})
    categories = categories.drop_duplicates('itemid')
    
    # Merge with transactions
    transactions_with_cat = transactions.merge(categories, on='itemid', how='left')
    
    # Group categories by transaction
    category_transactions = transactions_with_cat.groupby('transactionid')['categoryid'].apply(
        lambda x: list(x.dropna().unique())
    ).values
    
    category_transactions = [t for t in category_transactions if len(t) >= 2]
    print(f"\nTransactions with 2+ categories: {len(category_transactions)}")
    
    if len(category_transactions) > 0:
        # Encode category transactions
        te_cat = TransactionEncoder()
        te_cat_ary = te_cat.fit(category_transactions).transform(category_transactions)
        df_cat_encoded = pd.DataFrame(te_cat_ary, columns=te_cat.columns_)
        
        # Apply Apriori
        frequent_itemsets_categories = apriori(df_cat_encoded, 
                                               min_support=0.01, 
                                               use_colnames=True,
                                               max_len=3)
        
        print(f"Frequent category itemsets: {len(frequent_itemsets_categories)}")
        
        if len(frequent_itemsets_categories) > 0:
            category_rules = association_rules(frequent_itemsets_categories, 
                                              metric="confidence", 
                                              min_threshold=0.3)
            category_rules = category_rules.sort_values('lift', ascending=False)
            
            print(f"\nCategory Association Rules: {len(category_rules)}")
            print("\nTop 10 Category Rules by Lift:")
            print(category_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10))
            
            category_rules.to_csv('category_association_rules.csv', index=False)
            print("\n✓ Saved to 'category_association_rules.csv'")
        else:
            category_rules = pd.DataFrame()
    else:
        category_rules = pd.DataFrame()
        
except FileNotFoundError:
    print("item_properties.csv not found - skipping category-level rules")
    category_rules = pd.DataFrame()

# ----------------------------------------------------------------------------
# 4.3: Behavioral Pattern Association Rules (Cluster-based)
# ----------------------------------------------------------------------------
print("\n" + "-"*80)
print("4.3: BEHAVIORAL PATTERN ASSOCIATION RULES (CLUSTER-BASED)")
print("-"*80)

# Create behavioral features
flow_features_clustered['is_converter'] = (flow_features_clustered['transaction_count'] > 0).astype(int)
flow_features_clustered['high_engagement'] = (
    flow_features_clustered['event_count'] > flow_features_clustered['event_count'].median()
).astype(int)
flow_features_clustered['cart_user'] = (flow_features_clustered['addtocart_count'] > 0).astype(int)
flow_features_clustered['long_session'] = (
    flow_features_clustered['duration_seconds'] > flow_features_clustered['duration_seconds'].median()
).astype(int)
flow_features_clustered['multi_item_viewer'] = (
    flow_features_clustered['unique_items'] > flow_features_clustered['unique_items'].median()
).astype(int)

# Create cluster dummy variables
cluster_dummies = pd.get_dummies(flow_features_clustered['cluster'], prefix='cluster')

# Combine all behavioral features
behavior_features = pd.concat([
    cluster_dummies,
    flow_features_clustered[['high_engagement', 'cart_user', 'is_converter', 
                             'long_session', 'multi_item_viewer']]
], axis=1)

print(f"\nBehavioral feature matrix shape: {behavior_features.shape}")
print(f"Features: {list(behavior_features.columns)}")

# Mine behavioral patterns
print("\nMining behavioral patterns...")
behavior_itemsets = apriori(behavior_features, 
                           min_support=0.05,  # At least 5% of sessions
                           use_colnames=True,
                           max_len=4)

print(f"Frequent behavioral patterns: {len(behavior_itemsets)}")

if len(behavior_itemsets) > 0:
    # Generate behavioral rules
    behavior_rules = association_rules(behavior_itemsets, 
                                      metric="lift", 
                                      min_threshold=1.2)
    
    # Filter for interesting rules (rules that predict conversion or cart usage)
    interesting_consequents = ['is_converter', 'cart_user']
    interesting_rules = behavior_rules[
        behavior_rules['consequents'].apply(
            lambda x: any(item in interesting_consequents for item in x)
        )
    ].sort_values('lift', ascending=False)
    
    print(f"\nBehavioral Association Rules: {len(behavior_rules)}")
    print(f"Rules predicting conversion/cart: {len(interesting_rules)}")
    
    print("\nTop 15 Behavioral Rules (Predicting Conversion/Cart):")
    print(interesting_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(15))
    
    # Save behavioral rules
    behavior_rules.to_csv('behavioral_association_rules.csv', index=False)
    interesting_rules.to_csv('conversion_prediction_rules.csv', index=False)
    print("\n✓ Saved to 'behavioral_association_rules.csv'")
    print("✓ Saved to 'conversion_prediction_rules.csv'")
else:
    behavior_rules = pd.DataFrame()
    interesting_rules = pd.DataFrame()

# ============================================================================
# PHASE 5: REFINEMENT - RULE-BASED FEATURE ENGINEERING & RE-CLUSTERING
# ============================================================================

print("\n" + "="*80)
print("PHASE 5: REFINEMENT - RULE-BASED FEATURE ENGINEERING")
print("="*80)

# ----------------------------------------------------------------------------
# 5.1: Create Rule-Based Features
# ----------------------------------------------------------------------------
print("\n" + "-"*80)
print("5.1: CREATING RULE-BASED FEATURES")
print("-"*80)

def apply_rule_as_feature(df, rule, rule_id):
    """
    Apply an association rule as a binary feature
    Returns True if all antecedents are present
    """
    antecedents = list(rule['antecedents'])
    
    # Check if all antecedent columns exist
    missing_cols = [col for col in antecedents if col not in df.columns]
    if missing_cols:
        return pd.Series([0] * len(df), index=df.index)
    
    # Check if all antecedents are True (1)
    feature = df[antecedents].all(axis=1).astype(int)
    return feature

# Apply top behavioral rules as features
if len(interesting_rules) > 0:
    top_n_rules = min(15, len(interesting_rules))  # Use top 15 rules or fewer
    
    print(f"\nApplying top {top_n_rules} behavioral rules as features...")
    
    for idx, (rule_idx, rule) in enumerate(interesting_rules.head(top_n_rules).iterrows()):
        feature_name = f'rule_pattern_{idx+1}'
        flow_features_clustered[feature_name] = apply_rule_as_feature(
            behavior_features, rule, idx
        )
        
        # Print rule interpretation
        antecedents_str = ', '.join([str(x) for x in rule['antecedents']])
        consequents_str = ', '.join([str(x) for x in rule['consequents']])
        print(f"  {feature_name}: IF {antecedents_str} THEN {consequents_str} "
              f"(conf={rule['confidence']:.2f}, lift={rule['lift']:.2f})")
    
    rule_feature_cols = [col for col in flow_features_clustered.columns if 'rule_pattern_' in col]
    print(f"\n✓ Created {len(rule_feature_cols)} rule-based features")
else:
    rule_feature_cols = []
    print("\nNo behavioral rules available for feature creation")

# ----------------------------------------------------------------------------
# 5.2: Re-clustering with Enriched Features
# ----------------------------------------------------------------------------
print("\n" + "-"*80)
print("5.2: RE-CLUSTERING WITH ENRICHED FEATURES")
print("-"*80)

# Define original clustering features
original_feature_cols = ['duration_seconds', 'event_count', 'unique_items', 
                         'view_count', 'addtocart_count', 'transaction_count',
                         'view_to_cart_ratio', 'cart_to_purchase_ratio', 
                         'events_per_minute']

# Combine original + rule-based features
enriched_feature_cols = original_feature_cols + rule_feature_cols

print(f"\nOriginal features: {len(original_feature_cols)}")
print(f"Rule-based features: {len(rule_feature_cols)}")
print(f"Total enriched features: {len(enriched_feature_cols)}")

# Prepare data for re-clustering
X_enriched = flow_features_clustered[enriched_feature_cols].fillna(0)

# Standardize
scaler = StandardScaler()
X_enriched_scaled = scaler.fit_transform(X_enriched)

# Determine optimal clusters using BIC (same as Phase 3 but with enriched features)
print("\nFinding optimal number of clusters with BIC...")
bic_scores = []
n_components_range = range(2, 11)

for n_components in n_components_range:
    gmm = GaussianMixture(n_components=n_components, 
                         covariance_type='full', 
                         random_state=42, 
                         n_init=10,
                         max_iter=200)
    gmm.fit(X_enriched_scaled)
    bic_scores.append(gmm.bic(X_enriched_scaled))
    print(f"  {n_components} clusters: BIC = {bic_scores[-1]:.2f}")

optimal_clusters_refined = n_components_range[np.argmin(bic_scores)]
print(f"\n✓ Optimal number of clusters (refined): {optimal_clusters_refined}")

# Fit final refined model
print("\nFitting refined clustering model...")
refined_gmm = GaussianMixture(n_components=optimal_clusters_refined, 
                              covariance_type='full', 
                              random_state=42,
                              n_init=10,
                              max_iter=200)
flow_features_clustered['refined_cluster'] = refined_gmm.fit_predict(X_enriched_scaled)

print(f"\nRefined cluster distribution:\n{flow_features_clustered['refined_cluster'].value_counts().sort_index()}")

# ----------------------------------------------------------------------------
# 5.3: Compare Original vs Refined Clusters
# ----------------------------------------------------------------------------
print("\n" + "-"*80)
print("5.3: COMPARING ORIGINAL VS REFINED CLUSTERS")
print("-"*80)

# Cross-tabulation
cluster_comparison = pd.crosstab(
    flow_features_clustered['cluster'], 
    flow_features_clustered['refined_cluster'],
    margins=True
)
print("\nCluster Mapping (Original → Refined):")
print(cluster_comparison)

# Analyze conversion rates by cluster
original_conversion = flow_features_clustered.groupby('cluster').agg({
    'is_converter': 'mean',
    'cart_user': 'mean',
    'transaction_count': 'sum'
}).round(3)

refined_conversion = flow_features_clustered.groupby('refined_cluster').agg({
    'is_converter': 'mean',
    'cart_user': 'mean',
    'transaction_count': 'sum'
}).round(3)

print("\nOriginal Cluster Performance:")
print(original_conversion)

print("\nRefined Cluster Performance:")
print(refined_conversion)

# ----------------------------------------------------------------------------
# 5.4: Save Final Results
# ----------------------------------------------------------------------------
print("\n" + "-"*80)
print("5.4: SAVING FINAL RESULTS")
print("-"*80)

# Save enriched flow features with both cluster assignments
flow_features_clustered.to_csv('flow_features_refined.csv', index=False)
print("✓ Saved 'flow_features_refined.csv'")

# Create cluster profiles
cluster_profiles = flow_features_clustered.groupby('refined_cluster')[
    original_feature_cols + ['is_converter', 'cart_user']
].mean().round(3)
cluster_profiles['count'] = flow_features_clustered.groupby('refined_cluster').size()
cluster_profiles.to_csv('refined_cluster_profiles.csv')
print("✓ Saved 'refined_cluster_profiles.csv'")

# Summary statistics
summary = {
    'total_sessions': len(flow_features_clustered),
    'original_clusters': flow_features_clustered['cluster'].nunique(),
    'refined_clusters': flow_features_clustered['refined_cluster'].nunique(),
    'product_rules': len(product_rules) if len(product_rules) > 0 else 0,
    'behavioral_rules': len(behavior_rules) if len(behavior_rules) > 0 else 0,
    'rule_features_created': len(rule_feature_cols),
    'overall_conversion_rate': flow_features_clustered['is_converter'].mean(),
}

print("\n" + "="*80)
print("SUMMARY")
print("="*80)
for key, value in summary.items():
    print(f"{key}: {value}")

print("\n✓ Phase 4 & 5 Complete!")
print("\nGenerated Files:")
print("  - product_association_rules.csv")
print("  - behavioral_association_rules.csv")
print("  - conversion_prediction_rules.csv")
print("  - flow_features_refined.csv")
print("  - refined_cluster_profiles.csv")

PHASE 4: ASSOCIATION RULE MINING

Loaded clustered flow data: (1761675, 16)
Clusters found: 1

Cluster distribution:
cluster
1    1761675
Name: count, dtype: int64

Loaded events data: (2756101, 5)

--------------------------------------------------------------------------------
4.1: PRODUCT-LEVEL ASSOCIATION RULES (MARKET BASKET ANALYSIS)
--------------------------------------------------------------------------------

Total transactions: 17672
Transactions with 2+ items: 2710

Mining frequent itemsets...
Frequent itemsets found: 10
Generating association rules...

Product Association Rules Generated: 2

Top 10 Product Rules by Lift:
  antecedents consequents   support  confidence       lift
0    (213834)    (445351)  0.014391    0.423913  29.456522
1    (445351)    (213834)  0.014391    1.000000  29.456522

✓ Saved to 'product_association_rules.csv'

--------------------------------------------------------------------------------
4.2: CATEGORY-LEVEL ASSOCIATION RULES
----------------



Frequent behavioral patterns: 15

Behavioral Association Rules: 36
Rules predicting conversion/cart: 0

Top 15 Behavioral Rules (Predicting Conversion/Cart):
Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []

✓ Saved to 'behavioral_association_rules.csv'
✓ Saved to 'conversion_prediction_rules.csv'

PHASE 5: REFINEMENT - RULE-BASED FEATURE ENGINEERING

--------------------------------------------------------------------------------
5.1: CREATING RULE-BASED FEATURES
--------------------------------------------------------------------------------

No behavioral rules available for feature creation

--------------------------------------------------------------------------------
5.2: RE-CLUSTERING WITH ENRICHED FEATURES
--------------------------------------------------------------------------------

Original features: 9
Rule-based features: 0
Total enriched features: 9

Finding optimal number of clusters with BIC...


KeyboardInterrupt: 