# State Transition Analysis - Exploration

Interactive notebook for exploring state transition data.

**State Definitions**:

| State | Trigger Logic |
|-------|---------------|
| 1. Exploring | First-time Facebook visitors |
| 2. Problem-Aware | Returning visitor OR non-Facebook traffic |
| 3. Purchase-Ready | Add to cart or begin checkout |
| 4. Purchased | Completed purchase |

In [None]:
# Setup - put the parent of the current working directory on the import path
# so that the `src` package used by the cells below can be imported.
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parents[0]))

import pandas as pd
import numpy as np

# Lift the pandas display limits on column count and output width.
for _display_option in ('display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)

In [None]:
# Import analysis modules
from src.config import Config, STATE_NAMES
from src.data_loader import load_sessions_data, validate_data
from src.state_assignment import assign_states, get_state_summary
from src.metrics import (
    calculate_state_distribution,
    calculate_transition_matrix,
    calculate_cohort_metrics,
    calculate_channel_metrics,
    calculate_time_to_state,
    build_sankey_data,
)

## 1. Configuration

Adjust these parameters to explore different cohorts and time periods.

In [None]:
from datetime import date

# Analysis parameters - edit these values to explore other cohorts or periods.
config = Config(
    # --- State assignment ---
    # Traffic sources that indicate "Exploring" state
    exploring_traffic_sources=['Facebook'],

    # --- Cohort window (None = use all data) ---
    cohort_start=date(2026, 1, 1),
    cohort_end=date(2026, 1, 31),    # Change to analyze different periods

    # --- Cohort bucketing ---
    # Granularity: 'D' (daily), 'W' (weekly), 'M' (monthly)
    cohort_granularity='W',

    # Minimum users per cohort for reporting
    min_cohort_size=50,
)

# Echo the active settings so the cell output documents the run.
print(f"Cohort: {config.cohort_start} to {config.cohort_end}")
print(f"Granularity: {config.cohort_granularity}")

## 2. Load and Validate Data

In [None]:
# Load the session data from ../data/ using the settings in `config`
df = load_sessions_data('../data/', config)

# Validate the loaded frame and report the headline numbers
validation = validate_data(df)
print(f"Valid: {validation['valid']}")

summary = validation['summary']
print(f"Sessions: {summary['total_sessions']:,}")
print(f"Users: {summary['unique_users']:,}")
print(f"Cohort periods: {summary['cohort_periods']}")

In [None]:
# Assign states to the loaded data using the parameters in `config`.
# NOTE: `df` is rebound to the result, so re-running this cell passes the
# already-processed frame back into assign_states().
df = assign_states(df, config)

# View state distribution. As the last expression in the cell, the summary is
# rendered as the cell output.
get_state_summary(df)

## 3. State Distribution by Cohort

In [None]:
# State distribution over time, grouped by the COHORT_PERIOD column.
# The pivot cell that follows reads the COHORT_PERIOD, STATE_NAME and pct
# columns from this result.
state_dist = calculate_state_distribution(df, group_by='COHORT_PERIOD')
state_dist

In [None]:
# Pivot for easier visualization: one row per cohort period, one column per
# state, cell values taken from the 'pct' column.
state_pivot = state_dist.pivot(index='COHORT_PERIOD', columns='STATE_NAME', values='pct')

# Put the states in funnel order. reindex() is used instead of [...] column
# selection so the cell still runs when a state has no rows in the selected
# cohort window (e.g. nobody has purchased yet): the missing state shows up
# as an all-NaN column instead of raising KeyError.
state_pivot = state_pivot.reindex(
    columns=['Exploring', 'Problem-Aware', 'Purchase-Ready', 'Purchased']
)
state_pivot

## 4. Transition Matrix

In [None]:
# Transition rates between states (normalize=True requests rates rather than
# raw counts; the heading printed below describes them as % of previous state).
transition_matrix = calculate_transition_matrix(df, normalize=True)
print("Transition Rates (% of previous state):")
transition_matrix

In [None]:
# Raw counts: same transition matrix call as for the rates, but with
# normalize=False so the unnormalized numbers are returned.
transition_counts = calculate_transition_matrix(df, normalize=False)
print("Transition Counts:")
transition_counts

## 5. Cohort Metrics Over Time

In [None]:
# Metrics by cohort period.
# The plotting cell that follows reads these columns from the result:
# COHORT_PERIOD, purchased_rate, purchase_ready_rate, return_rate, total_users.
cohort_metrics = calculate_cohort_metrics(df, config)
cohort_metrics

In [None]:
# Plot cohort trends in a 2x2 grid (prints a notice instead if matplotlib
# cannot be imported)
try:
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # The three rate panels share the same layout and differ only in the
    # column plotted, the title and the line colour (first panel uses the
    # default colour): (axis, metric column, title, extra plot kwargs)
    rate_panels = [
        (axes[0, 0], 'purchased_rate', 'Purchase Rate by Cohort', {}),
        (axes[0, 1], 'purchase_ready_rate', 'Purchase-Ready Rate by Cohort', {'color': 'orange'}),
        (axes[1, 0], 'return_rate', 'Return Rate by Cohort', {'color': 'green'}),
    ]
    for ax, column, title, line_style in rate_panels:
        ax.plot(cohort_metrics['COHORT_PERIOD'], cohort_metrics[column], marker='o', **line_style)
        ax.set_title(title)
        ax.set_ylabel('%')
        ax.tick_params(axis='x', rotation=45)

    # Bottom-right panel: cohort size as a bar chart
    size_ax = axes[1, 1]
    size_ax.bar(cohort_metrics['COHORT_PERIOD'], cohort_metrics['total_users'], color='gray')
    size_ax.set_title('Cohort Size')
    size_ax.set_ylabel('Users')
    size_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()
except ImportError:
    print("matplotlib not available for plotting")

## 6. Channel Analysis

In [None]:
# Metrics by acquisition channel.
# The ranking cell that follows reads these columns from the result:
# FIRST_TOUCH_CHANNEL, total_users, purchase_ready_rate, purchase_rate, return_rate.
channel_metrics = calculate_channel_metrics(df, config)
channel_metrics

In [None]:
# Rank channels by purchase-ready rate, keeping only channels that reach the
# configured minimum number of users
has_enough_users = channel_metrics['total_users'] >= config.min_cohort_size
top_channels = channel_metrics[has_enough_users].nlargest(10, 'purchase_ready_rate')

print(f"Top 10 channels by purchase-ready rate (min {config.min_cohort_size} users):")
# NOTE(review): this table uses 'purchase_rate' while the cohort plots use
# 'purchased_rate' - confirm both column names against src.metrics.
report_columns = [
    'FIRST_TOUCH_CHANNEL',
    'total_users',
    'purchase_ready_rate',
    'purchase_rate',
    'return_rate',
]
top_channels[report_columns]

## 7. Time to Purchase Analysis

In [None]:
# Sessions and days needed to reach state 4 (Purchased)
time_to_purchase = calculate_time_to_state(df, target_state=4)

print(f"Users who purchased: {len(time_to_purchase):,}")

# Sessions until purchase
sessions_needed = time_to_purchase['sessions_to_state']
print(f"\nAvg sessions to purchase: {sessions_needed.mean():.2f}")
print(f"Median sessions to purchase: {sessions_needed.median():.1f}")

# Days until purchase
days_needed = time_to_purchase['days_to_state']
print(f"\nAvg days to purchase: {days_needed.mean():.1f}")
print(f"Median days to purchase: {days_needed.median():.1f}")

In [None]:
# Distribution of sessions to purchase: number of rows per sessions_to_state
# value, ordered by that value ascending; only the first 10 values are shown.
time_to_purchase['sessions_to_state'].value_counts().sort_index().head(10)

## 8. Sankey Flow Data

In [None]:
# Build Sankey data for visualization (max_sessions=3 is passed to the builder).
# The result is indexed by 'nodes' and 'links'; the preview cell that follows
# turns each of them into a DataFrame.
sankey_data = build_sankey_data(df, max_sessions=3)

# Quick size check of the generated graph
print(f"Sankey nodes: {len(sankey_data['nodes'])}")
print(f"Sankey links: {len(sankey_data['links'])}")

In [None]:
# Preview the largest flows with readable node names
import pandas as pd

links_df = pd.DataFrame(sankey_data['links'])
nodes_df = pd.DataFrame(sankey_data['nodes'])

# Build the node id -> node name lookup once and reuse it for both link ends
node_name_by_id = nodes_df.set_index('id')['name']
links_df['source_name'] = links_df['source'].map(node_name_by_id)
links_df['target_name'] = links_df['target'].map(node_name_by_id)

print("Top 15 flows:")
flow_columns = ['source_name', 'target_name', 'value']
links_df.nlargest(15, 'value')[flow_columns]

## 9. Custom Analysis

Use this section for ad-hoc exploration.

In [None]:
# Example: Compare Facebook vs non-Facebook first-session users
session1 = df[df['SESSION_NUMBER'] == 1].copy()
# na=False: rows with a missing channel grouping are counted as non-Facebook
session1['is_facebook'] = session1['SESSION_FIRST_TRAFFIC_SOURCE_CHANNEL_GROUPING'].str.startswith('Facebook', na=False)

comparison = session1.groupby('is_facebook').agg({
    'USER_ID': 'nunique',
    'STATE': lambda x: (x == 4).sum() / len(x) * 100  # purchase rate (state 4 = Purchased)
}).round(2)

comparison.columns = ['users', 'session1_purchase_rate']
# Relabel the groups by key rather than by position. Assigning a two-item list
# to comparison.index raises ValueError when only one group is present (all
# Facebook or no Facebook traffic) and silently depends on the False/True sort
# order. rename() maps each boolean to its own label; rename_axis(None) drops
# the 'is_facebook' index name so the output looks the same as before.
comparison = comparison.rename(index={False: 'Non-Facebook', True: 'Facebook'}).rename_axis(None)
comparison

In [None]:
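# Example: Explore specific cohort period
# Uncomment and modify as needed. '2026-01' is only a placeholder: use a label
# that actually occurs in df['COHORT_PERIOD'] (see df['COHORT_PERIOD'].unique()),
# since the label format depends on the configured cohort granularity.

# specific_cohort = df[df['COHORT_PERIOD'] == '2026-01']
# get_state_summary(specific_cohort)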