# DoorLoop CX Data - Exploratory Analysis

This notebook performs exploratory data analysis on DoorLoop's customer and event data to uncover insights about customer health, retention, and churn risk.

- **Author**: CX Analyst
- **Date**: 2025
- **Purpose**: Interview Assignment - Senior CX Analyst Role

In [None]:
# Import libraries
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta

# Custom modules
from utils.db_connector import DataConnector
from utils.data_processor import CXDataProcessor

# Notebook-wide presentation defaults: pandas output, seaborn theme, figure size.
for option_name, option_value in {
    'display.max_columns': None,  # None lifts the column cap for wide frames
    'display.width': 1000,
}.items():
    pd.set_option(option_name, option_value)

sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries loaded successfully!")

## 1. Load Data

In [None]:
# Load both source tables through the project connector (Snowflake or local CSV).
# DataConnector is used as a context manager, so its exit hook runs once both
# loads have finished (or if either of them raises).
with DataConnector() as db:
    users_df = db.load_users()
    events_df = db.load_events()

loaded_tables = (("Users", users_df), ("Events", events_df))

# Row counts first, so a truncated or empty load is obvious at a glance.
for table_label, table in loaded_tables:
    print(f"{table_label}: {len(table):,} records")

# Then a short preview of each table.
for table_label, table in loaded_tables:
    print(f"\n{table_label} Sample:")
    display(table.head())

## 2. Data Quality Assessment

In [None]:
# Null counts per column for each table.
print("=== Missing Values in Users ===")
print(users_df.isna().sum())
print("\n=== Missing Values in Events ===", events_df.isna().sum(), sep="\n")

# Column dtypes of the users table.
print("\n=== Users Data Types ===", users_df.dtypes, sep="\n")

# Summary statistics as produced by DataFrame.describe().
print("\n=== Users Statistics ===")
users_summary = users_df.describe()
display(users_summary)

## 3. User Base Overview

In [None]:
# Plan distribution: share of users on each plan_type.
fig = px.pie(users_df, names='plan_type', title='User Distribution by Plan Type')
fig.show()

# Active vs Inactive.
# Count by comparison rather than by looking up the labels 1/0 in value_counts():
# when `is_active` is stored as a boolean, an integer label lookup on the
# resulting boolean index does not reliably find True/False (depending on the
# pandas version it falls back to positional access or to the default of 0),
# which silently misreports the counts. `== 1` / `== 0` gives the same answer
# for both 0/1 integers and booleans; missing values count toward neither.
is_active = users_df['is_active']
active_count = int((is_active == 1).sum())
inactive_count = int((is_active == 0).sum())
total_users = len(users_df)

print(f"\nActive Users: {active_count:,}")
print(f"Inactive Users: {inactive_count:,}")
if total_users:
    # Same definition as before: inactive accounts as a share of all user rows.
    print(f"Churn Rate: {(inactive_count / total_users) * 100:.2f}%")
else:
    # An empty load would otherwise end the cell with a ZeroDivisionError.
    print("Churn Rate: n/a (no users loaded)")

## 4. Revenue Analysis

In [None]:
# Total, average and number of non-null `annual_revenue` values for each plan.
revenue_by_plan_groups = users_df.groupby('plan_type')['annual_revenue']
arr_by_plan = revenue_by_plan_groups.agg(['sum', 'mean', 'count'])
print("=== ARR by Plan Type ===")
display(arr_by_plan)

# Box plot of the revenue spread within each plan.
box_options = {
    'x': 'plan_type',
    'y': 'annual_revenue',
    'title': 'ARR Distribution by Plan Type',
    'color': 'plan_type',
}
fig = px.box(users_df, **box_options)
fig.show()

# Correlation matrix of the two columns; element [0, 1] is the
# portfolio_size-vs-annual_revenue coefficient.
size_and_revenue = users_df[['portfolio_size', 'annual_revenue']]
correlation = size_and_revenue.corr()
size_revenue_corr = correlation.iloc[0, 1]
print(f"\nCorrelation between Portfolio Size and ARR: {size_revenue_corr:.3f}")

## 5. Event Analysis

In [None]:
# Event type distribution: how often each `event_type` value occurs.
event_counts = events_df['event_type'].value_counts()
print("=== Event Type Distribution ===")
display(event_counts)

# Visualize the counts as a bar chart.
fig = px.bar(x=event_counts.index, y=event_counts.values,
             title='Event Type Distribution',
             labels={'x': 'Event Type', 'y': 'Count'})
# The figure-level method is `update_xaxes` (plural). `update_xaxis` is not a
# plotly Figure method, so the previous call raised AttributeError and stopped
# the cell before any chart was shown.
fig.update_xaxes(tickangle=-45)
fig.show()

# Events over time: reduce each timestamp to its calendar date, then count
# rows per day. Note that this adds an `event_date` column to events_df.
events_df['event_date'] = pd.to_datetime(events_df['event_ts']).dt.date
daily_events = events_df.groupby('event_date').size()

fig = px.line(x=daily_events.index, y=daily_events.values,
              title='Daily Event Volume',
              labels={'x': 'Date', 'y': 'Number of Events'})
fig.show()

## 6. NPS Analysis

In [None]:
# Histogram of raw NPS scores.
fig = px.histogram(
    users_df,
    x='nps_score',
    nbins=30,
    title='NPS Score Distribution',
)
fig.show()

# Mean score per plan, highest first.
mean_nps_per_plan = users_df.groupby('plan_type')['nps_score'].mean()
nps_by_plan = mean_nps_per_plan.sort_values(ascending=False)
print("\n=== Average NPS by Plan Type ===")
display(nps_by_plan)

# Bucket scores into three segments. Intervals are right-inclusive:
# (-101, 0] -> Detractor, (0, 50] -> Passive, (50, 100] -> Promoter.
# NOTE(review): these cut-offs treat nps_score as a value between -100 and 100;
# conventional NPS buckets individual 0-10 responses (0-6 / 7-8 / 9-10).
# Confirm which scale the source column uses.
nps_bin_edges = [-101, 0, 50, 100]
nps_bin_labels = ['Detractor', 'Passive', 'Promoter']
users_df['nps_category'] = pd.cut(
    users_df['nps_score'],
    bins=nps_bin_edges,
    labels=nps_bin_labels,
)
nps_segments = users_df['nps_category'].value_counts()
print("\n=== NPS Segments ===")
display(nps_segments)

## 7. Support Ticket Analysis

In [None]:
# Histogram of ticket counts over the trailing 90 days.
fig = px.histogram(
    users_df,
    x='support_tickets_last_90d',
    title='Support Tickets Distribution (Last 90 Days)',
)
fig.show()

# Ticket volume against NPS; colour encodes plan, marker size encodes revenue.
scatter_options = {
    'x': 'support_tickets_last_90d',
    'y': 'nps_score',
    'color': 'plan_type',
    'size': 'annual_revenue',
    'title': 'Support Tickets vs NPS Score',
}
fig = px.scatter(users_df, **scatter_options)
fig.show()

# Accounts strictly above the ticket threshold are treated as high-support.
ticket_threshold = 10
is_high_support = users_df['support_tickets_last_90d'].gt(ticket_threshold)
high_support = users_df.loc[is_high_support]
high_support_mean_nps = high_support['nps_score'].mean()
print(f"\nAccounts with >{ticket_threshold} support tickets: {len(high_support)}")
print(f"Average NPS of high-support accounts: {high_support_mean_nps:.1f}")

## 8. Churn Analysis

In [None]:
# Churn by plan type.
# Named aggregation gives the result columns stable, explicit names. The
# previous list-style spec ([lambda, 'count']) made pandas rename the lambda
# column to '<lambda_0>', so indexing it as ('is_active', '<lambda>') raised
# KeyError.
churn_by_plan = users_df.groupby('plan_type').agg(
    churned=('is_active', lambda flags: (flags == 0).sum()),  # inactive accounts
    total=('is_active', 'count'),                             # non-null is_active rows
)
churn_by_plan['churn_rate'] = (churn_by_plan['churned'] / churn_by_plan['total']) * 100
print("=== Churn Rate by Plan Type ===")
display(churn_by_plan)

# Churned vs Active comparison: mean of each metric per is_active value.
comparison = users_df.groupby('is_active').agg({
    'annual_revenue': 'mean',
    'nps_score': 'mean',
    'support_tickets_last_90d': 'mean',
    'portfolio_size': 'mean'
})
# Relabel by value instead of by position: assigning ['Churned', 'Active']
# positionally fails when only one status is present and would mislabel rows
# if the group order were ever different. The 0/1 keys also match False/True.
comparison = comparison.rename(index={0: 'Churned', 1: 'Active'})
print("\n=== Churned vs Active Comparison ===")
display(comparison)

## 9. Build Comprehensive Metrics

In [None]:
# Hand both frames to the project's CXDataProcessor and ask it for the
# combined master table used by the insights section below.
processor = CXDataProcessor(users_df, events_df)
master_df = processor.build_master_table()

print("Master metrics table created!", f"Shape: {master_df.shape}", sep="\n")
display(master_df.head())

## 10. Key Insights & Recommendations

In [None]:
print("=== KEY CX INSIGHTS ===")

# 1. Number of accounts in each health tier.
print("\n1. HEALTH DISTRIBUTION")
health_dist = master_df['health_tier'].value_counts()
display(health_dist)

# 2. Accounts flagged at_renewal_risk == 1 and the revenue they carry.
print("\n2. AT-RISK ACCOUNTS")
is_at_risk = master_df['at_renewal_risk'].eq(1)
at_risk = master_df.loc[is_at_risk]
at_risk_arr = at_risk['annual_revenue'].sum()
print(f"Total at-risk accounts: {len(at_risk)}")
print(f"ARR at risk: ${at_risk_arr:,.0f}")

# 3. Churn drivers. These lines are fixed text, not values computed in this cell.
print("\n3. TOP CHURN DRIVERS")
churn_drivers = (
    "Low engagement (days since last activity)",
    "High support ticket volume",
    "Low NPS scores",
    "Lack of core feature adoption",
)
for driver in churn_drivers:
    print(f"- {driver}")

# 4. Green-tier accounts on the 'starter' or 'pro' plans.
# The ARR figure printed is the sum of their current annual_revenue.
print("\n4. EXPANSION OPPORTUNITIES")
lower_tier_plans = ['starter', 'pro']
is_green = master_df['health_tier'] == 'Green'
is_lower_tier = master_df['plan_type'].isin(lower_tier_plans)
expansion = master_df[is_green & is_lower_tier]
expansion_arr = expansion['annual_revenue'].sum()
print(f"High-health lower-tier accounts: {len(expansion)}")
print(f"Potential expansion ARR: ${expansion_arr:,.0f}")

## Summary

This exploratory analysis reveals:

1. **Health Status**: Distribution of customer health across Red/Yellow/Green tiers
2. **Churn Risk**: Identified at-risk accounts for proactive intervention
3. **Revenue Opportunity**: Expansion potential in healthy lower-tier accounts
4. **Key Drivers**: Support burden, NPS, and engagement are primary health indicators

Next steps: Build a churn prediction model and create actionable playbooks.