In [ ]:
# Run this cell to build the executive dashboard figure (layout plus panels 1-4).

print("=== BUSINESS DASHBOARD CREATION ===\n")

# A single large figure hosts every panel of the dashboard.
fig = plt.figure(figsize=(20, 15))
fig.suptitle(
    'Executive Business Dashboard - 2023-2024 Performance',
    fontsize=18,
    fontweight='bold',
    y=0.98,
)

# 4 rows x 3 columns; individual panels may span several grid cells.
gs = fig.add_gridspec(4, 3, hspace=0.3, wspace=0.3)

# --- Panel 1: KPI summary (row 0, columns 0-1) ---
ax1 = fig.add_subplot(gs[0, :2])
kpi_data = {
    'Total Revenue': business_data['Revenue'].sum(),
    'Avg Customer Satisfaction': business_data['Customer_Satisfaction'].mean(),
    'Total Units Sold': business_data['Units_Sold'].sum(),
    'Avg Profit Margin': business_data['Profit_Margin'].mean(),
}
kpi_names = list(kpi_data)
kpi_values = list(kpi_data.values())
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

bars = ax1.barh(kpi_names, kpi_values, color=colors, alpha=0.8)
ax1.set_title('Key Performance Indicators', fontsize=14, fontweight='bold')
ax1.set_xlabel('Values')

# One format per KPI, in dict order: currency, ratio, grouped integer, ratio.
kpi_formats = ['${:,.0f}', '{:.2f}', '{:,.0f}', '{:.2f}']
kpi_label_gap = max(kpi_values) * 0.01  # small gap between bar end and text
for bar, value, kpi_format in zip(bars, kpi_values, kpi_formats):
    ax1.text(
        bar.get_width() + kpi_label_gap,
        bar.get_y() + bar.get_height() / 2,
        kpi_format.format(value),
        va='center',
        fontweight='bold',
    )

# --- Panel 2: revenue by region (row 0, column 2) ---
ax2 = fig.add_subplot(gs[0, 2])
regional_performance = (
    business_data
    .groupby('Region')
    .agg({'Revenue': 'sum', 'Customer_Satisfaction': 'mean'})
    .sort_values('Revenue', ascending=True)  # ascending so the largest bar ends up on top
)

bars = ax2.barh(
    regional_performance.index,
    regional_performance['Revenue'],
    color='lightcoral',
    alpha=0.8,
)
ax2.set_title('Revenue by Region', fontsize=12, fontweight='bold')
ax2.set_xlabel('Total Revenue ($)')

for bar in bars:
    region_total = bar.get_width()
    ax2.text(
        region_total + region_total*0.01,
        bar.get_y() + bar.get_height() / 2,
        f'${region_total:,.0f}',
        va='center',
        fontsize=9,
    )

# --- Panel 3: per-product averages as grouped bars (row 1, column 0) ---
ax3 = fig.add_subplot(gs[1, 0])
product_metrics = business_data.groupby('Product').agg({
    'Revenue': 'mean',
    'Units_Sold': 'mean',
    'Profit_Margin': 'mean'
})

x = np.arange(len(product_metrics.index))
width = 0.25

# (x positions, bar heights, legend label) for each of the three bar series.
# Revenue is shown in thousands and margin in percent before sharing one axis.
grouped_series = [
    (x - width, product_metrics['Revenue']/1000, 'Avg Revenue (K$)'),
    (x, product_metrics['Units_Sold'], 'Avg Units Sold'),
    (x + width, product_metrics['Profit_Margin']*100, 'Profit Margin (%)'),
]
for positions, heights, series_label in grouped_series:
    ax3.bar(positions, heights, width, label=series_label, alpha=0.8)

ax3.set_title('Product Performance Metrics', fontsize=12, fontweight='bold')
ax3.set_xlabel('Product Category')
ax3.set_xticks(x)
ax3.set_xticklabels(product_metrics.index, rotation=45)
ax3.legend(fontsize=8)

# --- Panel 4: average revenue per month (row 1, column 1) ---
ax4 = fig.add_subplot(gs[1, 1])
monthly_trend = business_data.groupby('Month')['Revenue'].mean()
ax4.plot(
    monthly_trend.index,
    monthly_trend.values,
    marker='o',
    linewidth=3,
    markersize=8,
    color='green',
)
ax4.set_title('Monthly Revenue Trend', fontsize=12, fontweight='bold')
ax4.set_xlabel('Month')
ax4.set_ylabel('Avg Revenue ($)')
ax4.set_xticks(range(1, 13))  # presumably Month holds the numbers 1-12 - verify against the data
ax4.grid(True, alpha=0.3)

# 5. Customer Analysis (Second Row Right)
# Mean customer satisfaction inside five equal-width customer-age bins.
ax5 = fig.add_subplot(gs[1, 2])
age_bins = pd.cut(business_data['Customer_Age'], bins=5)
# `age_bins` is categorical. Passing `observed` explicitly keeps every bin
# (including empty ones) regardless of the pandas default and avoids the
# FutureWarning recent pandas versions emit when it is omitted.
age_satisfaction = business_data.groupby(age_bins, observed=False)['Customer_Satisfaction'].mean()

x_pos = range(len(age_satisfaction))
bars = ax5.bar(x_pos, age_satisfaction.values, color='skyblue', alpha=0.8)
ax5.set_title('Satisfaction by Age Group', fontsize=12, fontweight='bold')
ax5.set_xlabel('Age Group')
ax5.set_ylabel('Avg Satisfaction')
ax5.set_xticks(x_pos)
# Tick labels show each bin's edges rounded to whole years, e.g. "18-30".
ax5.set_xticklabels([f'{interval.left:.0f}-{interval.right:.0f}'
                     for interval in age_satisfaction.index], rotation=45)

# Add value labels just above each bar.
for bar in bars:
    height = bar.get_height()
    if np.isnan(height):
        # An empty age bin has no mean; there is nothing sensible to label.
        continue
    ax5.text(bar.get_x() + bar.get_width()/2., height + 0.01,
             f'{height:.2f}', ha='center', va='bottom', fontsize=9)

# --- Panel 6: correlation matrix of five numeric metrics (row 2, column 0) ---
ax6 = fig.add_subplot(gs[2, 0])
corr_columns = ['Revenue', 'Marketing_Spend', 'Customer_Satisfaction',
                'Profit_Margin', 'Units_Sold']
corr_data = business_data[corr_columns].corr()
# Fix the colour scale to [-1, 1] so zero correlation sits at the colormap midpoint.
im = ax6.imshow(corr_data, cmap='RdBu_r', aspect='auto', vmin=-1, vmax=1)

# Write each coefficient into its cell (imshow puts the column index on x, the row index on y).
n_corr = len(corr_data.columns)
for i in range(n_corr):
    for j in range(n_corr):
        ax6.text(j, i, f'{corr_data.iloc[i, j]:.2f}',
                 ha='center', va='center', fontsize=8)

ax6.set_title('Correlation Matrix', fontsize=12, fontweight='bold')
corr_ticks = range(n_corr)
ax6.set_xticks(corr_ticks)
ax6.set_yticks(corr_ticks)
ax6.set_xticklabels(corr_data.columns, rotation=45)
ax6.set_yticklabels(corr_data.columns)

# --- Panel 7: profit-margin histogram with a mean marker (row 2, column 1) ---
ax7 = fig.add_subplot(gs[2, 1])
margin_mean = business_data['Profit_Margin'].mean()
ax7.hist(
    business_data['Profit_Margin'],
    bins=20,
    alpha=0.7,
    color='orange',
    edgecolor='black',
    linewidth=0.5,
)
ax7.axvline(
    margin_mean,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Mean: {margin_mean:.2f}',
)
ax7.set_title('Profit Margin Distribution', fontsize=12, fontweight='bold')
ax7.set_xlabel('Profit Margin')
ax7.set_ylabel('Frequency')
ax7.legend()
ax7.grid(True, alpha=0.3)

# 8. Quarterly Comparison (Third Row Right)
# Revenue (left axis, in thousands) and units sold (right axis) side by side per quarter.
ax8 = fig.add_subplot(gs[2, 2])
quarterly_comparison = business_data.groupby('Quarter').agg({
    'Revenue': 'sum',
    'Units_Sold': 'sum'
})

# Place the bars by position rather than by index value, so the +/-0.2 offsets
# also work when quarter labels are strings such as 'Q1' instead of numbers.
quarter_positions = np.arange(len(quarterly_comparison))

ax8_twin = ax8.twinx()
bars1 = ax8.bar(quarter_positions - 0.2, quarterly_comparison['Revenue']/1000,
                0.4, label='Revenue (K$)', color='blue', alpha=0.7)
bars2 = ax8_twin.bar(quarter_positions + 0.2, quarterly_comparison['Units_Sold'],
                     0.4, label='Units Sold', color='red', alpha=0.7)

ax8.set_title('Quarterly Performance', fontsize=12, fontweight='bold')
ax8.set_xlabel('Quarter')
# Axis label colours match the bar colours so each scale can be read off its own axis.
ax8.set_ylabel('Revenue (K$)', color='blue')
ax8_twin.set_ylabel('Units Sold', color='red')
ax8.set_xticks(quarter_positions)
ax8.set_xticklabels(quarterly_comparison.index)

# --- Panel 9: revenue per marketing dollar by department (row 3, all columns) ---
ax9 = fig.add_subplot(gs[3, :])
dept_efficiency = business_data.groupby('Department').agg(
    {'Revenue': 'sum', 'Customer_Satisfaction': 'mean', 'Marketing_Spend': 'sum'}
)

# Efficiency = total revenue divided by total marketing spend of the department.
dept_efficiency['Efficiency'] = dept_efficiency['Revenue'] / dept_efficiency['Marketing_Spend']
dept_efficiency_sorted = dept_efficiency.sort_values('Efficiency', ascending=True)

bars = ax9.barh(
    dept_efficiency_sorted.index,
    dept_efficiency_sorted['Efficiency'],
    color='purple',
    alpha=0.8,
)
ax9.set_title(
    'Department Efficiency (Revenue per Marketing Dollar)',
    fontsize=14,
    fontweight='bold',
)
ax9.set_xlabel('Efficiency Ratio')

# Print the ratio just past the end of every bar.
for bar in bars:
    width = bar.get_width()
    label_x = width + width*0.01
    label_y = bar.get_y() + bar.get_height()/2
    ax9.text(label_x, label_y, f'{width:.2f}', va='center', fontweight='bold')

plt.tight_layout()
plt.show()

print("=== SPECIALIZED SUBPLOT ARRANGEMENTS ===\n")

# A 2x2 grid, each cell demonstrating a different layout technique.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Advanced Subplot Layouts for Business Analysis', fontsize=14, fontweight='bold')

# 1. Dual y-axes on one shared x-axis: average monthly revenue (left, blue)
#    against total monthly units sold (right, red).
revenue_axis = axes[0, 0]
revenue_axis.plot(monthly_trend.index, monthly_trend.values, 'b-', label='Revenue')
revenue_axis.set_ylabel('Revenue ($)', color='blue')
revenue_axis.tick_params(axis='y', labelcolor='blue')

ax_shared = revenue_axis.twinx()
monthly_units = business_data.groupby('Month')['Units_Sold'].sum()
ax_shared.plot(monthly_units.index, monthly_units.values, 'r-', label='Units')
ax_shared.set_ylabel('Units Sold', color='red')
ax_shared.tick_params(axis='y', labelcolor='red')
revenue_axis.set_title('Dual Y-Axis Comparison')
revenue_axis.set_xlabel('Month')

# 2. Scatter plot coloured by a third variable, with a colorbar (no inset is drawn).
scatter_axis = axes[0, 1]
satisfaction_points = scatter_axis.scatter(
    business_data['Marketing_Spend'],
    business_data['Revenue'],
    alpha=0.6,
    c=business_data['Customer_Satisfaction'],
    cmap='viridis',
)
scatter_axis.set_title('Revenue vs Marketing Spend')
scatter_axis.set_xlabel('Marketing Spend ($)')
scatter_axis.set_ylabel('Revenue ($)')

# The colorbar explains the point colours and is attached to the scatter's axes.
cbar = fig.colorbar(satisfaction_points, ax=scatter_axis)
cbar.set_label('Customer Satisfaction')

# 3. Grouped bars: one cluster per region, one bar per product.
category_axis = axes[1, 0]
regional_products = business_data.groupby(['Region', 'Product'])['Revenue'].mean().unstack()
regional_products.plot(kind='bar', ax=category_axis, width=0.8)
category_axis.set_title('Average Revenue by Region and Product')
category_axis.set_xlabel('Region')
category_axis.set_ylabel('Average Revenue ($)')
# Anchor the legend outside the axes so it does not cover the bars.
category_axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
category_axis.tick_params(axis='x', rotation=45)

# 4. Overlaid histograms of min-max scaled metrics so their shapes are comparable.
metrics = ['Revenue', 'Customer_Satisfaction', 'Profit_Margin', 'Units_Sold']
raw_metrics = business_data[metrics]
normalized_data = (raw_metrics - raw_metrics.min()) / (raw_metrics.max() - raw_metrics.min())

histogram_axis = axes[1, 1]
for metric in metrics:
    histogram_axis.hist(normalized_data[metric], alpha=0.6, label=metric, bins=15)

histogram_axis.set_title('Normalized Distribution Comparison')
histogram_axis.set_xlabel('Normalized Value (0-1)')
histogram_axis.set_ylabel('Frequency')
histogram_axis.legend()

plt.tight_layout()
plt.show()

print("Subplot Benefits for Business Analysis:")
for benefit in (
    "• Executive dashboards with multiple KPIs",
    "• Side-by-side comparisons of metrics",
    "• Time series analysis with multiple variables",
    "• Correlation and distribution analysis",
    "• Space-efficient report layouts",
    "• Professional presentation formats",
):
    print(benefit)

### Subplots for Dashboard Creation

**Subplots** allow you to create multiple plots in a single figure, which is essential for building business dashboards and comprehensive reports. This enables you to:

- **Compare multiple metrics** side by side
- **Show different views** of the same data
- **Create executive summaries** with key KPIs
- **Build multi-panel dashboards** that stakeholders can read at a glance
- **Optimize space** in reports and presentations

**Key Functions:**
- **`plt.subplots()`** - Create multiple subplot axes
- **`fig.add_subplot()`** - Add individual subplots
- **`plt.subplot()`** - Quick subplot creation
- **Grid layouts** - Organize plots in rows and columns
- **Shared axes** - Synchronize scales across plots

In [ ]:
# Exercise: draw a line plot of the 7-day moving average of Revenue over time
# from time_series_data, then store the mean of that moving average in 'avg_ma7'.


# Series.mean() skips the NaN values at the start of the rolling window.
ma7_series = time_series_data['Revenue_MA7']
avg_ma7 = ma7_series.mean()

# Self-check for the exercise: the result must be a real, positive number.
assert pd.notna(avg_ma7), "Moving average should not be NaN"
assert avg_ma7 > 0, "Moving average should be positive"
print(f"Correct! You created a time series plot. The average 7-day MA is ${avg_ma7:,.2f}")

In [ ]:
# Execute this cell to explore time series visualization

import matplotlib.dates as mdates
from datetime import datetime, timedelta

print("=== TIME SERIES DATA PREPARATION ===\n")

# Two full calendar years of daily timestamps.
start_date = datetime(2023, 1, 1)
end_date = datetime(2024, 12, 31)
date_range = pd.date_range(start_date, end_date, freq='D')

# Seed the generator so the synthetic series is reproducible.
np.random.seed(42)
n_days = len(date_range)
day_index = np.arange(n_days)

# Revenue = linear growth + yearly cycle + weekly cycle + Gaussian noise.
trend = np.linspace(10000, 15000, n_days)
seasonal = 2000 * np.sin(2 * np.pi * day_index / 365.25)
weekly = 500 * np.sin(2 * np.pi * day_index / 7)
noise = np.random.normal(0, 800, n_days)

# Keep the simulated revenue inside a plausible range.
daily_revenue = (trend + seasonal + weekly + noise).clip(5000, 25000)

# The remaining columns are drawn in this fixed order so the seeded stream
# yields the same values on every run.
marketing_spend = np.clip(np.random.normal(2000, 500, n_days), 500, 5000)
customer_acquisition = np.random.poisson(20, n_days)
website_visits = np.clip(np.random.normal(1000, 200, n_days), 500, 2000)

time_series_data = pd.DataFrame({
    'Date': date_range,
    'Revenue': daily_revenue,
    'Marketing_Spend': marketing_spend,
    'Customer_Acquisition': customer_acquisition,
    'Website_Visits': website_visits,
})

# Derived metrics: trailing moving averages (NaN until the window fills) and revenue per marketing dollar.
for window in (7, 30):
    time_series_data[f'Revenue_MA{window}'] = time_series_data['Revenue'].rolling(window=window).mean()
time_series_data['ROI'] = time_series_data['Revenue'] / time_series_data['Marketing_Spend']

print(f"Time series dataset created with {len(time_series_data)} daily records")
first_day, last_day = time_series_data['Date'].min(), time_series_data['Date'].max()
print("Date range:", first_day, "to", last_day)

# Create comprehensive time series visualizations: a 3x2 grid of panels built
# from `time_series_data` (daily rows with Date, Revenue, moving averages, ROI).
fig, axes = plt.subplots(3, 2, figsize=(20, 15))
fig.suptitle('Business Time Series Analysis with Professional Formatting',
             fontsize=16, fontweight='bold')


def _format_date_axis(ax, month_interval):
    """Put a 'YYYY-MM' tick every `month_interval` months on `ax` and tilt the labels 45 degrees."""
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=month_interval))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    # tick_params is applied to the axes passed in, not to pyplot's "current" axes.
    ax.tick_params(axis='x', rotation=45)


# 1. Basic time series with trend lines
plt.subplot(3, 2, 1)
plt.plot(time_series_data['Date'], time_series_data['Revenue'],
         alpha=0.3, color='blue', label='Daily Revenue')
plt.plot(time_series_data['Date'], time_series_data['Revenue_MA7'],
         color='orange', linewidth=2, label='7-Day MA')
plt.plot(time_series_data['Date'], time_series_data['Revenue_MA30'],
         color='red', linewidth=2, label='30-Day MA')
plt.title('Daily Revenue with Moving Averages')
plt.xlabel('Date')
plt.ylabel('Revenue ($)')
plt.legend()
plt.grid(True, alpha=0.3)
_format_date_axis(plt.gca(), 3)

# 2. Multiple metrics on same plot (two y-axes sharing one date axis)
plt.subplot(3, 2, 2)
ax1 = plt.gca()
ax2 = ax1.twinx()

# Revenue on left axis
line1 = ax1.plot(time_series_data['Date'], time_series_data['Revenue_MA7'],
                 color='blue', linewidth=2, label='Revenue (7-day MA)')
ax1.set_xlabel('Date')
ax1.set_ylabel('Revenue ($)', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# Marketing spend on right axis
line2 = ax2.plot(time_series_data['Date'], time_series_data['Marketing_Spend'],
                 color='red', linewidth=2, label='Marketing Spend')
ax2.set_ylabel('Marketing Spend ($)', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# twinx() makes the twin the current axes and hides the twin's x-axis, so
# pyplot-level calls here would style the invisible axis. Address ax1 directly
# so the title and the rotated date labels land on the axis that is drawn.
ax1.set_title('Revenue vs Marketing Spend Over Time')
_format_date_axis(ax1, 4)

# Create combined legend from both axes so every line appears in one box
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

# 3. Monthly aggregation view
plt.subplot(3, 2, 3)
monthly_data = time_series_data.groupby(time_series_data['Date'].dt.to_period('M')).agg({
    'Revenue': 'mean',
    'Marketing_Spend': 'mean',
    'Customer_Acquisition': 'sum'
}).reset_index()
# Convert the monthly periods back to timestamps so they plot on a date axis.
monthly_data['Date'] = monthly_data['Date'].dt.to_timestamp()

plt.plot(monthly_data['Date'], monthly_data['Revenue'],
         marker='o', linewidth=3, markersize=6, color='green')
plt.title('Monthly Average Revenue')
plt.xlabel('Date')
plt.ylabel('Average Revenue ($)')
plt.grid(True, alpha=0.3)
_format_date_axis(plt.gca(), 2)

# 4. Quarterly comparison
plt.subplot(3, 2, 4)
quarterly_data = time_series_data.copy()
quarterly_data['Quarter'] = quarterly_data['Date'].dt.to_period('Q')
quarterly_summary = quarterly_data.groupby('Quarter')['Revenue'].sum().reset_index()
# String form of the period is used as the categorical x label of each bar.
quarterly_summary['Quarter_Str'] = quarterly_summary['Quarter'].astype(str)

bars = plt.bar(quarterly_summary['Quarter_Str'], quarterly_summary['Revenue'],
               color='skyblue', alpha=0.8)
plt.title('Quarterly Revenue Totals')
plt.xlabel('Quarter')
plt.ylabel('Total Revenue ($)')
plt.xticks(rotation=45)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
             f'${height:,.0f}', ha='center', va='bottom')

# 5. ROI analysis over time
plt.subplot(3, 2, 5)
plt.plot(time_series_data['Date'], time_series_data['ROI'].rolling(30).mean(),
         color='purple', linewidth=2)
plt.title('Return on Investment (30-day MA)')
plt.xlabel('Date')
plt.ylabel('ROI Ratio')
plt.grid(True, alpha=0.3)
_format_date_axis(plt.gca(), 3)

# Add horizontal line for average ROI (mean of the unsmoothed daily ratio)
avg_roi = time_series_data['ROI'].mean()
plt.axhline(y=avg_roi, color='red', linestyle='--', alpha=0.7,
            label=f'Average ROI: {avg_roi:.2f}')
plt.legend()

# 6. Annotated events timeline
plt.subplot(3, 2, 6)
plt.plot(time_series_data['Date'], time_series_data['Revenue_MA7'],
         color='blue', linewidth=2)

# Add annotations for important business events
event_dates = [
    (datetime(2023, 3, 15), 'Product Launch'),
    (datetime(2023, 7, 4), 'Summer Campaign'),
    (datetime(2023, 11, 24), 'Black Friday'),
    (datetime(2024, 1, 1), 'New Year Sale'),
    (datetime(2024, 6, 1), 'Mid-Year Review')
]

# Index the moving average by date once, then look each event up directly.
ma7_by_date = time_series_data.set_index('Date')['Revenue_MA7']
for date, event in event_dates:
    revenue = ma7_by_date.get(pd.Timestamp(date))
    if revenue is None or pd.isna(revenue):
        # Event falls outside the data, or inside the moving-average warm-up:
        # there is no point on the line to attach the arrow to.
        continue
    plt.annotate(event, xy=(date, revenue), xytext=(10, 20),
                 textcoords='offset points',
                 bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7),
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

plt.title('Revenue Timeline with Business Events')
plt.xlabel('Date')
plt.ylabel('Revenue (7-day MA)')
plt.grid(True, alpha=0.3)
_format_date_axis(plt.gca(), 4)

plt.tight_layout()
plt.show()

print("\n=== TIME SERIES ANALYSIS INSIGHTS ===")

# Span between the first and last date, in days.
period_days = (end_date - start_date).days

# Growth compares the mean of the final 30 days with the mean of the first 30 days.
revenue_series = time_series_data['Revenue']
growth_pct = ((revenue_series.iloc[-30:].mean() / revenue_series.iloc[:30].mean()) - 1) * 100

# Month whose average daily revenue is highest.
best_month = monthly_data.loc[monthly_data['Revenue'].idxmax(), 'Date'].strftime('%Y-%m')

print(f"Total time period: {period_days} days")
print(f"Average daily revenue: ${revenue_series.mean():,.2f}")
print(f"Revenue growth: {growth_pct:.1f}%")
print(f"Best month: {best_month}")
print(f"Average ROI: {avg_roi:.2f}")

print("\nKey Time Series Visualization Features:")
for feature in (
    "• Professional date formatting on axes",
    "• Moving averages to show trends",
    "• Dual y-axes for different scales",
    "• Seasonal and periodic analysis",
    "• Event annotations for context",
    "• Multiple time aggregations (daily, monthly, quarterly)",
):
    print(feature)

### Time Series Visualization with Datetime Objects

**Time series analysis** is crucial in business for tracking performance, identifying trends, and forecasting. Proper handling of datetime objects and axis formatting makes your visualizations more professional and easier to interpret.

**Key Concepts:**
- **Datetime objects** - Proper time data representation
- **Time-based aggregation** - Daily, weekly, monthly, quarterly summaries
- **Trend analysis** - Identifying patterns and seasonality
- **Axis formatting** - Professional date labels and tick marks
- **Annotations** - Highlighting important business events

**Common Business Applications:**
- Revenue and sales trends
- Stock price movements
- Seasonal performance analysis
- Marketing campaign effectiveness
- Customer acquisition over time

In [ ]:
# Exercise: draw a scatter plot of Units_Sold against Profit_Margin with hue='Product',
# then store the number of distinct products in 'num_products'.


# Distinct non-null values in the Product column.
product_column = business_data['Product']
num_products = product_column.nunique()

# Self-check for the exercise: the check expects exactly five products.
assert num_products == 5, "Should have 5 different products"
print(f"Correct! You created a scatter plot with {num_products} different product categories.")

In [ ]:
# Execute this cell to explore categorical encoding with hue, style and size

# Create figure with multiple examples of categorical encoding (2x3 grid)
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('Categorical Analysis with Hue and Style', fontsize=16, fontweight='bold')

# 1. Scatter plot with hue (color) by Region
plt.subplot(2, 3, 1)
sns.scatterplot(data=business_data, x='Marketing_Spend', y='Revenue',
                hue='Region', s=80, alpha=0.7)
plt.title('Revenue vs Marketing Spend\nby Region (Color-coded)')
plt.xlabel('Marketing Spend ($)')
plt.ylabel('Revenue ($)')

# 2. Scatter plot with both hue and style
plt.subplot(2, 3, 2)
sns.scatterplot(data=business_data, x='Customer_Age', y='Customer_Satisfaction',
                hue='Region', style='Product', s=80, alpha=0.7)
plt.title('Customer Analysis\nby Region (Color) and Product (Shape)')
plt.xlabel('Customer Age')
plt.ylabel('Customer Satisfaction')

# 3. Line plot showing trends by quarter and region
quarterly_data = business_data.groupby(['Quarter', 'Region'])['Revenue'].mean().reset_index()
plt.subplot(2, 3, 3)
sns.lineplot(data=quarterly_data, x='Quarter', y='Revenue',
             hue='Region', marker='o', linewidth=3, markersize=8)
plt.title('Average Revenue Trends\nby Quarter and Region')
plt.xlabel('Quarter')
plt.ylabel('Average Revenue ($)')

# 4. Box plot with custom palette
plt.subplot(2, 3, 4)
sns.boxplot(data=business_data, x='Product', y='Profit_Margin',
            palette='Set3', saturation=0.8)
plt.title('Profit Margin Distribution\nby Product Category')
plt.xlabel('Product Category')
plt.ylabel('Profit Margin')
plt.xticks(rotation=45)

# 5. Violin plot with split by region
plt.subplot(2, 3, 5)
# split=True needs exactly two hue levels, so keep only two regions.
subset_data = business_data[business_data['Region'].isin(['North', 'South'])]
sns.violinplot(data=subset_data, x='Quarter', y='Units_Sold',
               hue='Region', split=True, palette='viridis')
plt.title('Units Sold Distribution\nby Quarter (North vs South)')
plt.xlabel('Quarter')
plt.ylabel('Units Sold')

# 6. Categorical scatter with size encoding
plt.subplot(2, 3, 6)
# Draw ONE random sample so each point's size comes from the same row as its
# position (sampling twice pairs sizes with unrelated rows). The sample is
# capped at the table length so small datasets do not raise.
size_sample = business_data.sample(min(200, len(business_data)))
# stripplot's `size` is a single marker radius, so a per-point size has to go
# through scatterplot's `size` semantic; `sizes` sets the marker-area range.
sns.scatterplot(data=size_sample, x='Department', y='Revenue',
                hue='Region', size='Customer_Satisfaction', sizes=(20, 200),
                alpha=0.7)
plt.title('Revenue by Department\n(Size = Customer Satisfaction)')
plt.xlabel('Department')
plt.ylabel('Revenue ($)')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

print("=== CREATING BUSINESS PERFORMANCE SEGMENTS ===\n")

# Create performance segments for enhanced analysis: revenue and profit are
# each cut into three equal-width bands. This adds two categorical columns to
# business_data in place.
segment_labels = ['Low', 'Medium', 'High']
business_data['Revenue_Segment'] = pd.cut(business_data['Revenue'],
                                          bins=3, labels=segment_labels)

business_data['Profit_Segment'] = pd.cut(business_data['Profit'],
                                         bins=3, labels=segment_labels)

# Advanced categorical analysis with segments
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Business Performance Segmentation Analysis', fontsize=14, fontweight='bold')

# 1. Revenue segments by product
plt.subplot(1, 3, 1)
# The segment columns are categorical. observed=False is passed explicitly so
# every Low/Medium/High band is kept (as a zero count) regardless of the
# pandas default, and so recent pandas does not emit a FutureWarning.
segment_counts = (
    business_data
    .groupby(['Product', 'Revenue_Segment'], observed=False)
    .size()
    .unstack(fill_value=0)
)
segment_counts.plot(kind='bar', stacked=True, ax=plt.gca(),
                    colormap='RdYlGn', alpha=0.8)
plt.title('Revenue Segments by Product')
plt.xlabel('Product')
plt.ylabel('Count')
plt.legend(title='Revenue Segment')
plt.xticks(rotation=45)

# 2. Performance matrix heatmap: row count for every revenue/profit band pair
plt.subplot(1, 3, 2)
performance_matrix = (
    business_data
    .groupby(['Revenue_Segment', 'Profit_Segment'], observed=False)
    .size()
    .unstack(fill_value=0)
)
sns.heatmap(performance_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Performance Matrix\n(Revenue vs Profit Segments)')
plt.xlabel('Profit Segment')
plt.ylabel('Revenue Segment')

# 3. Customer satisfaction by performance segments
plt.subplot(1, 3, 3)
sns.boxplot(data=business_data, x='Revenue_Segment', y='Customer_Satisfaction',
            hue='Profit_Segment', palette='pastel')
plt.title('Customer Satisfaction\nby Performance Segments')
plt.xlabel('Revenue Segment')
plt.ylabel('Customer Satisfaction')

plt.tight_layout()
plt.show()

print("Key Benefits of Categorical Encoding:")
print("• Color (hue) - Distinguish different groups clearly")
print("• Shape (style) - Add another dimension without color confusion")
print("• Size - Encode continuous variables as point/line sizes")
print("• Segments - Create meaningful business categories for analysis")
print("• Multiple dimensions - Analyze 4-5 variables simultaneously")

### Using Hue and Style for Categorical Analysis

One of Seaborn's most powerful features is the ability to **encode additional dimensions** of data using visual properties like color (`hue`), shape (`style`), and size. This is essential for business analysis where you often need to compare performance across different:

- **Regions** or territories
- **Product categories** or lines
- **Time periods** (quarters, years)
- **Customer segments** or demographics
- **Performance levels** (high/medium/low)

**Key Parameters:**
- **`hue`** - Color-code by categorical variable
- **`style`** - Change point/line style by category
- **`size`** - Vary point/line size by numerical variable
- **`palette`** - Control color schemes for professional presentation

In [ ]:
# Exercise: draw a regression plot (sns.regplot) of Customer_Age against Revenue,
# then store their correlation coefficient in 'age_revenue_corr'.


# Series.corr computes the Pearson correlation by default.
customer_ages = business_data['Customer_Age']
revenues = business_data['Revenue']
age_revenue_corr = customer_ages.corr(revenues)

# Self-check for the exercise: a correlation coefficient must lie in [-1, 1].
assert -1 <= age_revenue_corr <= 1, "Correlation should be between -1 and 1"
print(f"Correct! The correlation between Customer Age and Revenue is {age_revenue_corr:.3f}")

In [ ]:
# Run this cell to explore regression plots (the first of three analyses in this cell).

print("=== REGRESSION ANALYSIS WITH SEABORN ===\n")

# A 2x2 grid: three single regressions plus one per-region overlay.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Business Regression Analysis', fontsize=16, fontweight='bold')

# Panels 1-3 differ only in their columns, line colour and text:
# (grid slot, x column, y column, line colour, title, x label, y label)
single_regressions = [
    (1, 'Marketing_Spend', 'Revenue', 'red',
     'Revenue vs Marketing Spend Regression', 'Marketing Spend ($)', 'Revenue ($)'),
    (2, 'Customer_Age', 'Customer_Satisfaction', 'green',
     'Customer Age vs Satisfaction', 'Customer Age', 'Customer Satisfaction Score'),
    (3, 'Units_Sold', 'Profit_Margin', 'purple',
     'Units Sold vs Profit Margin', 'Units Sold', 'Profit Margin'),
]
for slot, x_col, y_col, line_color, panel_title, x_label, y_label in single_regressions:
    plt.subplot(2, 2, slot)
    sns.regplot(
        data=business_data,
        x=x_col,
        y=y_col,
        scatter_kws={'alpha': 0.6, 's': 50},
        line_kws={'color': line_color},
    )
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

# Panel 4: one regression per region drawn onto the same axes
# (a manual equivalent of lmplot with hue).
plt.subplot(2, 2, 4)
region_axis = plt.gca()
for region in business_data['Region'].unique():
    in_region = business_data['Region'] == region
    region_data = business_data[in_region]
    sns.regplot(
        data=region_data,
        x='Marketing_Spend',
        y='Profit',
        scatter_kws={'alpha': 0.6, 's': 40},
        label=region,
        ax=region_axis,
    )
plt.title('Marketing Spend vs Profit by Region')
plt.xlabel('Marketing Spend ($)')
plt.ylabel('Profit ($)')
plt.legend()

plt.tight_layout()
plt.show()

print("=== PAIRPLOT FOR COMPREHENSIVE ANALYSIS ===\n")

# Create a subset of key business metrics for pairplot; Region is kept only
# as the grouping (hue) column. The copy leaves business_data untouched.
key_metrics = business_data[['Revenue', 'Marketing_Spend', 'Customer_Satisfaction',
                             'Profit_Margin', 'Units_Sold', 'Region']].copy()

# Create pairplot with region as the grouping variable: scatter plots off the
# diagonal, histograms on it.
pairplot_fig = sns.pairplot(data=key_metrics, hue='Region',
                            diag_kind='hist', plot_kws={'alpha': 0.6, 's': 50},
                            height=2.5)
# `.figure` is the supported accessor for the grid's Figure; `.fig` is deprecated.
# y > 1 lifts the title above the top row of panels.
pairplot_fig.figure.suptitle('Business Metrics Pairplot Analysis',
                             fontsize=14, fontweight='bold', y=1.02)
plt.show()

print("=== CORRELATION HEATMAP ===\n")

# Pairwise correlations between the numeric business metrics.
numerical_cols = ['Revenue', 'Marketing_Spend', 'Customer_Satisfaction',
                  'Profit_Margin', 'Units_Sold', 'Customer_Age', 'Profit']
correlation_matrix = business_data[numerical_cols].corr()

# Annotated heatmap; center=0 puts zero correlation at the colormap midpoint.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='RdBu_r',
    center=0,
    square=True,
    fmt='.2f',
    linewidths=0.5,
)
plt.title('Business Metrics Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Report a handful of headline coefficients: (printed label, matrix row, matrix column).
print("Key Business Correlations:")
headline_pairs = [
    ('Revenue vs Marketing Spend', 'Revenue', 'Marketing_Spend'),
    ('Customer Age vs Satisfaction', 'Customer_Age', 'Customer_Satisfaction'),
    ('Revenue vs Profit', 'Revenue', 'Profit'),
    ('Units Sold vs Revenue', 'Units_Sold', 'Revenue'),
]
for pair_label, row_name, col_name in headline_pairs:
    print(f"{pair_label}: {correlation_matrix.loc[row_name, col_name]:.3f}")

print("\nInsights from Analysis:")
for insight in (
    "• Regression plots show trend lines with confidence intervals",
    "• Pairplots reveal relationships between multiple variables simultaneously",
    "• Correlation heatmaps quantify the strength of relationships",
    "• These tools help identify business drivers and optimization opportunities",
):
    print(insight)

### Regression Analysis and Pairplots

Two of Seaborn's most powerful features for business analysis are **regression plots** and **pairplots**:

**Regression Plots (`sns.regplot()`):**
- Show relationships between variables with trend lines
- Include a confidence band around the fitted regression line
- Ideal for analyzing correlations in business metrics
- Help identify potential associations (correlation alone does not prove causation) and outliers

**Pairplots (`sns.pairplot()`):**
- Display relationships between multiple variables simultaneously
- Show distributions on the diagonal
- Use color coding to distinguish groups
- Perfect for exploratory data analysis
- Reveal hidden patterns in business data

These tools are invaluable for understanding complex business relationships and making data-driven decisions.

In [ ]:
# Run this cell for a tour of Seaborn's statistical plot types.

# Global look: white background with grid lines and the "Set2" colour cycle.
sns.set_style("whitegrid")
sns.set_palette("Set2")

fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('Seaborn for Business Analytics', fontsize=16, fontweight='bold')

# 1. Scatter plot: region is encoded twice, by colour and by marker shape.
panel = plt.subplot(2, 3, 1)
sns.scatterplot(
    data=business_data, x='Marketing_Spend', y='Revenue',
    hue='Region', style='Region', s=80, alpha=0.7, ax=panel,
)
panel.set_title('Revenue vs Marketing Spend by Region')
panel.set_xlabel('Marketing Spend ($)')
panel.set_ylabel('Revenue ($)')

# 2. Bar plot of mean revenue per product with confidence-interval error bars.
panel = plt.subplot(2, 3, 2)
sns.barplot(
    data=business_data, x='Product', y='Revenue',
    errorbar='ci', capsize=0.1, alpha=0.8, ax=panel,
)
panel.set_title('Average Revenue by Product\n(with 95% Confidence Intervals)')
panel.set_xlabel('Product Category')
panel.set_ylabel('Average Revenue ($)')
plt.xticks(rotation=45)

# 3. Box plot per region; outlier points are drawn (showfliers=True).
panel = plt.subplot(2, 3, 3)
sns.boxplot(
    data=business_data, x='Region', y='Customer_Satisfaction',
    palette='viridis', showfliers=True, ax=panel,
)
panel.set_title('Customer Satisfaction Distribution by Region')
panel.set_xlabel('Region')
panel.set_ylabel('Customer Satisfaction Score')

# 4. Histogram of profit margins with a kernel density curve on top.
panel = plt.subplot(2, 3, 4)
sns.histplot(
    data=business_data, x='Profit_Margin', kde=True,
    bins=25, alpha=0.7, color='skyblue', ax=panel,
)
panel.set_title('Distribution of Profit Margins\n(with Density Curve)')
panel.set_xlabel('Profit Margin')
panel.set_ylabel('Frequency')

# 5. Violin plot per quarter; inner='quart' draws the quartile lines.
panel = plt.subplot(2, 3, 5)
sns.violinplot(
    data=business_data, x='Quarter', y='Units_Sold',
    palette='muted', inner='quart', ax=panel,
)
panel.set_title('Units Sold Distribution by Quarter')
panel.set_xlabel('Quarter')
panel.set_ylabel('Units Sold')

# 6. Count plot with departments ordered from most to fewest records.
panel = plt.subplot(2, 3, 6)
departments_by_count = business_data['Department'].value_counts().index
sns.countplot(
    data=business_data, x='Department',
    order=departments_by_count, palette='pastel', ax=panel,
)
panel.set_title('Number of Records by Department')
panel.set_xlabel('Department')
panel.set_ylabel('Count')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

print("Seaborn Advantages for Business Analysis:")
for advantage in (
    "• Automatic statistical summaries (error bars, confidence intervals)",
    "• Easy categorical data handling with 'hue' parameter",
    "• Professional themes suitable for presentations",
    "• Built-in statistical tests and visualizations",
    "• Seamless integration with pandas DataFrames",
):
    print(advantage)

### Seaborn for Statistical Visualization

**Seaborn** builds on matplotlib to provide beautiful, statistical visualizations with minimal code. It's particularly powerful for business analysis because it:

- **Handles categorical data** automatically
- **Provides statistical summaries** built into plots
- **Uses attractive default themes** suitable for presentations
- **Integrates seamlessly** with pandas DataFrames
- **Offers specialized plots** for correlation and regression analysis

**Key Seaborn Plot Types:**
- **`sns.scatterplot()`** - Enhanced scatter plots with grouping
- **`sns.lineplot()`** - Time series and trend analysis
- **`sns.barplot()`** - Statistical bar plots with error bars
- **`sns.boxplot()`** - Distribution comparison across groups
- **`sns.histplot()`** - Modern histograms and density plots
- **`sns.heatmap()`** - Correlation matrices and pivot tables
- **`sns.regplot()`** - Regression analysis with confidence intervals
- **`sns.pairplot()`** - Multi-variable relationship exploration

In [ ]:
# Exercise: create a simple bar plot showing total revenue by product category.
# Step 1: from the business_data DataFrame, compute the total 'Revenue' for each
#         'Product' and store that aggregated result in 'product_revenue'
#         (the checks below call len() and .sum() on it, so it must be the
#         per-product totals, not the object returned by the plotting call).
# Step 2: draw a bar plot of 'product_revenue'.
# Write your code in the blank lines below; until 'product_revenue' is defined,
# the verification will fail with a NameError.


# Verify the exercise
assert len(product_revenue) == 5, "Should have 5 product categories"
assert product_revenue.sum() > 0, "Should have positive revenue values"
print("Correct! You created a bar plot of revenue by product category.")

In [ ]:
# Execute this cell to see basic plot types in action
#
# Builds a 2x3 grid showing six core matplotlib plot types, each driven by a
# different aggregation or column of the business_data DataFrame.

# Create a figure with multiple subplots to show different plot types
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Essential Business Plot Types', fontsize=16, fontweight='bold')

# 1. Line Plot - Revenue trends over time
line_ax = axes[0, 0]
monthly_revenue = business_data.groupby('Month')['Revenue'].mean()
line_ax.plot(monthly_revenue.index, monthly_revenue.values, marker='o', linewidth=2, markersize=6)
line_ax.set_title('Average Monthly Revenue Trend')
line_ax.set_xlabel('Month')
line_ax.set_ylabel('Average Revenue ($)')
line_ax.grid(True, alpha=0.3)

# 2. Bar Plot - Revenue by Region
bar_ax = axes[0, 1]
regional_revenue = business_data.groupby('Region')['Revenue'].sum()
bars = bar_ax.bar(regional_revenue.index, regional_revenue.values, color='steelblue', alpha=0.7)
bar_ax.set_title('Total Revenue by Region')
bar_ax.set_xlabel('Region')
bar_ax.set_ylabel('Total Revenue ($)')
# Add value labels on bars, centred horizontally and lifted 1% above the bar top
for bar in bars:
    height = bar.get_height()
    bar_ax.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                f'${height:,.0f}', ha='center', va='bottom', fontsize=10)

# 3. Scatter Plot - Revenue vs Marketing Spend
scatter_ax = axes[0, 2]
scatter_ax.scatter(business_data['Marketing_Spend'], business_data['Revenue'],
                   alpha=0.6, color='coral', s=50)
scatter_ax.set_title('Revenue vs Marketing Spend')
scatter_ax.set_xlabel('Marketing Spend ($)')
scatter_ax.set_ylabel('Revenue ($)')
scatter_ax.grid(True, alpha=0.3)

# 4. Histogram - Distribution of Customer Ages
hist_ax = axes[1, 0]
hist_ax.hist(business_data['Customer_Age'], bins=20, color='lightgreen',
             alpha=0.7, edgecolor='black', linewidth=0.5)
hist_ax.set_title('Distribution of Customer Ages')
hist_ax.set_xlabel('Customer Age')
hist_ax.set_ylabel('Frequency')
hist_ax.grid(True, alpha=0.3)

# 5. Box Plot - Profit Margin by Product
box_ax = axes[1, 1]
# Order the boxes from the highest to the lowest median profit margin.
product_order = business_data.groupby('Product')['Profit_Margin'].median().sort_values(ascending=False).index
box_data = [business_data[business_data['Product'] == product]['Profit_Margin'] for product in product_order]
bp = box_ax.boxplot(box_data, patch_artist=True)
# Label the boxes after drawing: boxplot's `labels=` keyword is deprecated in
# Matplotlib 3.9 (renamed `tick_labels`), while set_xticklabels works on both
# old and new releases because boxplot fixes one tick per box.
box_ax.set_xticklabels(product_order)
box_ax.set_title('Profit Margin Distribution by Product')
box_ax.set_xlabel('Product')
box_ax.set_ylabel('Profit Margin')
box_ax.tick_params(axis='x', rotation=45)
# Color the boxes
colors = ['lightblue', 'lightcoral', 'lightgreen', 'lightyellow', 'lightpink']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)

# 6. Horizontal Bar Plot - Customer Satisfaction by Department
barh_ax = axes[1, 2]
dept_satisfaction = business_data.groupby('Department')['Customer_Satisfaction'].mean().sort_values()
barh_ax.barh(dept_satisfaction.index, dept_satisfaction.values, color='purple', alpha=0.6)
barh_ax.set_title('Average Customer Satisfaction by Department')
barh_ax.set_xlabel('Average Customer Satisfaction')
barh_ax.set_ylabel('Department')
# Add value labels just to the right of each bar (bar i sits at y position i)
for i, v in enumerate(dept_satisfaction.values):
    barh_ax.text(v + 0.05, i, f'{v:.2f}', va='center', fontsize=10)

plt.tight_layout()
plt.show()

print("Key Insights from the plots:")
print("1. Line Plot: Shows seasonal or temporal patterns in revenue")
print("2. Bar Plot: Easily compare performance across categories")
print("3. Scatter Plot: Reveals relationships between business metrics")
print("4. Histogram: Understand customer demographics and distributions")
print("5. Box Plot: Compare distributions and identify outliers")
print("6. Horizontal Bar Plot: Good for comparing many categories or long labels")

### Basic Plot Types for Business Analysis

Let's start with the fundamental plot types that every business analyst should know. Each plot type serves specific purposes in business analysis and decision-making.

**When to Use Each Plot Type:**
- **Line plots**: Time series data, trends, performance over time
- **Bar plots**: Comparing categories, showing rankings, displaying counts
- **Scatter plots**: Relationships between variables, correlation analysis
- **Histograms**: Distribution analysis, understanding data spread
- **Box plots**: Comparing groups, identifying outliers, quartile analysis

<div style="background-image: url('https://www.dropbox.com/scl/fi/wdrnuojbnjx6lgfekrx85/mcnair.jpg?rlkey=wcbaw5au7vh5vt1g5d5x7fw8f&dl=1'); background-size: cover; background-position: center; height: 300px; display: flex; align-items: center; justify-content: center; color: white; text-shadow: 2px 2px 4px rgba(0,0,0,0.7); margin-bottom: 20px; position: relative;">
  <h1 style="text-align: center; font-size: 2.5em; margin: 0;">JGSB Python Workshop <br> Part 8: Visualization</h1>
  <div style="position: absolute; bottom: 10px; left: 15px; font-size: 0.9em; color: white; text-shadow: 2px 2px 4px rgba(0,0,0,0.7);">
    Authored by Kerry Back
  </div>
  <div style="position: absolute; bottom: 10px; right: 15px; text-align: right; font-size: 0.9em; color: white; text-shadow: 2px 2px 4px rgba(0,0,0,0.7);">
    Rice University, 9/6/2025
  </div>
</div>

### Introduction to Data Visualization

**Data visualization** is the graphical representation of information and data. For business professionals, effective visualization is crucial for:

- **Decision Making**: Quickly identify trends, patterns, and outliers
- **Communication**: Present findings clearly to stakeholders and clients
- **Analysis**: Explore relationships between business variables
- **Monitoring**: Track KPIs and performance metrics over time
- **Reporting**: Create professional charts for presentations and reports

**Why Python for Business Visualization?**
- **Matplotlib**: The foundation library providing complete control over plot elements
- **Seaborn**: Statistical visualization with beautiful defaults and business-friendly themes
- **Integration**: Works seamlessly with pandas for data analysis
- **Customization**: Professional styling for presentations and reports
- **Automation**: Generate reports and dashboards programmatically

**Key Libraries:**
```python
import matplotlib.pyplot as plt  # Core plotting
import seaborn as sns           # Statistical plots
import pandas as pd            # Data manipulation
import numpy as np            # Numerical operations
```

**This Workshop Covers:**
- Basic plot types for business analysis
- Professional styling and customization
- Time series visualization for financial data
- Comparative analysis and dashboards
- Advanced statistical plots
- Best practices for business presentations

In [ ]:
# Execute this cell to import all necessary libraries and set up the environment

import warnings
from datetime import datetime, timedelta

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Set up matplotlib and seaborn for better looking plots
plt.style.use('default')
sns.set_palette("husl")
warnings.filterwarnings('ignore')

# Configure plot settings for notebooks
%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12

print("Libraries imported successfully!")
print("Matplotlib version:", plt.__version__)
print("Seaborn version:", sns.__version__)
print("Pandas version:", pd.__version__)

# Create sample business dataset for our examples
np.random.seed(42)  # For reproducible results

# Generate sample business data
dates = pd.date_range('2023-01-01', '2024-12-31', freq='D')
regions = ['North', 'South', 'East', 'West']
products = ['Laptop', 'Desktop', 'Tablet', 'Phone', 'Accessories']
departments = ['Sales', 'Marketing', 'Engineering', 'HR', 'Finance']

# Create comprehensive business dataset
business_data = pd.DataFrame({
    'Date': np.random.choice(dates, 1000),
    'Region': np.random.choice(regions, 1000),
    'Product': np.random.choice(products, 1000),
    'Department': np.random.choice(departments, 1000),
    'Revenue': np.random.normal(10000, 3000, 1000).clip(1000, 50000),
    'Units_Sold': np.random.poisson(50, 1000),
    'Customer_Age': np.random.normal(35, 12, 1000).clip(18, 80),
    'Customer_Satisfaction': np.random.normal(4.0, 0.8, 1000).clip(1, 5),
    'Marketing_Spend': np.random.exponential(2000, 1000),
    'Profit_Margin': np.random.normal(0.25, 0.1, 1000).clip(0.05, 0.6)
})

# Add some calculated fields
business_data['Profit'] = business_data['Revenue'] * business_data['Profit_Margin']
business_data['Month'] = business_data['Date'].dt.month
business_data['Quarter'] = business_data['Date'].dt.quarter
business_data['Weekday'] = business_data['Date'].dt.day_name()

print("\nSample business dataset created with", len(business_data), "records")
print("\nFirst few rows:")
print(business_data.head())