In [0]:
import plotly.graph_objects as go

# Monthly sales, profit and distinct-order counts for calendar year 2021.
# The date filter is a half-open range [2021-01-01, 2022-01-01): BETWEEN is
# inclusive on both ends, so the previous filter also pulled in 2022-01-01 and
# produced a stray one-day "2022-01" bar.
# NOTE(review): monthly_profit sums `unit_profit` as stored; if that column is
# profit per unit rather than per fact row it must be multiplied by quantity —
# confirm against the fact table definition.
df = spark.sql("""
SELECT
    d.year,
    d.month,
    ROUND(SUM(fs.sales_amount), 2) AS monthly_sales,
    ROUND(SUM(fs.unit_profit), 2) AS monthly_profit,
    COUNT(DISTINCT fs.order_id) AS monthly_orders
FROM jaffle_shop_retail.gold.fact_sales fs
JOIN jaffle_shop_retail.gold.dim_date d
    ON fs.date_key = d.date_key
WHERE d.date >= DATE '2021-01-01'
  AND d.date < DATE '2022-01-01'
GROUP BY d.year, d.month
ORDER BY d.year, d.month
""")

# Convert to Pandas DataFrame for visualization (one row per month)
pdf = df.toPandas()

# Build a zero-padded 'YYYY-MM' label for the x-axis so months sort and read
# consistently (e.g. 2021-03 rather than 2021-3)
pdf['year_month'] = (
    pdf['year'].astype(str) + '-' + pdf['month'].astype(str).str.zfill(2)
)

x = pdf['year_month']

# Two bar series per month, drawn side by side (barmode='group' below)
fig = go.Figure()
fig.add_bar(
    x=x,
    y=pdf['monthly_sales'],
    name='Monthly Sales',
    marker_color='steelblue',
    text=pdf['monthly_sales'],
    textposition='outside'
)
fig.add_bar(
    x=x,
    y=pdf['monthly_profit'],
    name='Monthly Profit',
    marker_color='mediumseagreen',
    text=pdf['monthly_profit'],
    textposition='outside'
)

# The title states the period actually selected by the query above
fig.update_layout(
    barmode='group',
    xaxis_tickangle=-45,
    xaxis_title='Year-Month',
    yaxis_title='Amount',
    title='Monthly Sales and Profit (2021)',
    legend_title='Metric',
    height=500,
    width=1000
)

fig.show()

In [0]:
import plotly.graph_objects as go

# Aggregate orders, customers and revenue per day type from the gold layer.
# Only day_type and total_revenue are plotted below; the other columns are
# available in `pdf` for inspection.
# NOTE(review): avg_order_value is AVG over fact rows; whether that equals a
# per-order average depends on the grain of fact_sales — confirm.
df = spark.sql("""
SELECT 
    d.day_type,
    COUNT(DISTINCT fs.order_id) AS total_orders,
    COUNT(DISTINCT fs.customer_id) AS unique_customers,
    AVG(fs.sales_amount) AS avg_order_value,
    SUM(fs.sales_amount) AS total_revenue
FROM jaffle_shop_retail.gold.fact_sales fs
JOIN jaffle_shop_retail.gold.dim_date d ON fs.date_key = d.date_key
GROUP BY d.day_type
ORDER BY d.day_type
""")

# Collect the aggregate to pandas for plotting
pdf = df.toPandas()

# Donut trace: one slice per day type, sized by total revenue
revenue_by_day_type = go.Pie(
    hole=0.3,
    textinfo='percent+label',
    hoverinfo='label+percent+value',
    values=pdf['total_revenue'],
    labels=pdf['day_type'],
)

fig = go.Figure(data=[revenue_by_day_type])
fig.update_layout(
    width=800,
    height=600,
    title='Total Revenue Distribution by Day Type',
)
fig.show()

In [0]:
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display as ipy_display

# Orders-per-customer ratio for each day type.
# The result is stored under cell-specific names: the chart functions below
# look the frame up when they are *called*, so a generic global such as `pdf`
# that another cell reassigns would make a later redraw fail on a frame that
# has no `day_type` column.
day_type_df = spark.sql("""
SELECT 
    d.day_type,
    COUNT(DISTINCT fs.order_id) AS total_orders,
    COUNT(DISTINCT fs.customer_id) AS unique_customers,
    (COUNT(DISTINCT fs.order_id) * 1.0 / COUNT(DISTINCT fs.customer_id)) AS avg_orders_per_customer
FROM jaffle_shop_retail.gold.fact_sales fs
JOIN jaffle_shop_retail.gold.dim_date d ON fs.date_key = d.date_key
GROUP BY d.day_type
ORDER BY d.day_type
""")

# Convert to Pandas DataFrame for visualization
day_type_pdf = day_type_df.toPandas()


def get_colors(bar_color):
    """Return one color per row of `day_type_pdf`.

    Rows whose day type is 'weekend' (case-insensitive) get 'orange'; every
    other row gets `bar_color`.
    """
    return [
        'orange' if dt.lower() == 'weekend' else bar_color
        for dt in day_type_pdf['day_type']
    ]


def update_chart(bar_color):
    """Draw the average-orders-per-customer bar chart.

    Args:
        bar_color: Color for the non-weekend bars (weekend bars are orange).
    """
    ratios = day_type_pdf['avg_orders_per_customer']
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(day_type_pdf['day_type'], ratios, color=get_colors(bar_color))
    # Print each bar's value just above its top edge
    for position, ratio in enumerate(ratios):
        ax.text(
            position, ratio, f'{ratio:.2f}',
            ha='center', va='bottom', fontweight='bold'
        )
    ax.set_xlabel('Day Type')
    ax.set_ylabel('Average Orders per Customer')
    ax.set_title('Average Orders per Customer by Day Type')
    fig.tight_layout()
    plt.show()

# Color selector for the non-weekend bars. The widget has to be enabled:
# a disabled picker cannot be changed from the UI, so the observer registered
# below would never run.
color_picker = widgets.ColorPicker(
    value='steelblue',
    description='Bar Color:',
    disabled=False
)

# Output area owned by this chart. The handler resolves this name each time it
# runs, so it is given a unique name; a generic global such as `out` can be
# rebound by another cell, which would send redraws to the wrong output area.
color_out = widgets.Output()


def on_color_change(change):
    """Redraw the chart with the newly picked color.

    Args:
        change: Change notification passed by `observe`; `change['new']`
            holds the new widget value.
    """
    with color_out:
        # wait=True defers clearing until the replacement output arrives
        color_out.clear_output(wait=True)
        update_chart(change['new'])


color_picker.observe(on_color_change, names='value')

# Show the control, render the initial chart into the output area, then show it
ipy_display(color_picker)
with color_out:
    update_chart(color_picker.value)
ipy_display(color_out)

In [0]:
# Calculate average lifetime value (LTV) by customer segment.
# LTV is a per-customer total, so the segment average is the segment's total
# sales divided by its number of distinct customers. A plain AVG(sales_amount)
# would instead give the mean amount of a single fact row.
# NOTE(review): assumes dim_customers holds one row per customer_id; if it
# keeps history rows the join would duplicate sales — confirm.
df = spark.sql("""
SELECT
    c.customer_segment AS tier,
    SUM(fs.sales_amount) / COUNT(DISTINCT fs.customer_id) AS avg_lifetime_value
FROM jaffle_shop_retail.gold.fact_sales fs
JOIN jaffle_shop_retail.gold.dim_customers c ON fs.customer_id = c.customer_id
GROUP BY c.customer_segment
ORDER BY c.customer_segment
""")

# Collect the per-segment aggregate to pandas for plotting
pdf = df.toPandas()

import matplotlib.pyplot as plt

# Horizontal bars: one per customer tier
plt.figure(figsize=(10, 6))
plt.barh(
    pdf['tier'],
    pdf['avg_lifetime_value'],
    color='mediumseagreen'
)
# Write each value at the end of its bar
for i, v in enumerate(pdf['avg_lifetime_value']):
    plt.text(v, i, f'{v:.2f}', va='center', ha='left', fontweight='bold')
plt.xlabel('Average Lifetime Value')
plt.ylabel('Customer Tier')
plt.title('Average Lifetime Value by Customer Tier')
plt.tight_layout()
plt.show()

In [0]:
# Sales distribution by quarter for a single year (e.g., 2021)

import ipywidgets as widgets
from IPython.display import display as ipy_display
import matplotlib.pyplot as plt

def plot_quarter_pie(selected_year):
    """Draw a pie chart of total sales split by quarter for one year.

    Args:
        selected_year: Year matched against `dim_date.year`. It is coerced
            with `int()` before being placed into the SQL text, so a
            non-numeric value raises `ValueError` instead of reaching the
            query.

    When the query returns no rows for the year, a short message is printed
    and no chart is drawn.
    """
    year = int(selected_year)
    df_quarter = spark.sql(f"""
    SELECT 
        d.year,
        d.quarter,
        SUM(fs.sales_amount) AS total_sales
    FROM jaffle_shop_retail.gold.fact_sales fs
    JOIN jaffle_shop_retail.gold.dim_date d 
        ON fs.date_key = d.date_key
    WHERE d.year = {year}
    GROUP BY d.year, d.quarter
    ORDER BY d.quarter
    """)
    pdf_quarter = df_quarter.toPandas()

    # The year selector can point at a year with no sales; skip the chart
    # rather than drawing an empty pie.
    if pdf_quarter.empty:
        print(f'No sales data found for {year}.')
        return

    # Slice labels of the form '<year>-Q<quarter>'
    pdf_quarter['year_quarter'] = (
        pdf_quarter['year'].astype(str) + '-Q' + pdf_quarter['quarter'].astype(str)
    )
    pdf_pie = pdf_quarter.groupby('year_quarter')['total_sales'].sum().reset_index()

    plt.figure(figsize=(8, 6))
    plt.pie(
        pdf_pie['total_sales'],
        labels=pdf_pie['year_quarter'],
        autopct='%1.1f%%',
        startangle=140
    )
    plt.title(f'Sales Distribution by Quarter ({year})')
    # Equal aspect ratio keeps the pie circular
    plt.axis('equal')
    plt.tight_layout()
    plt.show()

# Year selector for the quarterly pie chart; continuous_update=False is set so
# the value (and therefore the query) is not updated while the handle is dragged
year_picker = widgets.IntSlider(
    value=2021,
    min=2020,
    max=2025,
    step=1,
    description='Year:',
    continuous_update=False
)

# Output area owned by this chart. The handler resolves this name each time it
# runs, so it is given a unique name; a generic global such as `out` can be
# rebound by another cell, which would send redraws to the wrong output area.
year_out = widgets.Output()


def on_year_change(change):
    """Redraw the quarterly pie for the newly selected year.

    Args:
        change: Change notification passed by `observe`; `change['new']`
            holds the new slider value.
    """
    with year_out:
        # wait=True defers clearing until the replacement output arrives
        year_out.clear_output(wait=True)
        plot_quarter_pie(change['new'])


year_picker.observe(on_year_change, names='value')

# Show the control, render the initial chart into the output area, then show it
ipy_display(year_picker)
with year_out:
    plot_quarter_pie(year_picker.value)
ipy_display(year_out)

In [0]:
# Sales performance heatmap: store vs product type

# Total sales for every (store name, product type) pair in the gold fact table.
# NOTE(review): the product join matches fs.product_key to p.product_id, while
# the store join matches store_key to store_key — confirm these two product
# columns are meant to line up.
df = spark.sql("""
SELECT 
    s.store_name,
    p.product_type,
    SUM(fs.sales_amount) AS total_sales
FROM jaffle_shop_retail.gold.fact_sales fs
JOIN jaffle_shop_retail.gold.dim_stores s 
    ON fs.store_key = s.store_key
JOIN jaffle_shop_retail.gold.dim_products p 
    ON fs.product_key = p.product_id
GROUP BY s.store_name, p.product_type
""")

# Collect the aggregate to pandas so it can be pivoted and plotted
pdf = df.toPandas()

import seaborn as sns
import matplotlib.pyplot as plt

# Reshape the long (store, product type, total) rows into a grid with one row
# per store and one column per product type
pivot = pdf.pivot(columns='product_type', index='store_name', values='total_sales')

# Annotated heatmap of the grid; darker red marks higher total sales
heat_fig, heat_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pivot, cmap="Reds", annot=True, fmt=".0f", ax=heat_ax)
heat_ax.set_title('Sales Performance: Store vs Product Type')
heat_ax.set_xlabel('Product Type')
heat_ax.set_ylabel('Store Name')
heat_fig.tight_layout()
plt.show()

In [0]:
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display as ipy_display

def format_dollar(value):
    """Format a number as a compact dollar amount.

    Magnitudes of at least one thousand, million or billion are scaled and
    suffixed with K, M or B; smaller magnitudes are shown in full with a
    thousands separator. All forms use two decimal places.

    Args:
        value: Numeric amount; may be negative.

    Returns:
        The formatted string. Negative amounts carry the minus sign before
        the dollar symbol (e.g. '-$1.50K').
    """
    # Work on the magnitude and re-attach the sign in front of the '$'
    sign = "-" if value < 0 else ""
    abs_val = abs(value)
    if abs_val >= 1_000_000_000:
        return f"{sign}${abs_val/1_000_000_000:.2f}B"
    elif abs_val >= 1_000_000:
        return f"{sign}${abs_val/1_000_000:.2f}M"
    elif abs_val >= 1_000:
        return f"{sign}${abs_val/1_000:.2f}K"
    else:
        return f"{sign}${abs_val:,.2f}"

# Single-row summary of the whole fact table: distinct customer, product and
# order counts plus rounded sales and profit totals
df_metrics = spark.sql(
    """
    SELECT
        COUNT(DISTINCT customer_id) AS total_customers,
        COUNT(DISTINCT product_key) AS total_products,
        COUNT(DISTINCT order_id) AS total_orders,
        ROUND(SUM(sales_amount), 2) AS total_sales,
        ROUND(SUM(unit_profit), 2) AS total_profit
    FROM jaffle_shop_retail.gold.fact_sales
    """
)

# The query yields exactly one row; take it as a Series keyed by column alias
metrics = df_metrics.toPandas().iloc[0]

# Card layout, left to right: (label, metrics column, accent color, is_dollar)
_card_specs = [
    ("Total Customers", "total_customers", "#4F8EF7", False),
    ("Total Orders", "total_orders", "#43AA8B", False),
    ("Total Sales", "total_sales", "#F76E4F", True),
    ("Total Profit", "total_profit", "#FFD166", True),
    ("Total Products", "total_products", "#8D5CF6", False),
]

# Count cards carry plain ints; dollar cards keep the value as returned
key_cards = [
    {
        "label": label,
        "value": metrics[column] if is_dollar else int(metrics[column]),
        "color": color,
        "is_dollar": is_dollar,
    }
    for label, column, color, is_dollar in _card_specs
]

def plot_key_cards(fontsize=18):
    """Render the entries of `key_cards` as a row of colored KPI tiles.

    Each tile is a unit square holding the formatted value (drawn at
    `fontsize + 4`, bold, in the card color) above the card label (drawn at
    `fontsize`, black).

    Args:
        fontsize: Font size of the labels.
    """
    n_cards = len(key_cards)
    fig, ax = plt.subplots(figsize=(2 * n_cards, 2))

    for slot, card in enumerate(key_cards):
        accent = card["color"]
        # Tile background occupying x in [slot, slot + 1]
        ax.add_patch(
            plt.Rectangle((slot, 0), 1, 1, color=accent, alpha=0.15, ec=accent, lw=2)
        )

        # Dollar cards use the compact currency format, counts a thousands separator
        if card.get("is_dollar", False):
            display_value = format_dollar(card['value'])
        else:
            display_value = f"{card['value']:,}"

        center_x = slot + 0.5
        ax.text(
            center_x, 0.65, display_value,
            ha='center', va='center', fontsize=fontsize+4,
            fontweight='bold', color=accent
        )
        ax.text(
            center_x, 0.35, card['label'],
            ha='center', va='center', fontsize=fontsize, color='black'
        )

    # Hide axes and fit the view exactly around the row of tiles
    ax.axis('off')
    ax.set_xlim(0, n_cards)
    ax.set_ylim(0, 1)
    fig.tight_layout()
    plt.show()

# Font-size control for the KPI tiles; continuous_update=False is set so the
# value is not updated while the handle is being dragged
fontsize_slider = widgets.IntSlider(
    value=18,
    min=6,
    max=20,
    step=1,
    description='Font Size:',
    continuous_update=False
)

# Output area owned by the KPI tiles. The handler resolves this name each time
# it runs, so it is given a unique name; a generic global such as `out` is
# shared with any other cell that assigns it, which lets one cell's handler
# draw into another cell's output area.
cards_out = widgets.Output()


def on_fontsize_change(change):
    """Redraw the KPI tiles at the newly selected font size.

    Args:
        change: Change notification passed by `observe`; `change['new']`
            holds the new slider value.
    """
    with cards_out:
        # wait=True defers clearing until the replacement output arrives
        cards_out.clear_output(wait=True)
        plot_key_cards(change['new'])


fontsize_slider.observe(on_fontsize_change, names='value')

# Show the control, render the initial tiles into the output area, then show it
ipy_display(fontsize_slider)
with cards_out:
    plot_key_cards(fontsize_slider.value)
ipy_display(cards_out)