#### **Building Forecast-Ready Dataset with Weather & Metadata**

In [None]:
import pandas as pd
import os
import random, numpy as np, os
import json
import joblib

StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 3, Finished, Available, Finished)

In [None]:
# Seed Python's and NumPy's global random generators with the same fixed value
for seed_fn in (random.seed, np.random.seed):
    seed_fn(42)

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting
# it here likely only influences processes launched afterwards — confirm.
os.environ['PYTHONHASHSEED'] = str(42)

StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 4, Finished, Available, Finished)

In [None]:
# Read the imputed live-times fact table from the lakehouse into pandas
imputed_fact_live_times = spark.read.table("imputed_fact_live_times").toPandas()

# The rest of the notebook slices this frame by time, so it must end up with a
# DatetimeIndex: keep an existing one, otherwise promote the 'Datetime' column.
if isinstance(imputed_fact_live_times.index, pd.DatetimeIndex):
    print("Datetime index already set.")
elif 'Datetime' in imputed_fact_live_times.columns:
    imputed_fact_live_times = imputed_fact_live_times.set_index('Datetime').sort_index()
    print("Datetime column set as index.")
else:
    raise ValueError("The dataframe does not have a 'Datetime' column to set as index.")

StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 5, Finished, Available, Finished)

Datetime column set as index.


In [None]:
# Only these staged weather-forecast columns are used downstream
weather_columns = ["Datetime", "Temp", "IsRaining", "IsSnowing"]
weather_forecast_df = spark.read.table("stg_weather_forecast").select(*weather_columns).toPandas()

StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 6, Finished, Available, Finished)

In [None]:
# Latest timestamp present in the training data; the forecast horizon starts after it
max_train_time = imputed_fact_live_times.index.max()

# Normalise the timestamp column and derive the hour of day used for filtering
weather_forecast_df['Datetime'] = pd.to_datetime(weather_forecast_df['Datetime'])
weather_forecast_df['Hour'] = weather_forecast_df['Datetime'].dt.hour

# Keep rows that fall inside park operational hours (06:00-22:00 inclusive)
# AND lie strictly after the end of the training data
within_operating_hours = weather_forecast_df['Hour'].between(6, 22)
after_training_data = weather_forecast_df['Datetime'] > max_train_time
weather_forecast_df = weather_forecast_df[within_operating_hours & after_training_data]



StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 7, Finished, Available, Finished)

In [None]:
# Show the last and first timestamps of the forecast horizon (in that order)
forecast_times = weather_forecast_df['Datetime']
print(forecast_times.max())
print(forecast_times.min())

StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 8, Finished, Available, Finished)

2025-05-22 22:00:00
2025-05-16 06:00:00


In [None]:
# Directory where the per-park models are stored
model_dir = "/lakehouse/default/Files/models/live_times"

# Discover which parks have a model: one "<Park_Name>.joblib" file per park.
# os.listdir returns entries in arbitrary order, so sort for reproducible runs.
model_files = sorted(fn for fn in os.listdir(model_dir) if fn.endswith(".joblib"))
trained_parks = [
    os.path.splitext(fn)[0].replace("_", " ")
    for fn in model_files
]

# Create metadata dataframe (one row per park that has a trained model)
park_meta_df = pd.DataFrame({"ParkName": trained_parks})

# Cross join: every forecast hour x every trained park.
# how="cross" avoids writing a temporary join key into weather_forecast_df
# (a filtered slice), which would mutate it and can raise SettingWithCopyWarning.
forecast_df = (
    pd.merge(weather_forecast_df, park_meta_df, how="cross")
    .sort_values(by="Datetime")
    .reset_index(drop=True)
)
forecast_df.head()

StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 9, Finished, Available, Finished)

Unnamed: 0,Datetime,Temp,IsRaining,IsSnowing,Hour,ParkName
0,2025-05-16 06:00:00,8.6,1,0,6,sθәqәlxenәm ts'exwts'áxwi7 (Rainbow)
1,2025-05-16 06:00:00,8.6,1,0,6,Harbour Green Park
2,2025-05-16 06:00:00,8.6,1,0,6,Guelph Park
3,2025-05-16 06:00:00,8.6,1,0,6,Grandview Park
4,2025-05-16 06:00:00,8.6,1,0,6,General Brock Park


Join with Park attributes

In [None]:
# Read the park dimension, keeping only the name plus the two keys needed downstream
park_attribute_columns = ["ParkName", "ParkKey", "PlaceID"]
dim_park_attributes = spark.read.table("dim_park_attributes").select(*park_attribute_columns).toPandas()

# Left join on ParkName so every forecast row is kept and gains ParkKey / PlaceID
forecast_df = pd.merge(forecast_df, dim_park_attributes, how="left", on="ParkName")

StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 10, Finished, Available, Finished)

Join with dim_date

In [None]:
# Calendar-date join key derived from each forecast timestamp
forecast_df['Date'] = forecast_df['Datetime'].dt.date

# Read the date dimension with the calendar flags used as model features
dim_date = spark.read.table("dim_date").select("Date", "DayOfWeek", "IsWeekend", "IsHoliday").toPandas()

# Convert the dimension's Date to plain date objects so both join keys have the same type
dim_date = dim_date.assign(Date=pd.to_datetime(dim_date['Date']).dt.date)

# Left join keeps every forecast row
forecast_df = pd.merge(forecast_df, dim_date, how="left", on="Date")

StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 11, Finished, Available, Finished)

Join with dim_events

In [None]:
# Flag forecast rows that overlap an event at the same place (dim_events)
dim_events = (
    spark.read.table("dim_events")
    .select("EventKey", "PlaceID", "StartTime", "EndTime")
    .toPandas()
)

dim_events['StartTime'] = pd.to_datetime(dim_events['StartTime'])
dim_events['EndTime'] = pd.to_datetime(dim_events['EndTime'])

# Step 1: Inner join on PlaceID, then keep pairs whose Datetime lies inside the
# event's Start-End window (inclusive). An inner join is sufficient because rows
# without any event could never pass the window filter anyway.
matched = forecast_df.merge(dim_events, on="PlaceID", how="inner")
matched = matched[
    (matched['Datetime'] >= matched['StartTime']) &
    (matched['Datetime'] <= matched['EndTime'])
]

# Step 2: One row per (Datetime, ParkName, PlaceID) that had at least one event
matched_set = matched[['Datetime', 'ParkName', 'PlaceID']].drop_duplicates()
matched_set['HasEvent'] = 1

# Step 3: Merge HasEvent back into forecast_df (default = 0 if no match).
# Any HasEvent column left over from a previous run of this cell is dropped first,
# otherwise the merge would produce HasEvent_x / HasEvent_y and the fillna below fails.
forecast_df = forecast_df.drop(columns='HasEvent', errors='ignore').merge(
    matched_set,
    on=['Datetime', 'ParkName', 'PlaceID'],
    how='left'
)

forecast_df['HasEvent'] = forecast_df['HasEvent'].fillna(0).astype(int)

StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 12, Finished, Available, Finished)

#### **Feature Engineering Forecast-Ready Dataset**

In [None]:
# Sanity check: column dtypes and non-null counts of the assembled forecast frame
forecast_df.info()

StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 13, Finished, Available, Finished)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7378 entries, 0 to 7377
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Datetime   7378 non-null   datetime64[us]
 1   Temp       7378 non-null   float64       
 2   IsRaining  7378 non-null   int32         
 3   IsSnowing  7378 non-null   int32         
 4   Hour       7378 non-null   int32         
 5   ParkName   7378 non-null   object        
 6   ParkKey    7378 non-null   int64         
 7   PlaceID    7378 non-null   object        
 8   Date       7378 non-null   object        
 9   DayOfWeek  7378 non-null   int64         
 10  IsWeekend  7378 non-null   int64         
 11  IsHoliday  7378 non-null   int32         
 12  HasEvent   7378 non-null   int64         
dtypes: datetime64[us](1), float64(1), int32(4), int64(4), object(3)
memory usage: 634.2+ KB


In [None]:
# Last timestamp covered by the forecast frame
forecast_df['Datetime'].max()

StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 14, Finished, Available, Finished)

Timestamp('2025-05-22 22:00:00')

In [None]:
import joblib
from datetime import timedelta

# Set forecast start and 7-day window.
# The guard keeps this cell re-runnable: after the first run 'Datetime' is already
# the index and an unconditional set_index('Datetime') would raise KeyError.
if 'Datetime' in forecast_df.columns:
    forecast_df = forecast_df.set_index('Datetime')
forecast_start = forecast_df.index.min().normalize()
start_window, end_window = forecast_start - timedelta(days=7), forecast_start

# All historical rows (every park) inside [start_window, end_window)
in_window = (
    (imputed_fact_live_times.index >= start_window) &
    (imputed_fact_live_times.index < end_window)
)
window_df = imputed_fact_live_times.loc[in_window]

# Expected rows per park: distinct hours x distinct days observed in the window
expected_rows = window_df.index.hour.nunique() * window_df.index.normalize().nunique()

# Build park_last_known_dict: park name -> {'model': ..., 'last_known': ...}
park_last_known_dict = {}
# Human-readable notes about parks whose model or history looks unusual.
# NOTE: this name would shadow the stdlib `warnings` module if it were imported.
warnings = []

for park_name in trained_parks:
    # Reconstruct the model file name from the park name and load the model
    filename = park_name.replace(" ", "_").replace("/", "_") + ".joblib"
    model_path = os.path.join(model_dir, filename)

    try:
        model = joblib.load(model_path)
    except Exception as e:
        # Best effort: record the failure and carry on with the remaining parks
        warnings.append(f"{park_name}: failed to load model ({e})")
        continue

    # Last 7 days of history for this park, in chronological order
    last_7day = window_df[window_df['ParkName'] == park_name].sort_index()

    # Store in dictionary
    park_last_known_dict[park_name] = {
        'model': model,
        'last_known': last_7day,
    }

    # Check row completeness against the window-wide expectation
    n = len(last_7day)
    if n != expected_rows:
        note = "no data" if n == 0 else f"{abs(n - expected_rows)} {'missing' if n < expected_rows else 'extra'} rows"
        warnings.append(f"{park_name}: {n} rows (expected {expected_rows}) → {note}")

# Summary output
print(f"\nStored {len(park_last_known_dict)} parks | Expected rows: {expected_rows}")
if warnings:
    print("\n Unusual parks:\n - " + "\n - ".join(warnings))
else:
    print("All parks have complete 7-day historical data.")


StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 15, Finished, Available, Finished)


Stored 62 parks | Expected rows: 119

 Unusual parks:
 - Clinton Park: 118 rows (expected 119) → 1 missing rows
 - Hastings Park - Italian Garden: 0 rows (expected 119) → no data


In [None]:
import numpy as np
from math import pi

target_col = "PopularTimesLivePercent"
park_col = "ParkName"

merged_forecast_dict = {}

for park, park_entry in park_last_known_dict.items():
    # Last 7 days of history (copied so the cached frame is left untouched)
    last_known = park_entry['last_known'].copy()

    # This park's forecast-horizon rows, with a datetime index and an empty target
    future_input = forecast_df.loc[forecast_df[park_col] == park].copy()
    future_input.index = pd.to_datetime(future_input.index)
    future_input[target_col] = np.nan

    # History followed by future, in chronological order
    combined = pd.concat([last_known, future_input]).sort_index()
    combined.index = pd.to_datetime(combined.index)

    # Rows with a missing target are the ones still to be forecast
    forecast_mask = combined[target_col].isna()
    forecast_index = combined.loc[forecast_mask].index

    # Time features, filled in on forecast rows only.
    # HourOp is the hour position within the 17-hour operating day (06:00 -> 0).
    combined.loc[forecast_mask, 'Hour'] = forecast_index.hour
    combined.loc[forecast_mask, 'HourOp'] = (combined.loc[forecast_mask, 'Hour'] - 6) % 17
    combined.loc[forecast_mask, 'DayOfWeek'] = forecast_index.dayofweek
    hour_angle = 2 * pi * combined.loc[forecast_mask, 'HourOp'] / 17
    combined.loc[forecast_mask, 'HourSin'] = np.sin(hour_angle)
    combined.loc[forecast_mask, 'HourCos'] = np.cos(hour_angle)

    # Row-based lag features of the target
    for lag in (1, 2, 3, 17):
        combined[f'Lag_{lag}'] = combined[target_col].shift(lag)

    # 6AM forecast rows do not use the shifted lags; every lag column there is
    # replaced by the historical 6AM mean for the same day of week.
    historical_data = combined[~forecast_mask].copy()
    historical_data['DayOfWeek'] = historical_data.index.dayofweek

    history_at_6am = historical_data[historical_data.index.hour == 6]
    avg_6am_by_dow = history_at_6am.groupby('DayOfWeek')[target_col].mean()

    forecast_6am_mask = forecast_mask & (combined.index.hour == 6)
    index_6am = combined.loc[forecast_6am_mask].index
    dow_values = index_6am.dayofweek.to_series(index=index_6am)
    lag_fill_6am = dow_values.map(avg_6am_by_dow)

    for lag in (1, 2, 3, 17):
        combined.loc[forecast_6am_mask, f'Lag_{lag}'] = lag_fill_6am

    # Store result
    merged_forecast_dict[park] = combined
    print(f"{park}: {len(combined)} total rows")

print(f"\n Ready for prediction: {len(merged_forecast_dict)} parks")


StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, 16, Finished, Available, Finished)

Andy Livingstone Park: 238 total rows
Arbutus Greenway Park: 238 total rows
Beaconsfield Park: 238 total rows
Bobolink Park: 238 total rows
CRAB Park at Portside: 238 total rows
Chaldecott Park: 238 total rows
Champlain Heights Park: 238 total rows
Charleson Park: 238 total rows
China Creek North Park: 238 total rows
China Creek South Park: 238 total rows
Clark Park: 238 total rows
Clinton Park: 237 total rows
Coal Harbour Park: 238 total rows
Columbia Park: 238 total rows
Connaught Park: 238 total rows
Coopers' Park: 238 total rows
Creekside Park: 238 total rows
David Lam Park: 238 total rows
Emery Barnes Park: 238 total rows
Empire Fields - Hastings Park: 238 total rows
English Bay Beach Park: 238 total rows
Everett Crowley Park: 238 total rows
Falaise Park: 238 total rows
Fraser River Park: 238 total rows
General Brock Park: 238 total rows
Grandview Park: 238 total rows
Guelph Park: 238 total rows
Harbour Green Park: 238 total rows
Hastings Park - Italian Garden: 119 total rows
Hill

----------------------------

In [None]:
# Prediction Pipeline - GitHub PlotURL 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
import seaborn as sns
import os
import base64
import requests
from datetime import datetime

def _push_plot_to_github(file_path, upload_path, park_name, token,
                         owner="Langara-DataHub", repo="Van-City-Project", branch="develop"):
    """Create or overwrite one file in the GitHub repo via the contents API.

    Best effort: HTTP and network failures are printed, not raised, so one failed
    upload does not abort the remaining parks.

    Parameters
    ----------
    file_path : str
        Local path of the PNG to upload.
    upload_path : str
        Destination path inside the repository.
    park_name : str
        Used in the commit message only.
    token : str
        GitHub token sent as a Bearer credential.
    owner, repo, branch : str
        Target repository coordinates.

    Returns
    -------
    bool
        True when GitHub answered 200/201, False otherwise.
    """
    # Read and base64-encode the image as required by the contents API
    with open(file_path, "rb") as f:
        content = base64.b64encode(f.read()).decode("utf-8")

    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{upload_path}"
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.github+json"
    }

    try:
        # An existing file can only be overwritten when its current SHA is supplied
        sha = None
        check = requests.get(f"{api_url}?ref={branch}", headers=headers, timeout=30)
        if check.status_code == 200:
            sha = check.json().get("sha")

        data = {
            "message": f"Add forecast plot for {park_name} - {datetime.now().strftime('%Y-%m-%d')}",
            "content": content,
            "branch": branch
        }
        if sha:
            data["sha"] = sha  # Required for overwrite

        res = requests.put(api_url, headers=headers, json=data, timeout=30)
    except requests.RequestException as exc:
        print(f"❌ Failed to push: {exc}")
        return False

    if res.status_code in [200, 201]:
        print(f"✅ Successfully pushed or updated: {upload_path}")
        return True

    # res.text instead of res.json(): error bodies are not guaranteed to be JSON
    print(f"❌ Failed to push: {res.status_code}, {res.text}")
    return False


def create_forecast(merged_forecast_dict, park_last_known_dict, save_to_lakehouse=True):
    """
    Forecast every park recursively, plot history + forecast on the 17-hour
    operational grid (6am-10pm), and print a per-park summary.

    Parameters
    ----------
    merged_forecast_dict : dict
        Park name -> DataFrame indexed by datetime holding the feature columns and
        'PopularTimesLivePercent'; rows with a NaN target are the ones to forecast.
    park_last_known_dict : dict
        Park name -> dict with at least a 'model' entry exposing ``predict``.
        An optional ``['metrics']['R2']`` scales the confidence score.
    save_to_lakehouse : bool, default True
        When True each plot is written to
        ``/lakehouse/default/Files/forecast/<YYYYMMDD>/`` and, if the
        ``GITHUB_TOKEN`` environment variable is set, pushed to GitHub.
        When False nothing is written or uploaded.

    Returns
    -------
    dict
        Park name -> {'data': DataFrame with predictions filled in,
        'future_mask': boolean Series marking forecast rows,
        'confidence_metrics': dict (present when the park has forecast rows)}.
    """
    # One timestamp for the whole run so the save directory and the upload path
    # cannot disagree if the run crosses midnight.
    today_str = datetime.now().strftime("%Y%m%d")

    # AUTO-SAVE SETUP
    save_dir = None
    github_token = None
    if save_to_lakehouse:
        save_dir = f"/lakehouse/default/Files/forecast/{today_str}"
        os.makedirs(save_dir, exist_ok=True)
        print(f"Auto-save enabled: {save_dir}")

        # SECURITY: the token must never be hard-coded in the notebook; it is read
        # from the environment (fine-grained token with repo write access).
        github_token = os.environ.get("GITHUB_TOKEN")
        if not github_token:
            print("GITHUB_TOKEN is not set - plots are saved to the lakehouse but not pushed to GitHub")

    # Define the 14 features the model expects
    feature_cols = [
        'Temp', 'IsRaining', 'IsSnowing', 'IsHoliday', 'IsWeekend', 'HasEvent',
        'HourOp', 'DayOfWeek', 'HourSin', 'HourCos',
        'Lag_1', 'Lag_2', 'Lag_3', 'Lag_17',
    ]
    lag_steps = (1, 2, 3, 17)

    results = {}

    # STEP 1: Generate Predictions
    print("Generating predictions...")

    for park_name in merged_forecast_dict.keys():
        df = merged_forecast_dict[park_name].copy()
        model = park_last_known_dict[park_name]['model']

        # Captured before predictions are written so it keeps marking forecast rows
        future_mask = df['PopularTimesLivePercent'].isna()
        future_indices = df[future_mask].index.sort_values()
        n_future = len(future_indices)

        # Make predictions iteratively (each prediction feeds later lag features)
        for i, idx in enumerate(future_indices):
            X = df.loc[[idx], feature_cols]
            prediction = np.clip(model.predict(X)[0], 0, 100)
            df.loc[idx, 'PopularTimesLivePercent'] = prediction

            # Update lag features: the row `lag` steps ahead sees this prediction
            # as its Lag_<lag>. Same row offset as the shift(lag) used to build
            # the lag columns.
            for lag in lag_steps:
                if i + lag < n_future:
                    df.loc[future_indices[i + lag], f'Lag_{lag}'] = prediction

        results[park_name] = {'data': df, 'future_mask': future_mask}

    # STEP 2: Create Clean Visualization for 17-Hour Operations
    print(" Creating 17-hour operational visualization...")

    # Set up modern, clean style
    plt.style.use('default')

    # Modern color palette
    colors = {
        'historical': '#0279b1',    # Deep blue
        'forecast': '#4b8516',      # Green
        'transition': '#444444',    # Gray
        'background': '#FAFAFA',    # Light gray
        'grid': '#E0E0E0'          # Subtle gray
    }

    # CREATE INDIVIDUAL PLOTS FOR EACH PARK
    for park_name, result in results.items():
        # Create individual figure for each park
        fig, ax = plt.subplots(1, 1, figsize=(18, 6))

        df = result['data']
        future_mask = result['future_mask']

        # Split data
        historical = df[~future_mask]
        predicted = df[future_mask]

        # Clean background
        ax.set_facecolor(colors['background'])
        fig.patch.set_facecolor('white')

        # Plot historical data
        if len(historical) > 0:
            # Main historical line
            ax.plot(historical.index, historical['PopularTimesLivePercent'],
                    color=colors['historical'], linewidth=2.5,
                    label='Historical Data', alpha=0.9, zorder=3)

            # Add hourly markers for historical data
            ax.scatter(historical.index, historical['PopularTimesLivePercent'],
                       color=colors['historical'], s=12, alpha=0.6, zorder=4)

        # Plot forecast with simplified confidence analysis
        if len(predicted) > 0:
            # Main forecast line
            ax.plot(predicted.index, predicted['PopularTimesLivePercent'],
                    color=colors['forecast'], linewidth=2.5,
                    label='7-Day Forecast', alpha=0.9, zorder=3, linestyle='--')

            # Add hourly markers for forecast
            ax.scatter(predicted.index, predicted['PopularTimesLivePercent'],
                       color=colors['forecast'], s=12, alpha=0.6,
                       marker='s', zorder=4)

            # Simplified confidence analysis
            pred_values = predicted['PopularTimesLivePercent'].values

            # Overall confidence is a heuristic blend of:
            # 1. Prediction stability (low variance = high confidence)
            # 2. Similarity to historical patterns
            # 3. Trend smoothness

            # Method 1: Rolling standard deviation for temporal consistency
            rolling_window = max(1, min(12, len(pred_values) // 3))
            rolling_std = (pd.Series(pred_values).rolling(window=rolling_window, center=True, min_periods=1).std())
            rolling_std = rolling_std.fillna(rolling_std.mean())

            # Method 2: Distance from historical mean for stability
            if len(historical) > 0:
                hist_mean = historical['PopularTimesLivePercent'].mean()
                deviation_from_hist = np.abs(pred_values - hist_mean)
                stability_factor = 1 - np.minimum(deviation_from_hist / 50, 1)
            else:
                stability_factor = np.ones(len(pred_values)) * 0.7

            # Method 3: Trend smoothness (first step reuses the first difference)
            if len(pred_values) > 1:
                pred_diff = np.abs(np.diff(pred_values))
                smoothness_factor = 1 - np.minimum(pred_diff / 20, 1)
                smoothness_factor = np.concatenate([[smoothness_factor[0]], smoothness_factor])
            else:
                smoothness_factor = np.ones(len(pred_values))

            # Combined overall confidence (0-100%)
            variance_conf = 100 * (1 - np.minimum(rolling_std / 15, 1))
            stability_conf = 100 * stability_factor
            smoothness_conf = 100 * smoothness_factor
            overall_conf = 0.4 * variance_conf + 0.4 * stability_conf + 0.2 * smoothness_conf
            avg_confidence = np.mean(overall_conf)

            # R² based adjustment.
            # NOTE: when an entry has no 'metrics'/'R2', the default of 0 falls in
            # the "< 0.20" bucket, so the 0.6 multiplier is applied.
            r2_score = park_last_known_dict.get(park_name, {}).get('metrics', {}).get('R2', 0)
            if r2_score < 0.0:
                avg_confidence *= 0.4
            elif r2_score < 0.20:
                avg_confidence *= 0.6
            elif r2_score < 0.40:
                avg_confidence *= 0.8

            # Single confidence interval based on overall confidence
            confidence_factor = avg_confidence / 100
            adaptive_std = rolling_std * (2.5 - 1.5 * confidence_factor)  # Higher confidence = narrower bands

            upper_band = pred_values + adaptive_std
            lower_band = np.maximum(pred_values - adaptive_std, 0)

            # Single confidence band
            ax.fill_between(predicted.index, lower_band, upper_band,
                            color=colors['forecast'], alpha=0.2,
                            label=f'Confidence Interval ({avg_confidence:.0f}%)', zorder=1)

            # Store simplified confidence metrics
            result['confidence_metrics'] = {
                'avg_confidence': avg_confidence,
                'min_confidence': np.min(overall_conf),
                'max_confidence': np.max(overall_conf),
                'confidence_values': overall_conf
            }

        # Connect historical to forecast
        if len(historical) > 0 and len(predicted) > 0:
            transition_x = [historical.index[-1], predicted.index[0]]
            transition_y = [historical['PopularTimesLivePercent'].iloc[-1],
                            predicted['PopularTimesLivePercent'].iloc[0]]

            ax.plot(transition_x, transition_y,
                    color=colors['transition'], linewidth=2.5,
                    alpha=0.8, zorder=3)

        # OPERATIONAL HOURS X-AXIS FORMATTING (6am-10pm only)
        data_start = df.index.min()
        data_end = df.index.max()

        # Create custom tick locations for operational hours
        major_times = []
        all_times = []

        current_date = data_start.normalize()
        end_date = data_end.normalize() + pd.Timedelta(days=1)

        while current_date <= end_date:
            for hour in range(6, 23):  # 6am to 10pm (17 hours)
                timestamp = current_date.replace(hour=hour)
                if data_start <= timestamp <= data_end:
                    all_times.append(timestamp)
                    # Major ticks every 4 hours during operations (6am, 10am, 2pm, 6pm, 10pm)
                    if hour in [6, 10, 14, 18, 22]:
                        major_times.append(timestamp)
            current_date += pd.Timedelta(days=1)

        # Set custom ticks
        ax.set_xticks(major_times)
        ax.set_xticks(all_times, minor=True)
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))

        # Rotate labels for better readability
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right', fontsize=10, color='black')

        # Add secondary axis for operational days with day names
        ax2 = ax.twiny()
        ax2.set_xlim(ax.get_xlim())

        # Position day labels at operational midpoint (1pm)
        day_positions = []
        day_labels = []
        current_date = data_start.normalize()

        while current_date <= data_end.normalize():
            midpoint = current_date.replace(hour=13)  # 1pm - middle of operational day
            if data_start <= midpoint <= data_end:
                day_positions.append(midpoint)
                day_labels.append(midpoint.strftime('%a, %b %d'))  # e.g. Fri, May 16
            current_date += pd.Timedelta(days=1)

        ax2.set_xticks(day_positions)
        ax2.set_xticklabels(day_labels)
        ax2.tick_params(axis='x', labelsize=11, colors='black', length=4, pad=5)
        ax2.spines['top'].set_visible(False)
        ax2.spines['bottom'].set_visible(False)
        ax2.spines['left'].set_visible(False)
        ax2.spines['right'].set_visible(False)

        # Add operational day markers
        current_date = data_start.normalize()
        while current_date <= data_end.normalize():
            # Start of operational day (6am) - Green line
            day_start = current_date.replace(hour=6)
            if data_start <= day_start <= data_end:
                ax.axvline(x=day_start, color='#4CAF50', linestyle='-', alpha=0.6, linewidth=1.5, zorder=0)

            # End of operational day (10pm) - Orange line
            day_end = current_date.replace(hour=22)
            if data_start <= day_end <= data_end:
                ax.axvline(x=day_end, color='#FF9800', linestyle='-', alpha=0.6, linewidth=1.5, zorder=0)

            # Midday marker (1pm) - Subtle dashed line
            midday = current_date.replace(hour=13)
            if data_start <= midday <= data_end:
                ax.axvline(x=midday, color='#E0E0E0', linestyle='--', alpha=0.4, linewidth=0.8, zorder=0)

            current_date += pd.Timedelta(days=1)

        # Highlight weekends with subtle background shading
        current_date = data_start.normalize()
        while current_date <= data_end.normalize():
            if current_date.weekday() >= 5:  # Saturday = 5, Sunday = 6
                weekend_start = max(current_date.replace(hour=6), data_start)
                weekend_end = min(current_date.replace(hour=22), data_end)
                if weekend_start <= data_end and weekend_end >= data_start:
                    ax.axvspan(weekend_start, weekend_end, alpha=0.05, color='#2196F3', zorder=0)
            current_date += pd.Timedelta(days=1)

        # Title and labels
        ax.set_title(f'{park_name} - Operational Hours Forecast (6AM-10PM)',
                     fontsize=18, fontweight='400', color='#2C3E50', pad=20)

        ax.set_ylabel('Occupancy (%)',
                      fontsize=12, color='black', fontweight='400')

        # Clean axis styling
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_color('#BDC3C7')
        ax.spines['bottom'].set_color('#BDC3C7')

        # Enhanced grid for operational hours visibility
        ax.grid(True, which='major', alpha=0.4, color=colors['grid'], linewidth=0.8)
        ax.grid(True, which='minor', alpha=0.2, color=colors['grid'], linewidth=0.5)
        ax.set_axisbelow(True)

        # Elegant tick styling
        ax.tick_params(axis='both', which='major',
                       labelsize=10, colors='black',
                       length=5, width=1)
        ax.tick_params(axis='both', which='minor',
                       length=3, width=0.5, colors='black')

        # Set reasonable y-limits
        all_values = df['PopularTimesLivePercent'].dropna()
        if len(all_values) > 0:
            y_min = max(0, all_values.min() - 5)
            y_max = min(100, all_values.max() + 10)
            ax.set_ylim(y_min, y_max)

        # Modern legend
        ax.legend(loc='upper right', frameon=True,
                  fancybox=True, shadow=False,
                  fontsize=11, facecolor='white',
                  edgecolor='#BDC3C7', framealpha=0.9)

        # Add confidence insights
        if len(predicted) > 0:
            pred_avg = predicted['PopularTimesLivePercent'].mean()

            # Find peak hour within operational hours
            pred_hourly = predicted.groupby(predicted.index.hour)['PopularTimesLivePercent'].mean()
            peak_hour = pred_hourly.idxmax()

            # Get confidence metrics
            conf_metrics = result.get('confidence_metrics', {})
            avg_conf = conf_metrics.get('avg_confidence', 0)

            # Main info box with confidence
            info_text = f'Operational Avg: {pred_avg:.0f}% | Peak: {peak_hour}:00\nAvg Confidence: {avg_conf:.0f}%'
            ax.text(0.02, 0.98, info_text,
                    transform=ax.transAxes,
                    bbox=dict(boxstyle="round,pad=0.5",
                              facecolor='white',
                              edgecolor='#BDC3C7',
                              alpha=0.9),
                    fontsize=10, color='black',
                    verticalalignment='top')

            # Confidence breakdown box
            min_conf = conf_metrics.get('min_confidence', 0)
            max_conf = conf_metrics.get('max_confidence', 0)
            conf_text = f'Confidence Range: {min_conf:.0f}% - {max_conf:.0f}%'

            ax.text(0.98, 0.98, conf_text,
                    transform=ax.transAxes,
                    bbox=dict(boxstyle="round,pad=0.3",
                              facecolor='#F0F8FF',
                              edgecolor='#87CEEB',
                              alpha=0.9),
                    fontsize=9, color='black',
                    verticalalignment='top', horizontalalignment='right')

            # Operational hours detail annotation
            hist_hours = len(historical)
            pred_hours = len(predicted)

            detail_text = f'Operating Hours: 6AM-10PM (17h) | {hist_hours}h Historical | {pred_hours}h Forecast'
            ax.text(0.02, 0.02, detail_text,
                    transform=ax.transAxes,
                    bbox=dict(boxstyle="round,pad=0.3",
                              facecolor='#F8F9FA',
                              edgecolor='#DEE2E6',
                              alpha=0.9),
                    fontsize=9, color='black',
                    verticalalignment='bottom')

        # Final layout for individual plot
        plt.tight_layout(pad=2.0)

        # AUTO-SAVE INDIVIDUAL PLOT (and push it to GitHub when a token is configured).
        # The upload reads the saved file, so it only runs after a successful save.
        if save_to_lakehouse:
            safe_name = park_name.replace("/", "-").replace(" ", "_").replace(",", "") + ".png"
            fig_path = os.path.join(save_dir, safe_name)
            plt.savefig(fig_path, dpi=300, bbox_inches='tight', facecolor='white')

            if github_token:
                upload_path = f"forecast_plots/{today_str}/{safe_name}"
                _push_plot_to_github(fig_path, upload_path, park_name, github_token)

        # Close to free memory
        plt.close(fig)

    # AUTO-SAVE COMPLETION MESSAGE
    if save_to_lakehouse:
        print("Save Completed")

    # STEP 3: Simplified Summary with Overall Confidence
    print("\n OPERATIONAL HOURS FORECAST SUMMARY")
    print("─" * 60)

    for park_name, result in results.items():
        df = result['data']
        future_mask = result['future_mask']

        pred_data = df[future_mask]['PopularTimesLivePercent']

        if len(pred_data) > 0:
            # Mean predicted occupancy per hour of day across the forecast horizon
            pred_hourly = df[future_mask].groupby(df[future_mask].index.hour)['PopularTimesLivePercent'].mean()
            peak_hour = pred_hourly.idxmax()
            peak_value = pred_hourly.max()
            low_hour = pred_hourly.idxmin()
            low_value = pred_hourly.min()

            # Get simplified confidence metrics
            conf_metrics = result.get('confidence_metrics', {})
            avg_conf = conf_metrics.get('avg_confidence', 0)

            print(f"\n {park_name} (6AM-10PM Operations)")
            print(f"   • Average occupancy: {pred_data.mean():.0f}%")
            print(f"   • Peak hour: {peak_hour}:00 ({peak_value:.0f}%)")
            print(f"   • Quiet hour: {low_hour}:00 ({low_value:.0f}%)")
            print(f"   • Total forecast hours: {len(pred_data)}")
            print(f"   • Overall confidence: {avg_conf:.0f}%")

            # Simplified confidence interpretation
            if avg_conf >= 75:
                conf_level = " High - Reliable for planning"
            elif avg_conf >= 60:
                conf_level = " Medium - Generally reliable"
            elif avg_conf >= 45:
                conf_level = " Moderate - Use with caution"
            else:
                conf_level = " Low - High uncertainty"

            print(f"   • Confidence level: {conf_level}")

    print("\n Confidence is based on prediction consistency, historical similarity, and trend smoothness.")

    return results

def forecast_summary(forecast_results, park_attributes_df):
    """
    Build a one-row-per-park summary of the forecast and attach each park's key.

    The plot link in ``PlotURL`` points at the GitHub raw-content location
    (dated folder + sanitised park name) rather than OneLake.

    Parameters
    ----------
    forecast_results : dict
        Maps park name -> result dict. Each result must provide:
          * ``'data'``: DataFrame with a ``'PopularTimesLivePercent'`` column
            and a datetime-like index (``.hour`` and ``.strftime`` are used).
          * ``'future_mask'``: boolean row selector; selected rows are the
            forecast, the complement is treated as history.
        ``'confidence_metrics'['avg_confidence']`` is optional and defaults to 0.
    park_attributes_df : pandas.DataFrame
        Must contain ``'ParkName'`` and ``'ParkKey'``; joined on ``'ParkName'``.

    Returns
    -------
    pandas.DataFrame
        Summary columns followed by ``'ParkKey'`` (left join, so parks missing
        from ``park_attributes_df`` keep a null key). When no park produces a
        summary row, an empty frame with the same columns is returned.

    Notes
    -----
    A park whose processing raises is reported via ``print`` and skipped so
    that one bad park does not discard the summaries of the others.
    """
    summary_columns = [
        'ParkName', 'ForecastDate', 'ForecastStartDate', 'ForecastEndDate',
        'AvgOccupancy', 'PeakHour', 'PeakOccupancy', 'LowHour', 'LowOccupancy',
        'TotalForecastHours', 'HistoricalAvg', 'ConfidenceScore',
        'ConfidenceCategory', 'PlotFilename', 'ProcessedDateTime', 'PlotURL',
    ]

    # Take a single clock reading so ForecastDate, ProcessedDateTime and the
    # dated PlotURL folder can never disagree (e.g. when a run crosses midnight).
    now = datetime.now()
    today_str = now.strftime("%Y%m%d")
    forecast_date = now.strftime('%Y-%m-%d')
    processed_time = now.strftime('%Y-%m-%d %H:%M:%S')

    rows = []
    for name, res in forecast_results.items():
        try:
            df = res['data']
            future = res['future_mask']
            hist = df[~future]['PopularTimesLivePercent']
            forecast_rows = df[future]  # select once, reuse below
            pred = forecast_rows['PopularTimesLivePercent']
            if pred.empty:
                continue

            # Historical average is taken over a centred 3-point rolling mean.
            hist_avg = hist.rolling(3, center=True, min_periods=1).mean().mean() if not hist.empty else 0
            # Mean forecast per hour-of-day, used for the peak/low hour figures.
            pred_hourly = forecast_rows.groupby(forecast_rows.index.hour)['PopularTimesLivePercent'].mean()
            conf = res.get('confidence_metrics', {}).get('avg_confidence', 0)
            conf_cat = "High" if conf >= 75 else "Medium" if conf >= 60 else "Moderate" if conf >= 45 else "Low"

            # Make the park name usable as a file name / URL path segment.
            safe_filename = name.replace("/", "-").replace(" ", "_").replace(",", "") + ".png"

            rows.append({
                'ParkName': name,
                'ForecastDate': forecast_date,
                'ForecastStartDate': pred.index.min().strftime('%Y-%m-%d'),
                'ForecastEndDate': pred.index.max().strftime('%Y-%m-%d'),
                'AvgOccupancy': round(pred.mean(), 1),
                'PeakHour': int(pred_hourly.idxmax()),
                'PeakOccupancy': round(pred_hourly.max(), 1),
                'LowHour': int(pred_hourly.idxmin()),
                'LowOccupancy': round(pred_hourly.min(), 1),
                'TotalForecastHours': len(pred),
                'HistoricalAvg': round(hist_avg, 1),
                'ConfidenceScore': round(conf, 1),
                'ConfidenceCategory': conf_cat,
                'PlotFilename': safe_filename,
                'ProcessedDateTime': processed_time,
                'PlotURL': f"https://raw.githubusercontent.com/Langara-DataHub/Van-City-Project/develop/forecast_plots/{today_str}/{safe_filename}"
            })

        except Exception as e:
            print(f"Error processing forecast summary for {name}: {e}")

    if not rows:
        # pd.DataFrame([]) has no 'ParkName' column, so the merge below would
        # raise KeyError; hand back an empty frame with the expected schema.
        return pd.DataFrame(columns=summary_columns + ['ParkKey'])

    summary_df = pd.DataFrame(rows, columns=summary_columns)
    final_df = summary_df.merge(park_attributes_df[['ParkName', 'ParkKey']], on='ParkName', how='left')
    return final_df

# Usage: run the forecast, summarise it per park, and persist the summary table.
# create_forecast is defined outside this section; its return value is consumed
# below as a mapping of park name -> result dict (see forecast_summary).
forecast_results = create_forecast(merged_forecast_dict, park_last_known_dict, save_to_lakehouse=True)
# Park dimension table; forecast_summary reads its 'ParkName' and 'ParkKey' columns.
dim_park_attributes = spark.read.table("dim_park_attributes").toPandas()

summary_df = forecast_summary(forecast_results, dim_park_attributes)
try:
    # Show the pandas dtypes and the first row before handing the frame to Spark.
    print(summary_df.dtypes)
    print(summary_df.head(1))
    # mode("overwrite") replaces whatever live_times_forecast_summary held before.
    spark.createDataFrame(summary_df).write.mode("overwrite").format("delta").saveAsTable("live_times_forecast_summary")
    print("Forecast summary table saved.")
except Exception as e:
    # Any failure in the try body (including the preview prints) is reported
    # here and not re-raised, so execution continues past this point.
    print("Error during saveAsTable:")
    print(str(e))

In [None]:
def get_static_predictions(forecast_results):
    """
    Extract all prediction results with features in a simple table format
    """
    
    all_data = []
    
    for park_name, result in forecast_results.items():
        df = result['data'].copy()
        future_mask = result['future_mask']
        
        # Get prediction data only
        predictions = df[future_mask].copy()
        if predictions.empty:
            continue
            
        # Add park name and reset index
        predictions = predictions.reset_index()
        predictions['ParkName'] = park_name
        
        # Get the actual datetime column name (first column after reset_index)
        datetime_col_name = predictions.columns[0]
        
        # Add confidence if available
        confidence = result.get('confidence_metrics', {}).get('avg_confidence', 0)
        predictions['Confidence'] = confidence
        
        all_data.append(predictions)
    
    # Combine all parks
    if not all_data:
        return pd.DataFrame()
        
    static_df = pd.concat(all_data, ignore_index=True)
    
    # Get the actual datetime column name (should be consistent across all parks)
    datetime_col = static_df.columns[0]  # First column is the datetime
    
    # Reorder columns: Park, Time, Target, Features
    cols = ['ParkName', datetime_col, 'PopularTimesLivePercent', 'Confidence']
    feature_cols = ['Temp', 'IsRaining', 'IsSnowing', 'IsHoliday', 'IsWeekend', 'HasEvent',
                   'HourOp', 'DayOfWeek']
    
    # Add existing feature columns
    for col in feature_cols:
        if col in static_df.columns:
            cols.append(col)
    
    static_df = static_df[cols]
    static_df = static_df.sort_values(['ParkName', datetime_col]).reset_index(drop=True)
    
    print(f"Extracted {len(static_df):,} predictions for {static_df['ParkName'].nunique()} parks")
    return static_df

# USAGE

# Extract static predictions
static_predictions = get_static_predictions(forecast_results)

if static_predictions.empty:
    # get_static_predictions returns a column-less frame when no park has
    # forecast rows: columns[1] would raise IndexError and Spark has nothing to
    # infer a schema from, so report it and skip the preview and the write.
    print("\nNo forecast rows were extracted; skipping preview and Delta save.")
else:
    # Preview
    print(f"\nShape: {static_predictions.shape}")
    print(f"Columns: {list(static_predictions.columns)}")
    datetime_col = static_predictions.columns[1]  # Second column after ParkName
    print(f"Date range: {static_predictions[datetime_col].min()} to {static_predictions[datetime_col].max()}")
    print("\nSample data:")
    print(static_predictions.head(10))

    # Save to Delta table. The table name carries a minute-resolution timestamp,
    # so each run (in a different minute) writes its own details table.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    table_name = f"live_times_forecast_details_{timestamp.replace('_', '')}"
    spark.createDataFrame(static_predictions).write.mode("overwrite").format("delta").saveAsTable(table_name)
    print(f"\n Saved to Delta table: {table_name}")

StatementMeta(, da73cb9d-a63b-4402-8289-ae75c02491ac, -1, Cancelled, , Cancelled)