In [3]:
import warnings
import sys
import os

warnings.filterwarnings("ignore")
# Using current working directory
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [None]:
# ! pip install prophet

Collecting prophet
  Downloading prophet-1.1.6-py3-none-win_amd64.whl.metadata (3.6 kB)
Collecting cmdstanpy>=1.0.4 (from prophet)
  Downloading cmdstanpy-1.2.5-py3-none-any.whl.metadata (4.0 kB)
Collecting holidays<1,>=0.25 (from prophet)
  Downloading holidays-0.71-py3-none-any.whl.metadata (34 kB)
Collecting importlib-resources (from prophet)
  Downloading importlib_resources-6.5.2-py3-none-any.whl.metadata (3.9 kB)
Collecting stanio<2.0.0,>=0.4.0 (from cmdstanpy>=1.0.4->prophet)
  Downloading stanio-0.5.1-py3-none-any.whl.metadata (1.6 kB)
Downloading prophet-1.1.6-py3-none-win_amd64.whl (13.3 MB)
   ---------------------------------------- 0.0/13.3 MB ? eta -:--:--
   ------- -------------------------------- 2.6/13.3 MB 21.6 MB/s eta 0:00:01
   --------------------------------- ------ 11.0/13.3 MB 34.4 MB/s eta 0:00:01
   ---------------------------------------- 13.3/13.3 MB 32.2 MB/s eta 0:00:00
Downloading cmdstanpy-1.2.5-py3-none-any.whl (94 kB)
Downloading holidays-0.71-py3-no

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import mean_absolute_error
from prophet import Prophet
from src.config import TRANSFORMED_DATA_DIR
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking
from dotenv import load_dotenv
import mlflow
from mlflow.models.signature import infer_signature

# Load data
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

# Split data
X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Ensure y_train and y_test have a datetime index
if not isinstance(y_train.index, pd.DatetimeIndex):
    y_train.index = pd.date_range(start="2022-01-01", periods=len(y_train), freq="D")
if not isinstance(y_test.index, pd.DatetimeIndex):
    y_test.index = pd.date_range(start=y_train.index[-1] + pd.Timedelta(days=1), 
                                 periods=len(y_test), freq="D")

# Prepare data for Prophet (requires 'ds' and 'y' columns)
train_df = pd.DataFrame({
    'ds': y_train.index,
    'y': y_train.values
})
test_df = pd.DataFrame({
    'ds': y_test.index,
    'y': y_test.values
})

# Train the Prophet model
prophet_model = Prophet(
    yearly_seasonality=True,  # Enable yearly seasonality
    weekly_seasonality=True,  # Enable weekly seasonality
    daily_seasonality=True    # Enable daily seasonality
)
prophet_model.fit(train_df)

# Make future dataframe for predictions
future = prophet_model.make_future_dataframe(periods=len(y_test), freq="D")
forecast = prophet_model.predict(future)

# Extract predictions for the test period
predictions = forecast.tail(len(y_test))['yhat'].values

# Compute Mean Absolute Error (MAE)
test_mae = mean_absolute_error(y_test, predictions)
print(f"Test MAE: {test_mae:.4f}")

# Set up MLflow
load_dotenv()
mlflow = set_mlflow_tracking()

# Custom function to log Prophet model to MLflow
def log_prophet_to_mlflow(model, experiment_name, metric_name, score, test_df):
    with mlflow.start_run():
        # Log hyperparameters
        mlflow.log_param("yearly_seasonality", True)
        mlflow.log_param("weekly_seasonality", True)
        mlflow.log_param("daily_seasonality", True)
        
        # Log metric
        mlflow.log_metric(metric_name, score)
        
        # Use test_df['ds'] as input for signature inference
        input_df = test_df[['ds']]  # Prophet expects a DataFrame with 'ds'
        predictions = model.predict(input_df)['yhat']
        
        # Infer signature
        signature = infer_signature(input_df, predictions)
        
        # Log the model using Prophet flavor
        mlflow.prophet.log_model(model, "model", signature=signature)

# Log the Prophet model
log_prophet_to_mlflow(prophet_model, "Prophet", "mean_absolute_error", test_mae, test_df)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


DEBUG:cmdstanpy:cmd: where.exe tbb.dll
cwd: None
DEBUG:cmdstanpy:Adding TBB (c:\Users\Jaath\github\CDA500_Project#01_Taxi\.venv\Lib\site-packages\prophet\stan_model\cmdstan-2.33.1\stan\lib\stan_math\lib\tbb) to PATH


(55900, 674)
(55900,)
(31720, 674)
(31720,)


DEBUG:cmdstanpy:input tempfile: C:\Users\Jaath\AppData\Local\Temp\tmpvvc1bx6r\vu_c3g3f.json
DEBUG:cmdstanpy:input tempfile: C:\Users\Jaath\AppData\Local\Temp\tmpvvc1bx6r\969j6nv2.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['C:\\Users\\Jaath\\github\\CDA500_Project#01_Taxi\\.venv\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'random', 'seed=67363', 'data', 'file=C:\\Users\\Jaath\\AppData\\Local\\Temp\\tmpvvc1bx6r\\vu_c3g3f.json', 'init=C:\\Users\\Jaath\\AppData\\Local\\Temp\\tmpvvc1bx6r\\969j6nv2.json', 'output', 'file=C:\\Users\\Jaath\\AppData\\Local\\Temp\\tmpvvc1bx6r\\prophet_modelfk6si7d0\\prophet_model-20250424194747.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
19:47:47 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
19:48:03 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:src.experiment_utils:MLflow tra

Test MAE: 33.1503
🏃 View run dapper-shoat-499 at: https://dagshub.com/jaathavan18/new_york_taxi.mlflow/#/experiments/0/runs/da0aad235cfb4551ab047b1d056bd876
🧪 View experiment at: https://dagshub.com/jaathavan18/new_york_taxi.mlflow/#/experiments/0
