# A. Setup

## Prophet Stock Model Training Notebook (Jupyter-ready)

In [120]:
import yfinance as yf
import pandas as pd
import joblib
import os
from prophet import Prophet
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from datetime import datetime, timedelta

## Configuration

In [52]:
OUTPUT_DATA_PATH = "data"
OUTPUT_MODEL_PATH = "models"
REPORT_PATH = "reports"
SYMBOL = "BTC-USD"
START_DATE = "2015-01-01"


os.makedirs(OUTPUT_DATA_PATH, exist_ok=True)
os.makedirs(OUTPUT_MODEL_PATH, exist_ok=True)
os.makedirs(REPORT_PATH, exist_ok=True)


# B. Modeling Functions

## 1. Download Historical Data

In [130]:
start_date = START_DATE
end_date = datetime.today().strftime("%Y-%m-%d")

print(f"Downloading {SYMBOL}...")
try:
    raw_btc_df = yf.download(SYMBOL, start=start_date, end=end_date, progress=False)
    if raw_btc_df.empty:
        raise ValueError(f"No data returned for {SYMBOL}. Check the symbol or date range.")

        # Flatten any MultiIndex columns (yfinance sometimes gives you two‐level headers)
    if isinstance(raw_btc_df.columns, pd.MultiIndex):
        raw_btc_df.columns = raw_btc_df.columns.droplevel(1)
        raw_btc_df = raw_btc_df.reset_index()
        raw_btc_df.columns.name = None

    raw_btc_df.to_csv(f"{OUTPUT_DATA_PATH}/{SYMBOL}_raw.csv", index=False)
    print(f"Successfully downloaded and saved data for {SYMBOL}. Rows: {len(raw_btc_df)}")

    print(f"\n{raw_btc_df.head()}")

except Exception as e:
    print(f"Error downloading data for {SYMBOL}: {e}")
    raise





Downloading BTC-USD...
Successfully downloaded and saved data for BTC-USD. Rows: 3764

        Date       Close        High         Low        Open    Volume
0 2015-01-01  314.248993  320.434998  314.002991  320.434998   8036550
1 2015-01-02  315.032013  315.838989  313.565002  314.079010   7860650
2 2015-01-03  281.082001  315.149994  281.082001  314.846008  33054400
3 2015-01-04  264.195007  287.230011  257.612000  281.145996  55629100
4 2015-01-05  274.473999  278.341003  265.084015  265.084015  43962800


## 2. Cleaning

In [131]:
def clean_stock(raw_df):
    raw_data  = raw_df.copy()

    # Rename for Prophet and convert date dtype
    cleaned_btc_df = raw_data.rename(columns={'Date':'ds','Close':'y'})
    cleaned_btc_df['ds'] = pd.to_datetime(cleaned_btc_df['ds'])

    cleaned_btc_df = (
        raw_df
          .sort_values("Date")
          .loc[:, ["Date", "Close"]]
          .dropna()
          .rename(columns={"Date":"ds","Close":"y"})
    )

    # Reset index cleanly and remove any index/column names
    cleaned_btc_df = cleaned_btc_df.reset_index(drop=True)
    cleaned_btc_df.index.name = None
    cleaned_btc_df.columns.name = None

    print(f"Preprocessed data: {cleaned_btc_df.shape[0]} rows")
    print(cleaned_btc_df.head())

    return cleaned_btc_df

## 3. Process data
To reduce Error

In [132]:
def preprocess_stock(cleaned_btc_df):
    return cleaned_btc_df


## 4. Training

In [133]:
def train_model(training_df):
    if training_df.empty or 'y' not in training_df.columns or 'ds' not in training_df.columns:
        raise ValueError("Input DataFrame is invalid or missing required columns ('ds', 'y')")

    training_df['y'] = np.log(training_df['y']) # log-transform target

    model = Prophet(
        changepoint_prior_scale=0.2,
        seasonality_mode='multiplicative'
    )
    model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    model.add_regressor('Volume')

    model.fit(training_df)


    return model

## 5. Serialize Model

In [56]:
def serialize_model(model, symbol):
    joblib.dump(model, f"{OUTPUT_MODEL_PATH}/{symbol}_prophet.pkl")

## 6. Evaluate Model

In [136]:
def evaluate_model(original_df, model, symbol):
    # split
    train = original_df[:-30]
    test  = original_df[-30:]

    # fit on train
    m = Prophet().fit(train)

    # forecast test-period
    future = m.make_future_dataframe(periods=30)
    forecast = m.predict(future)

    # merge only on the test window
    df_eval = test.merge(forecast[['ds','yhat']], on='ds')
    mape = np.mean(np.abs((df_eval['y'] - df_eval['yhat'])/df_eval['y']))

    with open(f"{REPORT_PATH}/{symbol}_report.md", "w") as f:
        f.write(f"# Prophet Model Report: {symbol}\n")
        f.write(f"- MAPE: {mape:.2f}\n")
        f.write(f"- Data points: {len(original_df)}\n")

    print(f"{symbol} | Error: {mape*100:.1f}%")

# C. Execute


In [137]:
clean_df = clean_stock(raw_btc_df)
processed_df = preprocess_stock(clean_df)
prophet_model = train_model(processed_df)
serialize_model(prophet_model, SYMBOL)
evaluate_model(processed_df, prophet_model, SYMBOL)

print("\n Bitcoin model trained, serialized, and reported.")

Preprocessed data: 3764 rows
          ds           y
0 2015-01-01  314.248993
1 2015-01-02  315.032013
2 2015-01-03  281.082001
3 2015-01-04  264.195007
4 2015-01-05  274.473999


22:06:31 - cmdstanpy - INFO - Chain [1] start processing
22:06:32 - cmdstanpy - INFO - Chain [1] done processing
22:06:32 - cmdstanpy - INFO - Chain [1] start processing
22:06:33 - cmdstanpy - INFO - Chain [1] done processing


BTC-USD | Error: 17.8%

 Bitcoin model trained, serialized, and reported.
