# Financial Forecasting Model

## Steps
1. Read rows from Google BigQuery
2. Train forecasting model
    - ARIMA
    - LSTM
3. Generate 30 day forecasts
4. Backtest
5. Insert predicted closing prices into Google BigQuery table
6. Visualize in Looker Studio

In [3]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from datetime import datetime
import os

PROJECT_ID = os.getenv("GCP_PROJECT_ID")
TABLE_ID = "is3107-project-455413.market_data.yf_daily_json"
BIGQUERY_COLUMNS = ["Ticker", "Date", "Open", "High", "Low", "Close", "Volume"]
TRAINING_TIMESTAMP = datetime.now()

print(PROJECT_ID)

is3107-project-455413


In [4]:
from google.cloud import bigquery

client = bigquery.Client(PROJECT_ID)
query = f"""
SELECT Ticker, Date, Open, High, Low, Close, Volume
FROM `{TABLE_ID}`
"""

df = client.query(query).to_dataframe()



In [5]:
print(df.head())
print(df.info())
tickers = df["Ticker"].unique()

  Ticker        Date        Open        High         Low       Close  \
0   AAPL  2023-01-03  128.782641  129.395510  122.742865  123.632523   
1   AAPL  2023-01-04  125.431607  127.181268  123.642412  124.907700   
2   AAPL  2023-01-05  125.668865  126.301508  123.326108  123.583115   
3   AAPL  2023-01-06  124.561702  128.792501  123.454572  128.130203   
4   AAPL  2023-01-09  128.970474  131.876686  128.397138  128.654144   

      Volume  
0  112117500  
1   89113600  
2   80962700  
3   87754700  
4   70790800  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57698 entries, 0 to 57697
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Ticker  57698 non-null  object 
 1   Date    57698 non-null  dbdate 
 2   Open    57698 non-null  float64
 3   High    57698 non-null  float64
 4   Low     57698 non-null  float64
 5   Close   57698 non-null  float64
 6   Volume  57698 non-null  Int64  
dtypes: Int64(1), dbdate(1), float64

### ARIMA Forecasting

#### Model training and Backtest

In [6]:
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
import warnings

warnings.filterwarnings('ignore')

forecast_records = []
best_orders_by_ticker = {}

for ticker in tickers:
    print(f"PROCESSING {ticker}")
    try:
        ticker_df = df[df["Ticker"] == ticker].sort_values("Date")
        ticker_df.set_index("Date", inplace=True)
        series = ticker_df["Close"]

        if len(series) < 20:
            print(f"Not enough data for {ticker}, skipping.")
            continue

        train_size = int(len(series) * 0.8)
        train, test = series[:train_size], series[train_size:]

        best_aic = float("inf")
        best_order = None
        best_model = None

        for p in range(0, 4):
            for d in range(0, 2):
                for q in range(0, 4):
                    try:
                        model = ARIMA(train, order=(p, d, q))
                        model_fit = model.fit()
                        if model_fit.aic < best_aic:
                            best_aic = model_fit.aic
                            best_order = (p, d, q)
                            best_model = model_fit
                    except:
                        continue

        if best_model:
            best_orders_by_ticker[ticker] = best_order
            forecast = best_model.forecast(len(test))
            forecast.index = test.index
            rmse = np.sqrt(mean_squared_error(test, forecast))
            mae = mean_absolute_error(test, forecast)
            print(f"{ticker} - ARIMA{best_order} | RMSE: {rmse:.2f}, MAE: {mae:.2f}")

            for date in test.index:
                forecast_records.append({
                    "date": date,
                    "ticker": ticker,
                    "predicted_close": forecast.loc[date],
                    "actual_close": test.loc[date],
                    "type": "backtest",
                    "rmse": rmse,
                    "mae": mae,
                    "model": f"ARIMA{best_order}",
                    "training_timestamp": TRAINING_TIMESTAMP
                })

    except Exception as e:
        print(f"Error processing {ticker}: {e}")

backtest_forecast_df = pd.DataFrame(forecast_records)
backtest_forecast_df.head()

PROCESSING AAPL
AAPL - ARIMA(0, 1, 2) | RMSE: 17.45, MAE: 13.86
PROCESSING MSFT
MSFT - ARIMA(0, 1, 0) | RMSE: 23.86, MAE: 20.28
PROCESSING NVDA
NVDA - ARIMA(2, 1, 3) | RMSE: 14.64, MAE: 11.72
PROCESSING GOOG
GOOG - ARIMA(0, 1, 0) | RMSE: 16.25, MAE: 13.40
PROCESSING GOOGL
GOOGL - ARIMA(0, 1, 0) | RMSE: 16.37, MAE: 13.53
PROCESSING AMZN
AMZN - ARIMA(2, 1, 2) | RMSE: 30.68, MAE: 26.93
PROCESSING META
META - ARIMA(0, 1, 0) | RMSE: 69.99, MAE: 55.05
PROCESSING BRK-B
BRK-B - ARIMA(2, 1, 0) | RMSE: 41.13, MAE: 32.06
PROCESSING BRK-A
BRK-A - ARIMA(0, 1, 2) | RMSE: 60923.74, MAE: 47230.57
PROCESSING AVGO
AVGO - ARIMA(0, 1, 0) | RMSE: 40.28, MAE: 31.27
PROCESSING TSM
TSM - ARIMA(3, 1, 2) | RMSE: 18.32, MAE: 14.88
PROCESSING LLY
LLY - ARIMA(2, 1, 2) | RMSE: 53.82, MAE: 44.16
PROCESSING TSLA
TSLA - ARIMA(0, 1, 0) | RMSE: 110.29, MAE: 89.40
PROCESSING WMT
WMT - ARIMA(0, 1, 0) | RMSE: 11.57, MAE: 10.11
PROCESSING JPM
JPM - ARIMA(2, 1, 2) | RMSE: 29.28, MAE: 25.57
PROCESSING V
V - ARIMA(0, 1, 0) | R

Unnamed: 0,date,ticker,predicted_close,actual_close,type,rmse,mae,model,training_timestamp
0,2024-11-01,AAPL,Date 2024-11-01 225.123453 2024-11-01 22...,Date 2024-11-01 222.420471 2024-11-01 22...,backtest,17.453411,13.859516,"ARIMA(0, 1, 2)",2025-04-23 17:16:07.184637
1,2024-11-01,AAPL,Date 2024-11-01 225.123453 2024-11-01 22...,Date 2024-11-01 222.420471 2024-11-01 22...,backtest,17.453411,13.859516,"ARIMA(0, 1, 2)",2025-04-23 17:16:07.184637
2,2024-11-04,AAPL,Date 2024-11-04 225.123451 2024-11-04 22...,Date 2024-11-04 221.522446 2024-11-04 22...,backtest,17.453411,13.859516,"ARIMA(0, 1, 2)",2025-04-23 17:16:07.184637
3,2024-11-04,AAPL,Date 2024-11-04 225.123451 2024-11-04 22...,Date 2024-11-04 221.522446 2024-11-04 22...,backtest,17.453411,13.859516,"ARIMA(0, 1, 2)",2025-04-23 17:16:07.184637
4,2024-11-05,AAPL,Date 2024-11-05 225.123451 2024-11-05 22...,Date 2024-11-05 222.95929 2024-11-05 222...,backtest,17.453411,13.859516,"ARIMA(0, 1, 2)",2025-04-23 17:16:07.184637


#### 7-Day forecast

In [7]:
from datetime import timedelta

forecast_records = []
for ticker in tickers:
    print(f"PREDICTING FUTURE FOR {ticker}")
    try:
        if ticker not in best_orders_by_ticker:
            print(f"No ARIMA order found from backtest for {ticker}, skipping.")
            continue

        order = best_orders_by_ticker[ticker]
        ticker_df = df[df["Ticker"] == ticker].sort_values("Date")
        ticker_df.set_index("Date", inplace=True)
        series = ticker_df["Close"]

        model = ARIMA(series, order=order)
        model_fit = model.fit()
        forecast = model_fit.forecast(steps=7)

        last_date = series.index[-1]
        forecast_dates = pd.date_range(start=last_date + timedelta(days=1), periods=7, freq="D")

        for date, pred in zip(forecast_dates, forecast):
            forecast_records.append({
                "date": date,
                "ticker": ticker,
                "predicted_close": pred,
                "actual_close": None,
                "type": "prediction",
                "rmse": None,
                "mae": None,
                "model": f"ARIMA{order}",
                "training_timestamp": TRAINING_TIMESTAMP
            })

    except Exception as e:
        print(f"Error predicting future for {ticker}: {e}")


prediction_forecast_df = pd.DataFrame(forecast_records)
print(prediction_forecast_df.tail())

PREDICTING FUTURE FOR AAPL
PREDICTING FUTURE FOR MSFT
PREDICTING FUTURE FOR NVDA
PREDICTING FUTURE FOR GOOG
PREDICTING FUTURE FOR GOOGL
PREDICTING FUTURE FOR AMZN
PREDICTING FUTURE FOR META
PREDICTING FUTURE FOR BRK-B
PREDICTING FUTURE FOR BRK-A
PREDICTING FUTURE FOR AVGO
PREDICTING FUTURE FOR TSM
PREDICTING FUTURE FOR LLY
PREDICTING FUTURE FOR TSLA
PREDICTING FUTURE FOR WMT
PREDICTING FUTURE FOR JPM
PREDICTING FUTURE FOR V
PREDICTING FUTURE FOR MA
PREDICTING FUTURE FOR XOM
PREDICTING FUTURE FOR NFLX
PREDICTING FUTURE FOR COST
PREDICTING FUTURE FOR AZN
PREDICTING FUTURE FOR PG
PREDICTING FUTURE FOR UNH
PREDICTING FUTURE FOR JNJ
PREDICTING FUTURE FOR ORCL
PREDICTING FUTURE FOR HD
PREDICTING FUTURE FOR KO
PREDICTING FUTURE FOR ABBV
PREDICTING FUTURE FOR SAP
PREDICTING FUTURE FOR TMUS
PREDICTING FUTURE FOR NVO
PREDICTING FUTURE FOR BAC
PREDICTING FUTURE FOR BABA
PREDICTING FUTURE FOR PM
PREDICTING FUTURE FOR ASML
PREDICTING FUTURE FOR MC.PA
PREDICTING FUTURE FOR RMS.PA
PREDICTING FUTURE F

### XGBoost Regression

#### Model training and Backtest

In [8]:
df = df.sort_values(by=["Ticker", "Date"])

for window in [5, 10, 20]:
    df[f"MA_{window}"] = df.groupby("Ticker")["Close"].transform(lambda x: x.rolling(window).mean())
    df[f"Volume_MA_{window}"] = df.groupby("Ticker")["Volume"].transform(lambda x: x.rolling(window).mean())

for lag in range(1, 8):
    df[f"lag_{lag}"] = df.groupby("Ticker")["Close"].shift(lag)

for i in range(1, 8):
    df[f"Close_t+{i}"] = df.groupby("Ticker")["Close"].shift(-i)


In [9]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

forecast_records = []

for ticker in tickers:
    print(f"Training XGBoost for {ticker}")
    ticker_df = df[df["Ticker"] == ticker]

    feature_cols = ['Open', 'High', 'Low', 'Volume',
                    'MA_5', 'MA_10', 'MA_20',
                    'Volume_MA_5', 'Volume_MA_10', 'Volume_MA_20'] + \
                   [f"lag_{i}" for i in range(1, 8)]
    
    target_cols = [f"Close_t+{i}" for i in range(1, 8)]

    X = ticker_df[feature_cols]
    y = ticker_df[target_cols]

    # Split data for backtesting
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    model = MultiOutputRegressor(XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5))
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    all_predictions = []
    all_actuals = []

    test_dates = ticker_df["Date"].iloc[train_size:].reset_index(drop=True)

    for i in range(len(X_test)):
        actuals = y_test.iloc[i].values

        for day in range(7):  # Predict each future day (t+1 to t+7)
            predicted_close = y_pred[i][day]
            
            all_predictions.append(predicted_close)
            all_actuals.append(actuals[day])

            forecast_records.append({
                "date": test_dates[i] + pd.Timedelta(days=day+1),  # t+1 to t+7
                "ticker": ticker,
                "predicted_close": predicted_close,
                "actual_close": actuals[day],
                "type": "backtest",
                "rmse": None,
                "mae": None, 
                "model": "XGBoost",
                "training_timestamp": TRAINING_TIMESTAMP
            })

    all_predictions = np.array(all_predictions)
    all_actuals = np.array(all_actuals)

    valid_mask = ~np.isnan(all_predictions) & ~np.isnan(all_actuals)

    total_rmse = np.sqrt(mean_squared_error(all_actuals[valid_mask], all_predictions[valid_mask]))
    total_mae = mean_absolute_error(all_actuals[valid_mask], all_predictions[valid_mask])

    for record in forecast_records:
        if record["type"] == "backtest" and record["ticker"] == ticker:
            record["rmse"] = total_rmse
            record["mae"] = total_mae

    X_latest = X.iloc[[-1]]
    y_future_pred = model.predict(X_latest)[0]

    last_date = ticker_df["Date"].max()
    forecast_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=7)

    for i, pred in enumerate(y_future_pred):
        forecast_records.append({
            "date": forecast_dates[i],
            "ticker": ticker,
            "predicted_close": pred,
            "actual_close": None,
            "type": "prediction",
            "rmse": None,
            "mae": None,
            "model": "XGBoost",
            "training_timestamp": TRAINING_TIMESTAMP
        })

forecast_df = pd.DataFrame(forecast_records)

Training XGBoost for AAPL
Training XGBoost for MSFT
Training XGBoost for NVDA
Training XGBoost for GOOG
Training XGBoost for GOOGL
Training XGBoost for AMZN
Training XGBoost for META
Training XGBoost for BRK-B
Training XGBoost for BRK-A
Training XGBoost for AVGO
Training XGBoost for TSM
Training XGBoost for LLY
Training XGBoost for TSLA
Training XGBoost for WMT
Training XGBoost for JPM
Training XGBoost for V
Training XGBoost for MA
Training XGBoost for XOM
Training XGBoost for NFLX
Training XGBoost for COST
Training XGBoost for AZN
Training XGBoost for PG
Training XGBoost for UNH
Training XGBoost for JNJ
Training XGBoost for ORCL
Training XGBoost for HD
Training XGBoost for KO
Training XGBoost for ABBV
Training XGBoost for SAP
Training XGBoost for TMUS
Training XGBoost for NVO
Training XGBoost for BAC
Training XGBoost for BABA
Training XGBoost for PM
Training XGBoost for ASML
Training XGBoost for MC.PA
Training XGBoost for RMS.PA
Training XGBoost for CVX
Training XGBoost for CRM
Traini

In [10]:
xgboost_prediction_forecast_df = pd.DataFrame(forecast_records)
print(xgboost_prediction_forecast_df.tail())

                      date ticker  predicted_close  actual_close        type  \
81258  2025-04-25 00:00:00     GE       180.417130           NaN  prediction   
81259  2025-04-26 00:00:00     GE       183.382126           NaN  prediction   
81260  2025-04-27 00:00:00     GE       176.614014           NaN  prediction   
81261  2025-04-28 00:00:00     GE       176.428482           NaN  prediction   
81262  2025-04-29 00:00:00     GE       181.173233           NaN  prediction   

       rmse  mae    model         training_timestamp  
81258   NaN  NaN  XGBoost 2025-04-23 17:16:07.184637  
81259   NaN  NaN  XGBoost 2025-04-23 17:16:07.184637  
81260   NaN  NaN  XGBoost 2025-04-23 17:16:07.184637  
81261   NaN  NaN  XGBoost 2025-04-23 17:16:07.184637  
81262   NaN  NaN  XGBoost 2025-04-23 17:16:07.184637  


In [None]:

from pandas_gbq import to_gbq

DESTINATION_TABLE_ID = "is3107-project-455413.market_data.stock_forecast"
final_forecast_df = pd.concat([backtest_forecast_df, prediction_forecast_df, xgboost_prediction_forecast_df])

final_forecast_df["date"] = pd.to_datetime(final_forecast_df["date"], errors='coerce')
final_forecast_df["training_timestamp"] = pd.to_datetime(final_forecast_df["training_timestamp"], errors='coerce')

final_forecast_df["ticker"] = final_forecast_df["ticker"].astype(str)
final_forecast_df["type"] = final_forecast_df["type"].astype(str)
final_forecast_df["model"] = final_forecast_df["model"].astype(str)

final_forecast_df["predicted_close"] = pd.to_numeric(final_forecast_df["predicted_close"], errors='coerce')
final_forecast_df["actual_close"] = pd.to_numeric(final_forecast_df["actual_close"], errors='coerce')

to_gbq(
    final_forecast_df,
    DESTINATION_TABLE_ID,
    project_id=PROJECT_ID,
    if_exists='append',
)