# Financial Forecasting Model

## Steps
1. Read rows from Google BigQuery
2. Train forecasting model
    - ARIMA
    - LSTM
3. Generate 30 day forecasts
4. Backtest
5. Insert predicted closing prices into Google BigQuery table
6. Visualize in Looker Studio

In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from datetime import datetime
import os

PROJECT_ID = os.getenv("GCP_PROJECT_ID")
TABLE_ID = "is3107-project-455413.market_data.yf_daily_json"
BIGQUERY_COLUMNS = ["Ticker", "Date", "Open", "High", "Low", "Close", "Volume"]
TRAINING_TIMESTAMP = datetime.now()

print(PROJECT_ID)

is3107-project-455413


In [None]:
from google.cloud import bigquery

client = bigquery.Client(PROJECT_ID)
query = f"""
SELECT Ticker, Date, Open, High, Low, Close, Volume
FROM `{TABLE_ID}`
WHERE 
    Ticker in ('AAPL', 'NVDA', 'MSFT')
    AND Date >= DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR)
"""

df = client.query(query).to_dataframe()

ValueError: Please install the 'db-dtypes' package to use this function.

In [5]:
print(df.head())
print(df.info())
tickers = df["Ticker"].unique()

  Ticker        Date        Open        High         Low       Close    Volume
0   AAPL  2024-04-24  165.757322  168.504360  165.428886  168.225677  48251800
1   AAPL  2024-04-25  168.733259  169.808185  167.359740  169.091568  50558300
2   AAPL  2024-04-26  169.081625  170.534755  168.384902  168.504349  44838400
3   AAPL  2024-04-29  172.555222  175.202725  172.286502  172.684616  68169400
4   AAPL  2024-04-30  172.515412  174.167614  169.201060  169.529510  65934800
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Ticker  750 non-null    object 
 1   Date    750 non-null    dbdate 
 2   Open    750 non-null    float64
 3   High    750 non-null    float64
 4   Low     750 non-null    float64
 5   Close   750 non-null    float64
 6   Volume  750 non-null    Int64  
dtypes: Int64(1), dbdate(1), float64(4), object(1)
memory usage: 41.9+ KB
None


### ARIMA Forecasting

#### Model training and Backtest

In [6]:
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
import warnings

warnings.filterwarnings('ignore')

forecast_records = []
best_orders_by_ticker = {}

for ticker in tickers:
    print(f"PROCESSING {ticker}")
    try:
        ticker_df = df[df["Ticker"] == ticker].sort_values("Date")
        ticker_df.set_index("Date", inplace=True)
        series = ticker_df["Close"]

        if len(series) < 20:
            print(f"Not enough data for {ticker}, skipping.")
            continue

        train_size = int(len(series) * 0.8)
        train, test = series[:train_size], series[train_size:]

        best_aic = float("inf")
        best_order = None
        best_model = None

        for p in range(0, 4):
            for d in range(0, 2):
                for q in range(0, 4):
                    try:
                        model = ARIMA(train, order=(p, d, q))
                        model_fit = model.fit()
                        if model_fit.aic < best_aic:
                            best_aic = model_fit.aic
                            best_order = (p, d, q)
                            best_model = model_fit
                    except:
                        continue

        if best_model:
            best_orders_by_ticker[ticker] = best_order
            forecast = best_model.forecast(len(test))
            forecast.index = test.index
            rmse = np.sqrt(mean_squared_error(test, forecast))
            mae = mean_absolute_error(test, forecast)
            print(f"{ticker} - ARIMA{best_order} | RMSE: {rmse:.2f}, MAE: {mae:.2f}")

            for date in test.index:
                forecast_records.append({
                    "date": date,
                    "ticker": ticker,
                    "predicted_close": forecast.loc[date],
                    "actual_close": test.loc[date],
                    "type": "backtest",
                    "rmse": rmse,
                    "mae": mae,
                    "model": f"ARIMA{best_order}",
                    "training_timestamp": TRAINING_TIMESTAMP
                })

    except Exception as e:
        print(f"Error processing {ticker}: {e}")

backtest_forecast_df = pd.DataFrame(forecast_records)
backtest_forecast_df.head()

PROCESSING AAPL
AAPL - ARIMA(2, 1, 3) | RMSE: 20.52, MAE: 16.95
PROCESSING MSFT
MSFT - ARIMA(0, 1, 0) | RMSE: 27.68, MAE: 23.72
PROCESSING NVDA
NVDA - ARIMA(1, 1, 1) | RMSE: 21.10, MAE: 18.38


Unnamed: 0,date,ticker,predicted_close,actual_close,type,rmse,mae,model,training_timestamp
0,2025-02-11,AAPL,226.579888,232.619995,backtest,20.516729,16.952645,"ARIMA(2, 1, 3)",2025-04-24 23:30:22.387106
1,2025-02-12,AAPL,227.212975,236.869995,backtest,20.516729,16.952645,"ARIMA(2, 1, 3)",2025-04-24 23:30:22.387106
2,2025-02-13,AAPL,226.999487,241.529999,backtest,20.516729,16.952645,"ARIMA(2, 1, 3)",2025-04-24 23:30:22.387106
3,2025-02-14,AAPL,226.783258,244.600006,backtest,20.516729,16.952645,"ARIMA(2, 1, 3)",2025-04-24 23:30:22.387106
4,2025-02-18,AAPL,227.401567,244.470001,backtest,20.516729,16.952645,"ARIMA(2, 1, 3)",2025-04-24 23:30:22.387106


#### 7-Day forecast

In [7]:
from datetime import timedelta

forecast_records = []
for ticker in tickers:
    print(f"PREDICTING FUTURE FOR {ticker}")
    try:
        if ticker not in best_orders_by_ticker:
            print(f"No ARIMA order found from backtest for {ticker}, skipping.")
            continue

        order = best_orders_by_ticker[ticker]
        ticker_df = df[df["Ticker"] == ticker].sort_values("Date")
        ticker_df.set_index("Date", inplace=True)
        series = ticker_df["Close"]

        model = ARIMA(series, order=order)
        model_fit = model.fit()
        forecast = model_fit.forecast(steps=7)

        last_date = series.index[-1]
        forecast_dates = pd.date_range(start=last_date + timedelta(days=1), periods=7, freq="D")

        for date, pred in zip(forecast_dates, forecast):
            forecast_records.append({
                "date": date,
                "ticker": ticker,
                "predicted_close": pred,
                "actual_close": None,
                "type": "prediction",
                "rmse": None,
                "mae": None,
                "model": f"ARIMA{order}",
                "training_timestamp": TRAINING_TIMESTAMP
            })

    except Exception as e:
        print(f"Error predicting future for {ticker}: {e}")


prediction_forecast_df = pd.DataFrame(forecast_records)
print(prediction_forecast_df.tail())

PREDICTING FUTURE FOR AAPL
PREDICTING FUTURE FOR MSFT
PREDICTING FUTURE FOR NVDA
         date ticker  predicted_close actual_close        type  rmse   mae  \
16 2025-04-26   NVDA       102.846001         None  prediction  None  None   
17 2025-04-27   NVDA       102.766716         None  prediction  None  None   
18 2025-04-28   NVDA       102.828999         None  prediction  None  None   
19 2025-04-29   NVDA       102.780072         None  prediction  None  None   
20 2025-04-30   NVDA       102.818507         None  prediction  None  None   

             model         training_timestamp  
16  ARIMA(1, 1, 1) 2025-04-24 23:30:22.387106  
17  ARIMA(1, 1, 1) 2025-04-24 23:30:22.387106  
18  ARIMA(1, 1, 1) 2025-04-24 23:30:22.387106  
19  ARIMA(1, 1, 1) 2025-04-24 23:30:22.387106  
20  ARIMA(1, 1, 1) 2025-04-24 23:30:22.387106  


### XGBoost Regression

#### Model training and Backtest

In [8]:
df = df.sort_values(by=["Ticker", "Date"])

for window in [5, 10, 20]:
    df[f"MA_{window}"] = df.groupby("Ticker")["Close"].transform(lambda x: x.rolling(window).mean())
    df[f"Volume_MA_{window}"] = df.groupby("Ticker")["Volume"].transform(lambda x: x.rolling(window).mean())

for lag in range(1, 8):
    df[f"lag_{lag}"] = df.groupby("Ticker")["Close"].shift(lag)

for i in range(1, 8):
    df[f"Close_t+{i}"] = df.groupby("Ticker")["Close"].shift(-i)


In [9]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

forecast_records = []

for ticker in tickers:
    print(f"Training XGBoost for {ticker}")
    ticker_df = df[df["Ticker"] == ticker]

    feature_cols = ['Open', 'High', 'Low', 'Volume',
                    'MA_5', 'MA_10', 'MA_20',
                    'Volume_MA_5', 'Volume_MA_10', 'Volume_MA_20'] + \
                   [f"lag_{i}" for i in range(1, 8)]
    
    target_cols = [f"Close_t+{i}" for i in range(1, 8)]

    X = ticker_df[feature_cols]
    y = ticker_df[target_cols]

    # Split data for backtesting
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    model = MultiOutputRegressor(XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5))
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    all_predictions = []
    all_actuals = []

    test_dates = ticker_df["Date"].iloc[train_size:].reset_index(drop=True)

    for i in range(len(X_test)):
        actuals = y_test.iloc[i].values

        for day in range(7):  # Predict each future day (t+1 to t+7)
            predicted_close = y_pred[i][day]
            
            all_predictions.append(predicted_close)
            all_actuals.append(actuals[day])

            forecast_records.append({
                "date": test_dates[i] + pd.Timedelta(days=day+1),  # t+1 to t+7
                "ticker": ticker,
                "predicted_close": predicted_close,
                "actual_close": actuals[day],
                "type": "backtest",
                "rmse": None,
                "mae": None, 
                "model": "XGBoost",
                "training_timestamp": TRAINING_TIMESTAMP
            })

    all_predictions = np.array(all_predictions)
    all_actuals = np.array(all_actuals)

    valid_mask = ~np.isnan(all_predictions) & ~np.isnan(all_actuals)

    total_rmse = np.sqrt(mean_squared_error(all_actuals[valid_mask], all_predictions[valid_mask]))
    total_mae = mean_absolute_error(all_actuals[valid_mask], all_predictions[valid_mask])

    for record in forecast_records:
        if record["type"] == "backtest" and record["ticker"] == ticker:
            record["rmse"] = total_rmse
            record["mae"] = total_mae

    X_latest = X.iloc[[-1]]
    y_future_pred = model.predict(X_latest)[0]

    last_date = ticker_df["Date"].max()
    forecast_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=7)

    for i, pred in enumerate(y_future_pred):
        forecast_records.append({
            "date": forecast_dates[i],
            "ticker": ticker,
            "predicted_close": pred,
            "actual_close": None,
            "type": "prediction",
            "rmse": None,
            "mae": None,
            "model": "XGBoost",
            "training_timestamp": TRAINING_TIMESTAMP
        })

forecast_df = pd.DataFrame(forecast_records)

Training XGBoost for AAPL
Training XGBoost for MSFT
Training XGBoost for NVDA


In [10]:
xgboost_prediction_forecast_df = pd.DataFrame(forecast_records)
print(xgboost_prediction_forecast_df.tail())

                     date ticker  predicted_close  actual_close        type  \
1066  2025-04-26 00:00:00   NVDA       113.823341           NaN  prediction   
1067  2025-04-27 00:00:00   NVDA       118.419632           NaN  prediction   
1068  2025-04-28 00:00:00   NVDA       117.731133           NaN  prediction   
1069  2025-04-29 00:00:00   NVDA       116.603783           NaN  prediction   
1070  2025-04-30 00:00:00   NVDA       116.843796           NaN  prediction   

      rmse  mae    model         training_timestamp  
1066   NaN  NaN  XGBoost 2025-04-24 23:30:22.387106  
1067   NaN  NaN  XGBoost 2025-04-24 23:30:22.387106  
1068   NaN  NaN  XGBoost 2025-04-24 23:30:22.387106  
1069   NaN  NaN  XGBoost 2025-04-24 23:30:22.387106  
1070   NaN  NaN  XGBoost 2025-04-24 23:30:22.387106  


In [None]:

from pandas_gbq import to_gbq

DESTINATION_TABLE_ID = "is3107-project-455413.market_data.stock_forecast"
final_forecast_df = pd.concat([backtest_forecast_df, prediction_forecast_df, xgboost_prediction_forecast_df])

final_forecast_df["date"] = pd.to_datetime(final_forecast_df["date"], errors='coerce')
final_forecast_df["training_timestamp"] = pd.to_datetime(final_forecast_df["training_timestamp"], errors='coerce')

final_forecast_df["ticker"] = final_forecast_df["ticker"].astype(str)
final_forecast_df["type"] = final_forecast_df["type"].astype(str)
final_forecast_df["model"] = final_forecast_df["model"].astype(str)

final_forecast_df["predicted_close"] = pd.to_numeric(final_forecast_df["predicted_close"], errors='coerce')
final_forecast_df["actual_close"] = pd.to_numeric(final_forecast_df["actual_close"], errors='coerce')

to_gbq(
    final_forecast_df,
    DESTINATION_TABLE_ID,
    project_id=PROJECT_ID,
    if_exists='append',
)