In [1]:
import joblib
from prophet import Prophet
from datetime import datetime, timedelta
from yahoo_fin import stock_info as si
import pandas as pd
import matplotlib.pyplot as plt
from prophet.diagnostics import cross_validation, performance_metrics

# Display plots inline
%matplotlib inline
    

In [2]:
# List of stocks
stocks = ["AAPL", "ABBV", "ADBE", "AMZN", "AVGO", "BRK-B", "CRM", "COST", "CVX", "HD", 
          "JNJ", "JPM", "LLY", "MA", "META", "MRK", "MSFT", "NVDA", "PG", "TSLA", "UNH", "V", "XOM"]

start_date = '2023-09-21'
end_date = '2023-12-20'

In [3]:
def fetch_data(stock):
    # end_date = datetime(2024, 2, 9)
    # start_date = end_date - timedelta(days=2*365)
    data = si.get_data(stock, start_date=start_date, end_date=end_date)
    data.reset_index(inplace=True)
    data.rename(columns={'index': 'date'}, inplace=True)
    return data

In [4]:
def calculate_technical_indicators(df):
    # Calculate Simple Moving Average (SMA)
    df['SMA_10'] = df['close'].rolling(window=10).mean()

    # Calculate Relative Strength Index (RSI)
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI_14'] = 100 - (100 / (1 + rs))

    # Calculate Bollinger Bands
    df['Upper_Band'] = df['close'].rolling(window=20).mean() + 2 * df['close'].rolling(window=20).std()
    df['Lower_Band'] = df['close'].rolling(window=20).mean() - 2 * df['close'].rolling(window=20).std()

    # Calculate lagged closing prices
    df['Close_1'] = df['close'].shift(1)
    df['Close_2'] = df['close'].shift(2)

    df.dropna(inplace=True)
    return df

In [5]:
def prepare_data_for_prophet(df):
    df_prophet = df[['date', 'close', 'SMA_10', 'RSI_14', 'Upper_Band', 'Lower_Band', 'Close_1', 'Close_2']]
    df_prophet.rename(columns={'date': 'ds', 'close': 'y'}, inplace=True)
    return df_prophet

In [6]:
def evaluate_model(model, data, horizon='90 days', period='30 days', initial='365 days'):
    df_cv = cross_validation(model, initial=initial, period=period, horizon=horizon, parallel="processes")
    df_p = performance_metrics(df_cv)
    return df_p

In [7]:
def find_optimal_retraining_period(stock, model_path, data_prophet):
    retraining_periods = ['30 days', '60 days', '90 days', '120 days']
    best_rmse = float('inf')
    best_period = None
    performance_results = {}

    try:
        model = joblib.load(model_path)
    except FileNotFoundError:
        print(f"Model for {stock} not found. Skipping.")
        return None, None

    for period in retraining_periods:
        df_p = evaluate_model(model, data_prophet, period=period)
        avg_rmse = df_p['rmse'].mean()
        performance_results[period] = df_p
        if avg_rmse < best_rmse:
            best_rmse = avg_rmse
            best_period = period

    return best_period, performance_results

In [8]:
# Dictionary to store optimal retraining periods for each stock
optimal_retraining_periods = {}

# Loop through each stock
for stock in stocks:
    print(f"Processing {stock}...")

    # Fetch historical data
    data = fetch_data(stock)

    # Calculate technical indicators
    data = calculate_technical_indicators(data)

    # Prepare the data for Prophet
    data_prophet = prepare_data_for_prophet(data)

    # Load the corresponding Prophet model (assuming you have a separate model for each stock)
    model_path = f'Models/{stock}_prophet_model.pkl'


    # Find the optimal retraining period
    best_period, performance_results = find_optimal_retraining_period(stock, model_path, data_prophet)

    print(f"performance_results structure: {performance_results}")


    if best_period:
        optimal_retraining_periods[stock] = best_period
        print(f"Optimal retraining period for {stock}: {best_period}")

        # Display the performance metrics for the best retraining period
        df_cv, best_metrics = performance_results[best_period]
        best_metrics['date'] = df_cv['ds']
        best_metrics['yhat'] = df_cv['yhat']
        print(f"Performance metrics for {stock} with optimal retraining period ({best_period}):")
        display(best_metrics)
    else:
        print(f"Skipping {stock} due to missing model.")

Processing AAPL...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prophet.rename(columns={'date': 'ds', 'close': 'y'}, inplace=True)


performance_results structure: {'30 days':    horizon       mse      rmse       mae      mape     mdape     smape  \
0   9 days  3.811635  1.952341  1.593968  0.009275  0.007303  0.009256   
1  10 days  3.409542  1.846495  1.492861  0.008663  0.007303  0.008639   
2  11 days  3.220469  1.794567  1.400858  0.008135  0.007303  0.008098   
3  12 days  3.756517  1.938174  1.523590  0.008849  0.008098  0.008819   
4  13 days  3.789706  1.946717  1.547422  0.008963  0.008098  0.008923   
..     ...       ...       ...       ...       ...       ...       ...   
77 86 days  5.333598  2.309458  1.794412  0.009907  0.008768  0.009891   
78 87 days  6.427222  2.535197  1.941447  0.010762  0.009078  0.010765   
79 88 days  7.077805  2.660415  2.102769  0.011641  0.010646  0.011644   
80 89 days  7.723384  2.779098  2.300518  0.012696  0.012163  0.012712   
81 90 days  8.174936  2.859185  2.376971  0.013193  0.015042  0.013252   

    coverage  
0   0.875000  
1   0.875000  
2   0.875000  
3   0.83

ValueError: too many values to unpack (expected 2)

In [47]:
# Display the optimal retraining periods for each stock
print("Optimal retraining periods for each stock:")
for stock, period in optimal_retraining_periods.items():
    print(f"{stock}: {period}")

Optimal retraining periods for each stock:
AAPL: 60 days
ABBV: 120 days
ADBE: 90 days
AMZN: 60 days
AVGO: 120 days
BRK-B: 120 days
CRM: 60 days
COST: 60 days
CVX: 120 days
HD: 60 days
JNJ: 120 days
JPM: 30 days
LLY: 90 days
MA: 120 days
META: 60 days
MRK: 60 days
MSFT: 30 days
NVDA: 120 days
PG: 120 days
TSLA: 30 days
UNH: 120 days
V: 120 days
XOM: 120 days
