In [27]:
import yfinance as yf
import pandas as pd
import numpy as np
from plotly import graph_objs as go

# LR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# RF
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV

# RNN LSTM
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from keras.layers import Dense, LSTM
from keras.optimizers import Adam

import os
import pickle

In [28]:
def getDayData(ticker, period='max'):
    """
    Get data from Yahoo finance
    Params:
    ticker_symbol: str
    startDate: str of form 'dd-mm-yyyy'
    """

    # Get the data
    data = yf.download(ticker, period=period)
    data.reset_index(inplace=True)
    data.columns = [f"{col[0]}" for col in data.columns]
    data['Date'] = data['Date'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M')) # Date fix
    return data

In [29]:
# Get data and print general information
ticker_symbol = "ABOS"
data = getDayData(ticker_symbol)

print("Columns:\n", ' - ',list(data.columns))
print("Length:\n", ' - ',len(data))
print("First and Last:\n", ' - ', data.values[0], '\n - ', data.values[-1])

fig = go.Figure()
fig.add_trace(go.Scatter(x=data['Date'], y=data['Open'], name="stock_open"))
fig.add_trace(go.Scatter(x=data['Date'], y=data['Close'], name="stock_close"))

[*********************100%***********************]  1 of 1 completed

Columns:
  -  ['Date', 'Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']
Length:
  -  894
First and Last:
  -  ['2021-07-01 00:00' 20.100000381469727 20.100000381469727
 26.979999542236328 18.899999618530273 26.0 2246700] 
 -  ['2025-01-22 00:00' 1.75 1.75 1.809999942779541 1.5801000595092773
 1.649999976158142 587084]





### Quantitative Analysis
I scrolled through Investopedia to find factors, they are listed below to keep a record. Math behind these is a lot simpler than the math behind the RNN so I left it out, it's implemented in code.

Markets are complex adaptive systems. Using volume and other traditional factors as inputs will probably result in low accuracy models but it's a good starting point. 

#### Relative Strength Index
* Purpose: Signals bullish and bearish momentum
* Meaning: Compares a security's strength on days when prices go up to its strength on days when prices go down
    * 70\< is considered overbought
    * \>30  is considered oversold

#### Moving Average
* Purpose: Indicates trend direction of a stock over a period of time
* Simple Moving Average (SMA): calculated by taking the arithmetic mean of a set of values
* Exponential Moving Average (EMA): gives more weight to recent prices in an attempt to make them more responsive to new information

#### Moving Average Convergence Divergence (MACD)
* Purpose: monitors relationship between two exponential moving averages
* Meaning: Subtract 2 MAs, typically subtracting a 26-day exponential moving average from a 12-day moving average
##### Signal line
* Purpose: helps identify crossover
* Meaning: 9 day EMA
    * When MACD is positive, the short-term average is above the long-term average and is an indicator of upward momentum

#### Price Rate of Change (ROC)
* Purpose: Momentum based indicator that measures the percentage change in price between the current prive and the price a certain number of periods ago
* Meaning: Current price subtracted by an old price, divided by old price

#### Money Flow Index (MFI)
* Purpose: Technical indicator that generates overbought or oversold signals using prices and volume
* Meaning: Looking at previous 14 periods, for each period mark whether the typical price was higher or lower than the prior period 


In [30]:
# Technical indicators
def calculate_rsi(data, window=14):
    delta = data.diff().dropna()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi
def calculate_roc(data, window=14):
    roc = ((data - data.shift(window)) / data.shift(window)) * 100
    return roc
def calculate_macd(data, short_window=12, long_window=26, signal_window=9):
    short_ema = data.ewm(span=short_window, adjust=False).mean()
    long_ema = data.ewm(span=long_window, adjust=False).mean()
    macd_line = short_ema - long_ema
    signal_line = macd_line.ewm(span=signal_window, adjust=False).mean()
    macd_histogram = macd_line - signal_line
    
    return pd.DataFrame({
        'MACD_Line': macd_line,
        'Signal_Line': signal_line,
        'MACD_Histogram': macd_histogram
    })
def calculate_mfi(data, window=14):
    """ 
    Calculate the Money Flow Index (MFI) for the given data.
    Params:
    data: pandas.DataFrame
    window: int (steps to look back)
    """
    typical_price = (data['High'] + data['Low'] + data['Close']) / 3
    money_flow = typical_price * data['Volume']
    
    # Identify positive and negative money flow
    positive_flow = money_flow.where(typical_price > typical_price.shift(1), 0)
    negative_flow = money_flow.where(typical_price < typical_price.shift(1), 0)
    
    # Calculate the money flow ratio and MFI
    positive_mf_sum = positive_flow.rolling(window=window).sum()
    negative_mf_sum = negative_flow.rolling(window=window).sum()
    money_flow_ratio = positive_mf_sum / negative_mf_sum
    
    mfi = 100 - (100 / (1 + money_flow_ratio))
    return mfi

In [31]:
# Add technical indicators data
data['RSI'] = calculate_rsi(data['Close'])
data['ROC'] = calculate_roc(data['Close'])
data['MFI'] = calculate_mfi(data, window=14) 

macd_df = calculate_macd(data['Close'], short_window=12, long_window=26, signal_window=9)
data = pd.concat([data, macd_df], axis=1)

In [32]:
# Graph indicators candlestick
# rsi
rsi_fig = go.Figure()
rsi_fig.add_trace(go.Scatter(x=data['Date'], y=data['RSI'], name="RSI"))
rsi_fig.update_layout(
    title="Scatter Plot: Close vs. Date with RSI",
    xaxis_title="Date",
    yaxis_title="RSI",
    template="plotly_white", 
    coloraxis_colorbar=dict(
        title="RSI"
    )
)
rsi_fig.show()
# ROC
roc_fig = go.Figure()
roc_fig.add_trace(go.Scatter(x=data['Date'], y=data['ROC'], name="ROC"))
roc_fig.update_layout(
    title="Scatter Plot: Close vs. Date with ROC",
    xaxis_title="Date",
    yaxis_title="ROC",
    template="plotly_white", 
    coloraxis_colorbar=dict(
        title="ROC"
    )
)
roc_fig.show()
# MFI
mfi_fig = go.Figure()
mfi_fig.add_trace(go.Scatter(x=data['Date'], y=data['MFI'], name="MFI"))
mfi_fig.update_layout(
    title="Scatter Plot: Close vs. Date with MFI",
    xaxis_title="Date",
    yaxis_title="MFI",
    template="plotly_white", 
    coloraxis_colorbar=dict(
        title="MFI"
    )
)
mfi_fig.show()
# MACD
macd_fig = go.Figure()
macd_fig.add_trace(go.Scatter(x=data['Date'], y=data['MACD_Line'], name="MACD_Line"))
macd_fig.update_layout(
    title="Scatter Plot: Close vs. Date with MACD",
    xaxis_title="Date",
    yaxis_title="MACD",
    template="plotly_white", 
    coloraxis_colorbar=dict(
        title="MACD"
    )
)
macd_fig.show()

### Modeling Price Prediction
We are targeting 'Close' price change and this is a supervised learning problem, meaning we would typically use these models:
- Linear Regression
- Random Forest
- Neural Networks (LSTM for sequential data)

Since model implementation is fast and easy for data this small, let's try them all!

One thing to note is we are going to try to predict the change in price as opposed to the actual price.

In [33]:
# Features and target setup

data['Target'] = data["Close"].diff().shift(-1) # Next day's price change
data = data[:-1] # Last row has NaN target since next day's price change is not available
data = data[15:] # First 14 rows have NaN indicators (caused by window in macd)

print(data.head())
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
print(data.tail())

X = data[['Open', 'High', 'Low', 'Close', 'RSI', 'ROC', 'MACD_Line']].values
y = data['Target'].values
print(X.shape, y.shape)

                Date  Adj Close  Close    High        Low       Open  Volume  \
15  2021-07-23 00:00      17.08  17.08  17.309  16.480000  16.870001   95500   
16  2021-07-26 00:00      16.57  16.57  19.680  16.559999  17.170000  135000   
17  2021-07-27 00:00      14.50  14.50  17.520  14.410000  16.930000  314400   
18  2021-07-28 00:00      14.90  14.90  15.760  13.950000  14.610000  291500   
19  2021-07-29 00:00      15.32  15.32  16.125  14.340000  14.770000  144500   

          RSI        ROC        MFI  MACD_Line  Signal_Line  MACD_Histogram  \
15  33.903421 -15.779096  22.828771  -0.675051    -0.508961       -0.166090   
16  32.497589 -17.970302  21.283102  -0.746407    -0.556450       -0.189957   
17  30.664243 -22.666667  20.168795  -0.958935    -0.636947       -0.321988   
18  36.180420 -16.197981  18.855456  -1.082609    -0.726080       -0.356529   
19  31.584368 -18.941799  21.976123  -1.133662    -0.807596       -0.326066   

    Target  
15   -0.51  
16   -2.07  
17   

In [34]:
# Linear Regression Implementation

# Sequential Split: 80% train, 20% test
split_index = int(len(data) * 0.8)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

print("Train Data spans:", data['Date'].values[0], data['Date'].values[split_index])
print("Test Data spans:", data['Date'].values[split_index], data['Date'].values[-1])

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test data
y_pred_lr = model.predict(X_test)

Train Data spans: 2021-07-23 00:00 2024-05-08 00:00
Test Data spans: 2024-05-08 00:00 2025-01-21 00:00


### Evaluating Linear Regression Performance

In [35]:
# Plotting the results
pred_data = pd.DataFrame({
    'Date': data['Date'].values[split_index:],
    'Actual Change': y_test,
    'Pred. Change': y_pred_lr
})
pred_fig = go.Figure()

pred_fig.add_trace(
    go.Scatter(
        x=pred_data['Date'], 
        y=pred_data['Actual Change'], 
        mode='lines+markers', 
        name='Actual Close',
        line=dict(color='blue'),
        marker=dict(symbol='circle')
    )
)

pred_fig.add_trace(
    go.Scatter(
        x=pred_data['Date'], 
        y=pred_data['Pred. Change'], 
        mode='lines+markers', 
        name='Predicted Close',
        line=dict(color='orange', dash='dash'),
        marker=dict(symbol='x')
    )
)

pred_fig.update_layout(
    title="LINEAR REGRESSION: Actual vs Predicted Change in Price",
    xaxis_title="Date",
    yaxis_title="Change Price (USD)",
    template="plotly_white",
    legend=dict(title="Legend", x=0.1, y=1.1, orientation="h"),
    xaxis=dict(showgrid=True),
    yaxis=dict(showgrid=True)
)

pred_fig.show()

#### Metrics to Evaluate
- Hit Ratio 
  - Days the model predicted the **right direction** of stock price
    - If model predicted increase, did stock increase?
  - This is more related to the state prediction that will be done in different strategy but can still be tested here
- Mean Absolute Error (MAE)
- Mean Squared Error (MSE)
- R-squared ($R^{2}$)
- Symmetric Mean Absolute Percentage Error (SMAPE)
- Autocorrelation of Residuals (Make sure residuals are random)
- Simulate buying and selling
  - Buy when predicting higher prices, sell when predicting lower prices


In [36]:
# Residual Plotting
residuals = y_test - y_pred_lr
residual_fig = go.Figure()
residual_fig.add_trace(go.Scatter(x=pred_data['Date'], y=residuals, mode='lines+markers'))
residual_fig.update_layout(
    title="Residual Plot",
    xaxis_title="Date",
    yaxis_title="Residuals",
    template="plotly_white"
)
residual_fig.show()

In [37]:
# Metric Functions

# Hit Ratio
def hitRatio(actual, predicted):
    hits = 0
    for i in range(len(actual)):
        if actual[i] < 0 and predicted[i] < 0: # If both are negative
            hits += 1
        elif actual[i] > 0 and predicted[i] > 0: # If both are positive
            hits += 1
    return hits / len(actual)

# TODO: Simulate buy/sell metric

def evaluationMetrics(y_test, y_pred, pred_data):
    """
    Params: 
    y_test: (np.array) Actual values
    y_pred: (np.array) Predicted values
    pred_data: (pd.DataFrame) Dataframe with Date, Actual Close, Pred. Close, Actual Change, Pred. Change
    """
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    hr = hitRatio(pred_data["Actual Change"].values, pred_data["Pred. Change"].values)
    return hr, mae, mse, r2

In [38]:
# Print metrics
hr, mae, mse, r2 = evaluationMetrics(y_test, y_pred_lr, pred_data)
print(f"Hit Ratio: {hr}")
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Hit Ratio: 0.45454545454545453
Mean Absolute Error: 0.10952348017175542
Mean Squared Error: 0.019445617995811713
R-squared: -0.14926397221425414


In [39]:
def saveModel(model, title, info):
    # Create a subfolder with the name `title`
    title = "../models/" + title
    if not os.path.exists(title):
        os.makedirs(title)
    
    # Save the model (e.g., using pickle)
    model_path = os.path.join(title, "_model.pkl")
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)
    
    # Save the information to a .txt file
    info_path = os.path.join(title, "_info.txt")
    with open(info_path, 'w') as info_file:
        info_file.write(info)

    print(f"Model and info successfully saved in folder: {title}")

In [40]:
# Save the model and information
info = f"""-- General Info -- \nModel: Linear Regression
Date: {pd.Timestamp.now()}
Ticker(s): {ticker_symbol}
Features: 'Open', 'High', 'Low', 'Close', 'RSI', 'ROC', 'MACD_Line'
Train Data spans: {data['Date'].values[0]} - {data['Date'].values[split_index]}
Test Data spans: {data['Date'].values[split_index]} - {data['Date'].values[-1]}
\n-- Metrics --\n
Hit Ratio: {hr}
Mean Absolute Error: {mae}
Mean Squared Error: {mse}
R-squared: {r2}
Notes: Used daily data for training and testing. Used 80% of the data for training and 20% for testing. Predicted change in price for the next day.
"""
saveModel(model, "lr_ABOS_01.22.25", info)

Model and info successfully saved in folder: ../models/lr_ABOS_01.22.25


### Random Forest Implementation

In [41]:
# This code was ran during linear regression, including it for clarity
    # data['Target'] = data['Close'].shift(-1)
    # X = data[['Open', 'High', 'Low', 'Close', 'RSI', 'ROC', 'MACD_Line']].values

    # Sequential Split: 80% train, 20% test
    # split_index = int(len(data) * 0.8)
    # X_train, X_test = X[:split_index], X[split_index:]
    # y_train, y_test = y[:split_index], y[split_index:]

    # print("Train Data spans:", data['Date'].values[0], data['Date'].values[split_index])
    # print("Test Data spans:", data['Date'].values[split_index], data['Date'].values[-1])

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rf_regressor = RandomForestRegressor(random_state=42)

### Hyperparamater tuning
- n_estimators
- max_depth
- min_samples_split
- min_samples_leaf
- max_features

In [42]:
# Hyperparameter tuning
# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Time series split for cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Randomized Search CV
random_search = RandomizedSearchCV(
    estimator=rf_regressor,
    param_distributions=param_grid,
    n_iter=50,  # Number of parameter settings sampled
    scoring='neg_mean_absolute_error',
    cv=tscv,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Perform the search
random_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits




65 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\higgj\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\higgj\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\higgj\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\

Best Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 50}


In [43]:
# Best model, predicting on test data
best_rf = random_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)

### Evaluating Random Forest Performance

In [44]:
# Plotting the results
pred_data = pd.DataFrame({
    'Date': data['Date'].values[split_index:],
    'Actual Change': y_test,
    'Pred. Change': y_pred_rf
})
pred_fig = go.Figure()

pred_fig.add_trace(
    go.Scatter(
        x=pred_data['Date'], 
        y=pred_data['Actual Change'], 
        mode='lines+markers', 
        name='Actual Change',
        line=dict(color='blue'),
        marker=dict(symbol='circle')
    )
)

pred_fig.add_trace(
    go.Scatter(
        x=pred_data['Date'], 
        y=pred_data['Pred. Change'], 
        mode='lines+markers', 
        name='Predicted Change',
        line=dict(color='orange', dash='dash'),
        marker=dict(symbol='x')
    )
)

pred_fig.update_layout(
    title="RANDOM FOREST: Actual vs Predicted Change in Price",
    xaxis_title="Date",
    yaxis_title="Price Change (USD)",
    template="plotly_white",
    legend=dict(title="Legend", x=0.1, y=1.1, orientation="h"),
    xaxis=dict(showgrid=True),
    yaxis=dict(showgrid=True)
)

pred_fig.show()

In [45]:
# Residual Plotting
residuals = y_test - y_pred_rf
residual_fig = go.Figure()
residual_fig.add_trace(go.Scatter(x=pred_data['Date'], y=residuals, mode='lines+markers'))
residual_fig.update_layout(
    title="Residual Plot",
    xaxis_title="Date",
    yaxis_title="Residuals",
    template="plotly_white"
)
residual_fig.show()

In [46]:
# Print metrics
hr, mae, mse, r2 = evaluationMetrics(y_test, y_pred_rf, pred_data)
print("Random Forest Model Metrics:")
print(f"     Hit Ratio: {hr}")
print(f"     Mean Absolute Error: {mae}")
print(f"     Mean Squared Error: {mse}")
print(f"     R-squared: {r2}")

Random Forest Model Metrics:
     Hit Ratio: 0.4147727272727273
     Mean Absolute Error: 0.11655397705812193
     Mean Squared Error: 0.022333240526786716
     R-squared: -0.3199266140967849


In [47]:
# Save the model
info = f"""-- General Info -- \nModel: Random Forest Regressor
Date: {pd.Timestamp.now()}
Ticker(s): {ticker_symbol}
Features: 'Open', 'High', 'Low', 'Close', 'RSI', 'ROC', 'MACD_Line'
Train Data spans: {data['Date'].values[0]} - {data['Date'].values[split_index]}
Test Data spans: {data['Date'].values[split_index]} - {data['Date'].values[-1]}
\n-- Metrics --\n
Hit Ratio: {hr}
Mean Absolute Error: {mae}
Mean Squared Error: {mse}
R-squared: {r2}
Notes: Used daily data for training and testing. Used 80% of the data for training and 20% for testing. Predicted change in price for the next day.
"""
saveModel(model, "rf_ABOS_11.26", info)

Model and info successfully saved in folder: ../models/rf_ABOS_11.26
