In [1]:
import yfinance as yf

# Load stock data

In [90]:
x = yf.Ticker("QQQ")

In [91]:
df = x.history(interval="1h", period="730d", keepna=True)
df.drop(columns=["Dividends", "Stock Splits", "Capital Gains"], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5092 entries, 2022-08-01 09:30:00-04:00 to 2025-06-27 15:30:00-04:00
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    5086 non-null   float64
 1   High    5086 non-null   float64
 2   Low     5086 non-null   float64
 3   Close   5086 non-null   float64
 4   Volume  5092 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 238.7 KB


# Impute missing values

In [92]:
df[df.isna().any(axis=1)]

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-11-25 12:30:00-05:00,,,,,0
2023-07-03 12:30:00-04:00,,,,,0
2023-11-24 12:30:00-05:00,,,,,0
2024-07-03 12:30:00-04:00,,,,,0
2024-11-29 12:30:00-05:00,,,,,0
2024-12-24 12:30:00-05:00,,,,,0


In [93]:
df["Close"] = df["Close"].ffill()
df.fillna(
    {
        "Open": df["Close"],
        "High": df["Close"],
        "Low": df["Close"],
    },
    inplace=True,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5092 entries, 2022-08-01 09:30:00-04:00 to 2025-06-27 15:30:00-04:00
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    5092 non-null   float64
 1   High    5092 non-null   float64
 2   Low     5092 non-null   float64
 3   Close   5092 non-null   float64
 4   Volume  5092 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 238.7 KB


# Add time series features

In [157]:
import pandas as pd

df['delta_time'] = df.index.to_series().diff().dt.total_seconds() / 60
df['delta_time'] = df['delta_time'].fillna(0).astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5092 entries, 2022-08-01 09:30:00-04:00 to 2025-06-27 15:30:00-04:00
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Open        5092 non-null   float64
 1   High        5092 non-null   float64
 2   Low         5092 non-null   float64
 3   Close       5092 non-null   float64
 4   Volume      5092 non-null   int64  
 5   delta_time  5092 non-null   int64  
dtypes: float64(4), int64(2)
memory usage: 407.5 KB


In [158]:
df.head(40)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,delta_time
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-08-01 09:30:00-04:00,313.915009,317.799988,312.529999,315.73999,13848875,0
2022-08-01 10:30:00-04:00,315.769989,318.809998,315.100006,317.950012,7057038,60
2022-08-01 11:30:00-04:00,317.959991,318.420013,316.309998,317.000092,4346872,60
2022-08-01 12:30:00-04:00,317.0,317.098999,314.630005,314.660004,4451567,60
2022-08-01 13:30:00-04:00,314.665497,315.390015,313.690002,314.910004,5137698,60
2022-08-01 14:30:00-04:00,314.929993,316.179993,314.589996,314.869995,4498435,60
2022-08-01 15:30:00-04:00,314.869995,315.839996,314.779999,315.23999,5045600,60
2022-08-02 09:30:00-04:00,312.899994,315.100006,311.840088,313.910004,12999088,1080
2022-08-02 10:30:00-04:00,313.910004,316.23999,313.25,315.399994,6646904,60
2022-08-02 11:30:00-04:00,315.380005,318.130005,314.291595,318.019989,5832569,60


In [112]:
df['delta_time'].max()

np.float64(5400.0)

In [113]:
df[df['delta_time'] == 5400]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,delta_time
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-09-06 09:30:00-04:00,295.660004,296.600006,290.869995,292.410004,13946782,5400.0
2022-12-27 09:30:00-05:00,266.73999,266.850006,263.029999,265.23999,9805924,5400.0
2023-01-03 09:30:00-05:00,268.649994,270.154999,264.019989,264.579987,11749638,5400.0
2023-01-17 09:30:00-05:00,280.98761,282.850006,279.580994,282.200012,11354974,5400.0
2023-02-21 09:30:00-05:00,297.5,298.730011,296.109985,296.23999,13338107,5400.0
2023-04-10 09:30:00-04:00,315.070007,315.23999,313.25,313.860107,12243821,5400.0
2023-05-30 09:30:00-04:00,352.670013,353.929993,350.059998,350.700012,24991472,5400.0
2023-06-20 09:30:00-04:00,366.179993,368.320007,363.980011,364.249908,12438771,5400.0
2023-09-05 09:30:00-04:00,376.769989,377.769989,375.829987,376.959991,8513504,5400.0
2023-12-26 09:30:00-05:00,409.25,410.320007,409.149994,410.01001,4601366,5400.0


In [135]:
x = df.index.get_loc("2025-05-27 09:30:00-04:00")
df.iloc[x - 1], df.iloc[x], df.iloc[x + 1]

(Open          5.101500e+02
 High          5.111045e+02
 Low           5.088200e+02
 Close         5.092865e+02
 Volume        6.856161e+06
 delta_time    6.000000e+01
 Name: 2025-05-23 15:30:00-04:00, dtype: float64,
 Open          5.161600e+02
 High          5.191700e+02
 Low           5.145900e+02
 Close         5.188500e+02
 Volume        1.209447e+07
 delta_time    5.400000e+03
 Name: 2025-05-27 09:30:00-04:00, dtype: float64,
 Open          5.188600e+02
 High          5.195750e+02
 Low           5.183500e+02
 Close         5.193400e+02
 Volume        5.405546e+06
 delta_time    6.000000e+01
 Name: 2025-05-27 10:30:00-04:00, dtype: float64)

In [144]:
df[df['delta_time'] != 60]['delta_time'].value_counts()

delta_time
1080.0    568
3960.0    122
5400.0     20
2520.0      7
4020.0      3
4140.0      3
3900.0      3
2700.0      3
Name: count, dtype: int64

In [147]:
df[df['delta_time'] == 1080]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,delta_time
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-08-02 09:30:00-04:00,312.899994,315.100006,311.840088,313.910004,12999088,1080.0
2022-08-03 09:30:00-04:00,316.859985,320.420013,316.720001,319.353790,14682427,1080.0
2022-08-04 09:30:00-04:00,322.750000,324.519989,321.911896,322.130005,12061112,1080.0
2022-08-05 09:30:00-04:00,319.190002,323.829987,319.089996,322.959991,17652699,1080.0
2022-08-09 09:30:00-04:00,318.769989,319.029999,315.649994,316.939911,11400215,1080.0
...,...,...,...,...,...,...
2025-06-18 09:30:00-04:00,530.080017,532.179993,528.479980,531.979980,8881047,1080.0
2025-06-24 09:30:00-04:00,536.700012,538.039978,536.271973,537.405029,10883870,1080.0
2025-06-25 09:30:00-04:00,542.150024,543.309998,541.250000,541.789978,13109829,1080.0
2025-06-26 09:30:00-04:00,543.419983,543.864990,541.520020,543.781006,10024511,1080.0


# Add indicators and signals

In [None]:
from ta import add_all_ta_features

In [94]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-08-01 09:30:00-04:00,313.915009,317.799988,312.529999,315.739990,13848875
2022-08-01 10:30:00-04:00,315.769989,318.809998,315.100006,317.950012,7057038
2022-08-01 11:30:00-04:00,317.959991,318.420013,316.309998,317.000092,4346872
2022-08-01 12:30:00-04:00,317.000000,317.098999,314.630005,314.660004,4451567
2022-08-01 13:30:00-04:00,314.665497,315.390015,313.690002,314.910004,5137698
...,...,...,...,...,...
2025-06-27 11:30:00-04:00,549.260010,549.929871,548.919983,549.359985,4974769
2025-06-27 12:30:00-04:00,549.369995,549.429321,548.609985,548.760010,6454192
2025-06-27 13:30:00-04:00,548.739990,548.989929,546.179993,546.179993,9088471
2025-06-27 14:30:00-04:00,546.190002,546.609985,544.544983,546.260010,9399330


In [71]:
df_wInd = df.copy()
df_wInd = add_all_ta_features(
    df_wInd,
    open="Open",
    high="High",
    low="Low",
    close="Close",
    volume="Volume",
    fillna=False,
)
df_wInd.drop(
    columns=[
        "volume_em",
        "volume_sma_em",
        "trend_psar_up",
        "trend_psar_down",
        "trend_psar_up_indicator",
        "trend_psar_down_indicator",
    ],
    inplace=True,
)
df_wInd.info()

  self._psar[i] = high2


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5092 entries, 2022-08-01 09:30:00-04:00 to 2025-06-27 15:30:00-04:00
Data columns (total 85 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Open                     5092 non-null   float64
 1   High                     5092 non-null   float64
 2   Low                      5092 non-null   float64
 3   Close                    5092 non-null   float64
 4   Volume                   5092 non-null   int64  
 5   volume_adi               5092 non-null   float64
 6   volume_obv               5092 non-null   int64  
 7   volume_cmf               5073 non-null   float64
 8   volume_fi                5079 non-null   float64
 9   volume_vpt               5091 non-null   float64
 10  volume_vwap              5079 non-null   float64
 11  volume_mfi               5079 non-null   float64
 12  volume_nvi               5092 non-null   float64
 13  volatility_bbm           5073 

In [72]:
df_filtered = df_wInd.loc["2022-08-17 09:30:00-04:00":]
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5008 entries, 2022-08-17 09:30:00-04:00 to 2025-06-27 15:30:00-04:00
Data columns (total 85 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Open                     5008 non-null   float64
 1   High                     5008 non-null   float64
 2   Low                      5008 non-null   float64
 3   Close                    5008 non-null   float64
 4   Volume                   5008 non-null   int64  
 5   volume_adi               5008 non-null   float64
 6   volume_obv               5008 non-null   int64  
 7   volume_cmf               5008 non-null   float64
 8   volume_fi                5008 non-null   float64
 9   volume_vpt               5008 non-null   float64
 10  volume_vwap              5008 non-null   float64
 11  volume_mfi               5008 non-null   float64
 12  volume_nvi               5008 non-null   float64
 13  volatility_bbm           5008 

In [73]:
df_filtered[df_filtered.isna().any(axis=1)]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,volume_adi,volume_obv,volume_cmf,volume_fi,volume_vpt,...,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# Calculate the forcast values (labels)

In [None]:
# df["PctCng_7"] = df["Close"].pct_change(7)
# df["PctCng_7_shift"] = df["PctCng_7"].shift(-7)

df_filtered["pct_cng_7"] = df_filtered["Close"].pct_change(7)
df_filtered["pct_cng_7_shift"] = df_filtered["pct_cng_7"].shift(-7)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['pct_cng_7'] = df_filtered['Close'].pct_change(7)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['pct_cng_7_shift'] = df_filtered['pct_cng_7'].shift(-7)


In [None]:
df_filtered.head(35 + 7).loc[:, ["Close", "pct_cng_7", "pct_cng_7_shift"]]

Unnamed: 0_level_0,Close,pct_cng_7,pct_cng_7_shift
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-08-17 09:30:00-04:00,328.350006,,-0.00134
2022-08-17 10:30:00-04:00,327.709991,,0.006866
2022-08-17 11:30:00-04:00,327.786713,,0.003808
2022-08-17 12:30:00-04:00,328.529999,,0.003678
2022-08-17 13:30:00-04:00,330.494812,,-0.002829
2022-08-17 14:30:00-04:00,329.119904,,0.003211
2022-08-17 15:30:00-04:00,328.48999,,0.002496
2022-08-18 09:30:00-04:00,327.910004,-0.00134,-0.012747
2022-08-18 10:30:00-04:00,329.959991,0.006866,-0.021154
2022-08-18 11:30:00-04:00,329.035004,0.003808,-0.018053


In [76]:
df_filtered["pct_cng_7"].max(), df_filtered["pct_cng_7"].min()

(np.float64(0.11752838404639743), np.float64(-0.06300678940570459))

In [None]:
threshold_pct = 0.5 / 100
[
    (df_filtered[f"pct_cng_7"] > threshold_pct).sum(),
    (
        (df_filtered[f"pct_cng_7"] <= threshold_pct)
        & (df_filtered[f"pct_cng_7"] >= -threshold_pct)
    ).sum(),
    (df_filtered[f"pct_cng_7"] < -threshold_pct).sum(),
]

[np.int64(1722), np.int64(1864), np.int64(1415)]

In [None]:
# Loop through the df
for i in range(4, len(df) - 1):
    print(df.iloc[i - 4 : i])
    print(df.iloc[i + 1]["Close"])

In [None]:
X, y = [], []

# Create features and labels
# Using the previous 15-minute candle to predict the next 5-minute candle
# Features: Open, High, Low, Close of the current candle
# Label: [0, 0, 1] if the next candle's close is >5% than the current candle's close
#        [0, 1, 0] if the next candle's close is +-5% than the current candle's close
#        [1, 0, 0] if the next candle's close is <5% than the current candle's close
for i in range(15, len(df) - 1):
    current_candle = df.iloc[i]
    next_candle = df.iloc[i + 1]

    # Features: Open, High, Low, Close of the current candle
    features = [
        current_candle["Open"],
        current_candle["High"],
        current_candle["Low"],
        current_candle["Close"],
    ]

    # Label: [0, 0, 1] if next candle's close > 5% than current candle's close
    #        [0, 1, 0] if next candle's close +- 5% than current candle's close
    #        [1, 0, 0] if next candle's close < 5% than current candle's close
    if next_candle["Close"] > current_candle["Close"] * 1.05:
        label = [0, 0, 1]
    elif next_candle["Close"] < current_candle["Close"] * 0.95:
        label = [1, 0, 0]
    else:
        label = [0, 1, 0]

    X.append(features)
    y.append(label)