In [1]:
# Import libraries and dependencies
import os
import pandas as pd
import alpaca_trade_api as tradeapi
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from yahoo_fin.stock_info import get_data

2024-01-11 18:51:05.708838: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [23]:
# Get historical prices from first candle to the most recent candle
hist_df = get_data('DIS', index_as_date=False)

# Show the first 5 rows of our dataframe
hist_df.head()

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,1970-03-25,0.734737,0.757216,0.734737,0.750151,0.497615,2627471,DIS
1,1970-03-26,0.750151,0.75272,0.740517,0.75272,0.49932,2082514,DIS
2,1970-03-30,0.754005,0.765565,0.754005,0.762996,0.506136,1109377,DIS
3,1970-03-31,0.762996,0.764923,0.750151,0.757858,0.502728,1868424,DIS
4,1970-04-01,0.757858,0.77841,0.747582,0.775841,0.514657,2627471,DIS


In [24]:
hist_df.tail()

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
13565,2024-01-05,90.410004,91.32,90.360001,90.900002,90.900002,9084400,DIS
13566,2024-01-08,91.550003,91.940002,91.0,91.550003,91.550003,11103700,DIS
13567,2024-01-09,91.050003,91.099998,89.599998,89.669998,89.669998,11255100,DIS
13568,2024-01-10,89.82,89.82,88.879997,89.290001,89.290001,15091600,DIS
13569,2024-01-11,89.389999,89.7798,88.684998,89.449997,89.449997,9444510,DIS


Note that I want the date as a column (not as an index), so I coded index_as_date=False above. 

In [25]:
hist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13570 entries, 0 to 13569
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      13570 non-null  datetime64[ns]
 1   open      13570 non-null  float64       
 2   high      13570 non-null  float64       
 3   low       13570 non-null  float64       
 4   close     13570 non-null  float64       
 5   adjclose  13570 non-null  float64       
 6   volume    13570 non-null  int64         
 7   ticker    13570 non-null  object        
dtypes: datetime64[ns](1), float64(5), int64(1), object(1)
memory usage: 848.3+ KB


In [26]:
hist_df.dtypes

date        datetime64[ns]
open               float64
high               float64
low                float64
close              float64
adjclose           float64
volume               int64
ticker              object
dtype: object

In [27]:
prices = hist_df.drop(['adjclose'], axis=1)
prices.head()

Unnamed: 0,date,open,high,low,close,volume,ticker
0,1970-03-25,0.734737,0.757216,0.734737,0.750151,2627471,DIS
1,1970-03-26,0.750151,0.75272,0.740517,0.75272,2082514,DIS
2,1970-03-30,0.754005,0.765565,0.754005,0.762996,1109377,DIS
3,1970-03-31,0.762996,0.764923,0.750151,0.757858,1868424,DIS
4,1970-04-01,0.757858,0.77841,0.747582,0.775841,2627471,DIS


In [29]:
# Add difference between closing price and opening price
# NOTE: O-to-C is the length of the candle's body

prices['O-to-C'] = prices['close'] - prices['open']
prices['O-to-C']

0        0.015414
1        0.002569
2        0.008991
3       -0.005138
4        0.017983
           ...   
13565    0.489998
13566    0.000000
13567   -1.380005
13568   -0.529999
13569    0.059998
Name: O-to-C, Length: 13570, dtype: float64

In [30]:
# Add 20-Day moving average for Open-to-Close column 

prices['OC-20D-Mean'] = prices['O-to-C'].rolling(20).mean()
prices['OC-20D-Mean']

0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
           ...   
13565    0.134999
13566    0.107499
13567    0.000499
13568    0.000499
13569    0.055999
Name: OC-20D-Mean, Length: 13570, dtype: float64

In [36]:
# Calculate the % change of the current day's O-to-C relative to the moving average
prices['OC-%-from-20D-Mean'] = 100*(prices['O-to-C'] - prices['OC-20D-Mean'])/prices['OC-20D-Mean']

In [38]:
# Get the maximum OC compared to the recent 10 candles (including the current candle)
prices['MaxOC_Prev10'] = prices['O-to-C'].rolling(10).max()
prices['MaxOC_Prev10']

0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
           ...   
13565    1.419998
13566    1.419998
13567    1.419998
13568    1.419998
13569    1.419998
Name: MaxOC_Prev10, Length: 13570, dtype: float64

In [39]:
# Add 20-Day moving average for volume 
prices['Volume-20D-Mean'] = prices['volume'].rolling(20).mean()
prices['Volume-20D-Mean'] 

0               NaN
1               NaN
2               NaN
3               NaN
4               NaN
            ...    
13565    10896245.0
13566    10938195.0
13567    11026190.0
13568    11240220.0
13569    11051525.5
Name: Volume-20D-Mean, Length: 13570, dtype: float64

In [40]:
# Calculate the % change of the current volume relative to the moving average
prices['Volume-%-from-20D-Mean'] = 100*(prices['volume'] - prices['Volume-20D-Mean'])/prices['Volume-20D-Mean']
prices['Volume-%-from-20D-Mean']

0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
           ...    
13565   -16.628160
13566     1.513092
13567     2.076057
13568    34.264276
13569   -14.541119
Name: Volume-%-from-20D-Mean, Length: 13570, dtype: float64

In [41]:
# Print the columns for easy copy-pasting
prices.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'ticker', 'O-to-C',
       'OC-20D-Mean', 'OC-%-from-20D-Mean', 'MaxOC_Prev10', 'Volume-20D-Mean',
       'Volume-%-from-20D-Mean'],
      dtype='object')

In [42]:
# Rearrange the columns for our dataframe
prices = prices[['ticker', 'date', 'open', 'high', 'low', 'close', 
                 'O-to-C', 'OC-20D-Mean', 'volume', 'Volume-20D-Mean', 
                 'MaxOC_Prev10', 'Volume-%-from-20D-Mean', 'OC-%-from-20D-Mean', 
                ]]

# Show the 10 most recent rows
prices.tail(10)

Unnamed: 0,ticker,date,open,high,low,close,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean
13560,DIS,2023-12-28,90.089996,90.980003,89.970001,90.400002,0.310005,0.002499,8479600,11287250.0,1.290001,-24.874527,12303.235653
13561,DIS,2023-12-29,90.120003,90.599998,89.860001,90.290001,0.169998,0.053999,9201300,11077480.0,1.290001,-16.936885,214.815338
13562,DIS,2024-01-02,90.099998,91.480003,89.730003,90.709999,0.610001,0.092499,10587600,11012255.0,1.290001,-3.856204,559.466102
13563,DIS,2024-01-03,90.230003,92.080002,90.0,91.650002,1.419998,0.157999,11929800,10978605.0,1.419998,8.664079,798.738486
13564,DIS,2024-01-04,91.940002,91.940002,90.0,90.559998,-1.380005,0.129999,12087400,10965125.0,1.419998,10.234949,-1161.552186
13565,DIS,2024-01-05,90.410004,91.32,90.360001,90.900002,0.489998,0.134999,9084400,10896245.0,1.419998,-16.62816,262.964868
13566,DIS,2024-01-08,91.550003,91.940002,91.0,91.550003,0.0,0.107499,11103700,10938195.0,1.419998,1.513092,-100.0
13567,DIS,2024-01-09,91.050003,91.099998,89.599998,89.669998,-1.380005,0.000499,11255100,11026190.0,1.419998,2.076057,-276886.534047
13568,DIS,2024-01-10,89.82,89.82,88.879997,89.290001,-0.529999,0.000499,15091600,11240220.0,1.419998,34.264276,-106320.183486
13569,DIS,2024-01-11,89.389999,89.7798,88.684998,89.449997,0.059998,0.055999,9444510,11051525.5,1.419998,-14.541119,7.140424


In [48]:
prices = prices.dropna()
prices.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13549 entries, 19 to 13569
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   ticker                  13549 non-null  object        
 1   date                    13549 non-null  datetime64[ns]
 2   open                    13549 non-null  float64       
 3   high                    13549 non-null  float64       
 4   low                     13549 non-null  float64       
 5   close                   13549 non-null  float64       
 6   O-to-C                  13549 non-null  float64       
 7   OC-20D-Mean             13549 non-null  float64       
 8   volume                  13549 non-null  int64         
 9   Volume-20D-Mean         13549 non-null  float64       
 10  MaxOC_Prev10            13549 non-null  float64       
 11  Volume-%-from-20D-Mean  13549 non-null  float64       
 12  OC-%-from-20D-Mean      13549 non-null  float64   

Here are our conditions, find every candle that:

-is green (i.e. the closing price is higher than the opening price);
-has a body that is longest in 10 days;
-has a body that is at least 100% longer than the average of the previous 20 candles (including the current candle); and
-has a volume that is at least 50% higher than the average of the previous 20 candles (including the current candle).

In [49]:
#Code for Breakout condition "is green"
prices['O-to-C'] >= 0.0

19       False
20       False
21       False
22       False
23        True
         ...  
13565     True
13566     True
13567    False
13568    False
13569     True
Name: O-to-C, Length: 13549, dtype: bool

In [50]:
#Code for Breakout condition "has a body that is longest in 10 days"
prices['O-to-C'] == prices['MaxOC_Prev10']

19       False
20       False
21       False
22       False
23        True
         ...  
13565    False
13566    False
13567    False
13568    False
13569    False
Length: 13549, dtype: bool

In [51]:
#Code for Breakout condition "has a body that is at least 100% longer than the average of the previous 20 candles"
prices['OC-%-from-20D-Mean'] >= 100.0

19        True
20        True
21        True
22        True
23       False
         ...  
13565     True
13566    False
13567    False
13568    False
13569    False
Name: OC-%-from-20D-Mean, Length: 13549, dtype: bool

In [52]:
#Code for Breakout condition "has a volume that is at least 50% higher than the average of the previous 20 candles"
prices['Volume-%-from-20D-Mean'] >= 50.0

19        True
20        True
21        True
22        True
23        True
         ...  
13565    False
13566    False
13567    False
13568    False
13569    False
Name: Volume-%-from-20D-Mean, Length: 13549, dtype: bool

In [60]:
#Putting it all together 
condition = (prices['O-to-C'] >= 0.0) & (prices['O-to-C'] == prices['MaxOC_Prev10']) & (prices['OC-%-from-20D-Mean'] >= 100.0) & (prices['Volume-%-from-20D-Mean'] >= 50.0) 
breakouts = prices[condition]
breakouts.head()

Unnamed: 0,ticker,date,open,high,low,close,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean
160,DIS,1970-11-09,0.657667,0.692991,0.657667,0.685926,0.028259,0.003821,8875013,2111708.15,0.028259,320.276495,639.483478
169,DIS,1970-11-20,0.68143,0.719323,0.68143,0.714827,0.033397,0.005106,5644197,2952498.8,0.033397,91.166784,554.073741
237,DIS,1971-03-02,0.964663,1.008337,0.964663,1.001914,0.037251,0.005523,6198885,3335428.3,0.037251,85.849745,574.42129
270,DIS,1971-04-19,1.082838,1.127796,1.082838,1.114951,0.032113,0.001349,3396249,1900050.65,0.032113,78.745182,2280.949435
275,DIS,1971-04-26,1.172753,1.226702,1.172753,1.226702,0.053949,0.005523,3483832,1909295.5,0.053949,82.466884,876.744537


In [63]:
prices['NewColumn'] = np.where(condition, 1, 0)
prices 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prices['NewColumn'] = np.where(condition, 1, 0)


Unnamed: 0,ticker,date,open,high,low,close,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean,NewColumn
19,DIS,1970-04-22,0.675007,0.675650,0.661520,0.666016,-0.008991,-0.002762,3503294,2232377.15,0.007064,56.931099,225.560014,0
20,DIS,1970-04-23,0.663447,0.663447,0.647391,0.655098,-0.008349,-0.003950,4184491,2310228.15,0.007064,81.128907,111.374949,0
21,DIS,1970-04-24,0.653813,0.653813,0.634546,0.634546,-0.019267,-0.005042,4671059,2439655.40,0.007064,91.463885,282.156637,0
22,DIS,1970-04-27,0.630692,0.630692,0.601148,0.604360,-0.026332,-0.006808,4515357,2609954.40,0.007064,73.005207,286.791524,0
23,DIS,1970-04-28,0.604360,0.631977,0.601791,0.615920,0.011560,-0.005973,4865687,2759817.55,0.011560,76.304662,-293.541080,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13565,DIS,2024-01-05,90.410004,91.320000,90.360001,90.900002,0.489998,0.134999,9084400,10896245.00,1.419998,-16.628160,262.964868,0
13566,DIS,2024-01-08,91.550003,91.940002,91.000000,91.550003,0.000000,0.107499,11103700,10938195.00,1.419998,1.513092,-100.000000,0
13567,DIS,2024-01-09,91.050003,91.099998,89.599998,89.669998,-1.380005,0.000499,11255100,11026190.00,1.419998,2.076057,-276886.534047,0
13568,DIS,2024-01-10,89.820000,89.820000,88.879997,89.290001,-0.529999,0.000499,15091600,11240220.00,1.419998,34.264276,-106320.183486,0


In [67]:
filtered_df = prices[prices['NewColumn'] == 1]
filtered_df

Unnamed: 0,ticker,date,open,high,low,close,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean,NewColumn
160,DIS,1970-11-09,0.657667,0.692991,0.657667,0.685926,0.028259,0.003821,8875013,2111708.15,0.028259,320.276495,639.483478,1
169,DIS,1970-11-20,0.681430,0.719323,0.681430,0.714827,0.033397,0.005106,5644197,2952498.80,0.033397,91.166784,554.073741,1
237,DIS,1971-03-02,0.964663,1.008337,0.964663,1.001914,0.037251,0.005523,6198885,3335428.30,0.037251,85.849745,574.421290,1
270,DIS,1971-04-19,1.082838,1.127796,1.082838,1.114951,0.032113,0.001349,3396249,1900050.65,0.032113,78.745182,2280.949435,1
275,DIS,1971-04-26,1.172753,1.226702,1.172753,1.226702,0.053949,0.005523,3483832,1909295.50,0.053949,82.466884,876.744537,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12833,DIS,2021-02-08,183.850006,190.639999,183.300003,190.000000,6.149994,0.135499,16085700,9846580.00,6.149994,63.363320,4438.780750,1
12842,DIS,2021-02-22,181.740005,194.020004,181.529999,191.759995,10.019989,0.724001,18799600,12451720.00,10.019989,50.979945,1283.975333,1
12852,DIS,2021-03-08,197.309998,203.020004,193.789993,201.910004,4.600006,0.321000,25093200,14514370.00,4.600006,72.885217,1333.024117,1
12939,DIS,2021-07-12,177.710007,184.990005,177.369995,184.380005,6.669998,0.356001,21835500,9597300.00,6.669998,127.517114,1773.588246,1


In [108]:
#Using isna fuction to find "infinity" values
filtered_df.isna()

Unnamed: 0,ticker,date,open,high,low,close,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean,NewColumn
160,False,False,False,False,False,False,False,False,False,False,False,False,False,False
169,False,False,False,False,False,False,False,False,False,False,False,False,False,False
237,False,False,False,False,False,False,False,False,False,False,False,False,False,False
270,False,False,False,False,False,False,False,False,False,False,False,False,False,False
275,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12833,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12842,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12852,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12939,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [82]:
import pandas as pd
import numpy as np

def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)

In [87]:
X = prices[['O-to-C', 'OC-20D-Mean', 'Volume-20D-Mean', 'MaxOC_Prev10', 'Volume-%-from-20D-Mean', 'OC-%-from-20D-Mean']]
X
y = prices['NewColumn']

In [98]:
clean_dataset(X)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


Unnamed: 0,O-to-C,OC-20D-Mean,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean
19,-0.008991,-0.002762,2232377.15,0.007064,56.931099,225.560014
20,-0.008349,-0.003950,2310228.15,0.007064,81.128907,111.374949
21,-0.019267,-0.005042,2439655.40,0.007064,91.463885,282.156637
22,-0.026332,-0.006808,2609954.40,0.007064,73.005207,286.791524
23,0.011560,-0.005973,2759817.55,0.011560,76.304662,-293.541080
...,...,...,...,...,...,...
13565,0.489998,0.134999,10896245.00,1.419998,-16.628160,262.964868
13566,0.000000,0.107499,10938195.00,1.419998,1.513092,-100.000000
13567,-1.380005,0.000499,11026190.00,1.419998,2.076057,-276886.534047
13568,-0.529999,0.000499,11240220.00,1.419998,34.264276,-106320.183486


In [100]:
#Using isna fuction to find "infinity" values
X.isna()

Unnamed: 0,O-to-C,OC-20D-Mean,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean
19,False,False,False,False,False,False
20,False,False,False,False,False,False
21,False,False,False,False,False,False
22,False,False,False,False,False,False
23,False,False,False,False,False,False
...,...,...,...,...,...,...
13565,False,False,False,False,False,False
13566,False,False,False,False,False,False
13567,False,False,False,False,False,False
13568,False,False,False,False,False,False


In [104]:
 from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

In [105]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

In [106]:
classifier.fit(X_train, y_train) 

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


ValueError: Input X contains infinity or a value too large for dtype('float64').