In [1]:
# Import libraries and dependencies
import os
import pandas as pd
import alpaca_trade_api as tradeapi
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from yahoo_fin.stock_info import get_data
from sklearn.metrics import classification_report
import warnings
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


warnings.filterwarnings("ignore")


2024-01-22 20:13:55.574145: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Baseline

## Pull Data

In [2]:
# Get historical prices from first candle to the most recent candle
hist_df = get_data('DIS', index_as_date=False)

# Show the first 5 rows of our dataframe
hist_df.head()

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,1970-03-25,0.734737,0.757216,0.734737,0.750151,0.497615,2627471,DIS
1,1970-03-26,0.750151,0.75272,0.740517,0.75272,0.49932,2082514,DIS
2,1970-03-30,0.754005,0.765565,0.754005,0.762996,0.506136,1109377,DIS
3,1970-03-31,0.762996,0.764923,0.750151,0.757858,0.502728,1868424,DIS
4,1970-04-01,0.757858,0.77841,0.747582,0.775841,0.514657,2627471,DIS


## Create Prices DataFrame

In [3]:
prices = hist_df.drop(['adjclose'], axis=1)
prices.head()

Unnamed: 0,date,open,high,low,close,volume,ticker
0,1970-03-25,0.734737,0.757216,0.734737,0.750151,2627471,DIS
1,1970-03-26,0.750151,0.75272,0.740517,0.75272,2082514,DIS
2,1970-03-30,0.754005,0.765565,0.754005,0.762996,1109377,DIS
3,1970-03-31,0.762996,0.764923,0.750151,0.757858,1868424,DIS
4,1970-04-01,0.757858,0.77841,0.747582,0.775841,2627471,DIS


## Create Breakout Features

In [4]:

# Add difference between closing price and opening price
# NOTE: O-to-C is the closing price is higher than the opening price
prices['O-to-C'] = prices['close'] - prices['open']
prices['OC-20D-Mean'] = prices['O-to-C'].rolling(20).mean()
# Calculate the % change of the current day's O-to-C relative to the moving average
prices['OC-%-from-20D-Mean'] = 100*(prices['O-to-C'] - prices['OC-20D-Mean'])/prices['OC-20D-Mean']
# Get the maximum OC compared to the recent 10 breakouts (including the current breakout)
prices['MaxOC_Prev10'] = prices['O-to-C'].rolling(10).max()
# Add 20-Day moving average for volume 
prices['Volume-20D-Mean'] = prices['volume'].rolling(20).mean()
# Calculate the % change of the current volume relative to the moving average
prices['Volume-%-from-20D-Mean'] = 100*(prices['volume'] - prices['Volume-20D-Mean'])/prices['Volume-20D-Mean']

# Rearrange the columns for our dataframe
prices = prices[['ticker', 'date', 'open', 'high', 'low', 'close', 
                 'O-to-C', 'OC-20D-Mean', 'volume', 'Volume-20D-Mean', 
                 'MaxOC_Prev10', 'Volume-%-from-20D-Mean', 'OC-%-from-20D-Mean', 
                ]]

#drop null values
prices = prices.dropna()

prices.tail()

Unnamed: 0,ticker,date,open,high,low,close,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean
13571,DIS,2024-01-16,90.290001,93.089996,89.459999,93.050003,2.760002,0.109499,16780900,11365470.0,2.760002,47.648096,2420.58221
13572,DIS,2024-01-17,92.580002,93.760002,90.099998,90.339996,-2.240005,-0.023001,15735200,11274945.0,2.760002,39.558996,9638.527622
13573,DIS,2024-01-18,90.660004,92.510002,90.410004,92.209999,1.549995,0.046498,12203200,11430925.0,2.760002,6.756015,3233.459128
13574,DIS,2024-01-19,92.32,93.580002,91.629997,93.059998,0.739998,0.018998,19782400,11979845.0,2.760002,65.130684,3795.144773
13575,DIS,2024-01-22,92.870003,95.230003,92.870003,95.080002,2.209999,0.227998,14798553,12066742.65,2.760002,22.63917,869.306472


## Build the Breakout Condition 

## Here are our conditions, find every breakout that:

#### -the closing price is higher than the opening price
#### -that is longest in 10 days
#### -that is at least 100% longer than the average of the previous 20 breakout (including the current breakout)
#### -has a volume that is at least 50% higher than the average of the previous 20 breakouts (including the current breakout)

In [5]:
# #Code for Breakout condition that the closing price is higher than the opening price
# prices['O-to-C'] >= 0.0
# #Code for Breakout condition "that is longest in 10 days"
# prices['O-to-C'] == prices['MaxOC_Prev10'
# #Code for Breakout condition "that is at least 100% longer than the average of the previous 20 breakout (including the current breakout)"
# prices['OC-%-from-20D-Mean'] >= 100.0
# #Code for Breakout condition "has a volume that is at least 50% higher than the average of the previous 20 breakouts (including the current breakout)"
# prices['Volume-%-from-20D-Mean'] >= 50.0
                           
#Putting it all together 
condition = (prices['O-to-C'] >= 0.0) & (prices['O-to-C'] == prices['MaxOC_Prev10']) & (prices['OC-%-from-20D-Mean'] >= 100.0) & (prices['Volume-%-from-20D-Mean'] >= 50.0) 
breakouts = prices[condition]
breakouts.shape

(202, 13)

##### By looking at the shape, we can see that Disney only has 202 breakout points going back to 1970 

## Create Breakout Column for Prediction

In [6]:
#Putting it all together 
condition = (prices['O-to-C'] >= 0.0) & (prices['O-to-C'] == prices['MaxOC_Prev10']) & (prices['OC-%-from-20D-Mean'] >= 100.0) & (prices['Volume-%-from-20D-Mean'] >= 50.0) 
breakouts = prices[condition]
breakouts.head()

Unnamed: 0,ticker,date,open,high,low,close,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean
160,DIS,1970-11-09,0.657667,0.692991,0.657667,0.685926,0.028259,0.003821,8875013,2111708.15,0.028259,320.276495,639.483478
169,DIS,1970-11-20,0.68143,0.719323,0.68143,0.714827,0.033397,0.005106,5644197,2952498.8,0.033397,91.166784,554.073741
237,DIS,1971-03-02,0.964663,1.008337,0.964663,1.001914,0.037251,0.005523,6198885,3335428.3,0.037251,85.849745,574.42129
270,DIS,1971-04-19,1.082838,1.127796,1.082838,1.114951,0.032113,0.001349,3396249,1900050.65,0.032113,78.745182,2280.949435
275,DIS,1971-04-26,1.172753,1.226702,1.172753,1.226702,0.053949,0.005523,3483832,1909295.5,0.053949,82.466884,876.744537


## Create Y - Breakout Signal

In [7]:
#Creating a new column so we can assign binary values
prices['breakout_signal'] = np.where(condition, 1, 0)
prices.head()

Unnamed: 0,ticker,date,open,high,low,close,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean,breakout_signal
19,DIS,1970-04-22,0.675007,0.67565,0.66152,0.666016,-0.008991,-0.002762,3503294,2232377.15,0.007064,56.931099,225.560014,0
20,DIS,1970-04-23,0.663447,0.663447,0.647391,0.655098,-0.008349,-0.00395,4184491,2310228.15,0.007064,81.128907,111.374949,0
21,DIS,1970-04-24,0.653813,0.653813,0.634546,0.634546,-0.019267,-0.005042,4671059,2439655.4,0.007064,91.463885,282.156637,0
22,DIS,1970-04-27,0.630692,0.630692,0.601148,0.60436,-0.026332,-0.006808,4515357,2609954.4,0.007064,73.005207,286.791524,0
23,DIS,1970-04-28,0.60436,0.631977,0.601791,0.61592,0.01156,-0.005973,4865687,2759817.55,0.01156,76.304662,-293.54108,0


## Clean Dataset Function for X

In [8]:
# gets rid of null and infinite values 
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)

## Prepare X and Y 

In [9]:
#Create raw X and Y 
X = prices[['O-to-C', 'OC-20D-Mean', 'Volume-20D-Mean', 'MaxOC_Prev10', 'Volume-%-from-20D-Mean', 'OC-%-from-20D-Mean']]
y = prices['breakout_signal'] 

#clean data
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(999, inplace=True)

# train test split 
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
#scaling 
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

## Predict

In [10]:
#Model
classifier = LogisticRegression(solver='lbfgs', random_state=1)
#Fit
classifier.fit(X_train, y_train) 
#Predict
predictions = classifier.predict(X_test)
#evaluate 
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3338
           1       0.00      0.00      0.00        51

    accuracy                           0.98      3389
   macro avg       0.49      0.50      0.50      3389
weighted avg       0.97      0.98      0.98      3389



### Although our model has a high global accuracy, it's clear that it cannot predict breakouts for Disney.

# Are we using the wrong model? Let's try XGBoost

In [11]:
# LabelEncoder which is specific to XGB 
le = LabelEncoder()
y_train = le.fit_transform(y_train)

#Model
classifier = xgb.XGBClassifier() ## notice different classifier 
#Fit
classifier.fit(X_train, y_train)
#Predict
predictions = classifier.predict(X_test)
#evaluate 
y_pred = classifier.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3338
           1       0.75      0.75      0.75        51

    accuracy                           0.99      3389
   macro avg       0.87      0.87      0.87      3389
weighted avg       0.99      0.99      0.99      3389



### We noticed marked improvement by simply changing the classifier to XGBoost. 

# We know we have an Imbalanced Class issue. Will under or over sampling help improve XGBoost results? 

## Create X and Y Over and Under Sampled Versions

In [12]:
# Instantiate the RandomOverSampler instance
random_oversampler = RandomOverSampler(random_state=1)

# create X and Y Oversampled
X_over, y_over = random_oversampler.fit_resample(X_train, y_train)


# Instantiate the RandomUnderSampler instance
rus = RandomUnderSampler(random_state=1)

# create X and Y Undersampled 
X_under, y_under = rus.fit_resample(X_train, y_train)

## Oversampling Result

In [13]:
# LabelEncoder which is specific to XGB 
le = LabelEncoder()
y_train = le.fit_transform(y_train)

#Model
classifier = xgb.XGBClassifier() 
#Fit
classifier.fit(X_over, y_over)
#Predict
predictions = classifier.predict(X_test)
#evaluate 
y_pred = classifier.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      3338
           1       0.66      0.78      0.71        51

    accuracy                           0.99      3389
   macro avg       0.83      0.89      0.85      3389
weighted avg       0.99      0.99      0.99      3389



## Undersampling Result

In [14]:
# LabelEncoder which is specific to XGB 
le = LabelEncoder()
y_train = le.fit_transform(y_train)

#Model
classifier = xgb.XGBClassifier() 
#Fit
classifier.fit(X_under, y_under )
#Predict
predictions = classifier.predict(X_test)
#evaluate 
y_pred = classifier.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      3338
           1       0.46      0.94      0.62        51

    accuracy                           0.98      3389
   macro avg       0.73      0.96      0.80      3389
weighted avg       0.99      0.98      0.99      3389



#### After testing different combinations of sampling and classifiers, here are the results: 
###### 1. XGB Over
###### 2. XGB
###### 3. XGB Under
###### 4. LogisticRegression

# Let's try a decision tree!

In [15]:
#Model
classifier = DecisionTreeClassifier()
#Fit
classifier.fit(X_over, y_over)
#Predict
predictions = classifier.predict(X_test)
#evaluate 
y_pred = classifier.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3338
           1       0.70      0.61      0.65        51

    accuracy                           0.99      3389
   macro avg       0.85      0.80      0.82      3389
weighted avg       0.99      0.99      0.99      3389



# Let's try a support vector classifier!!

In [16]:
#Model
classifier = SVC()
#Fit
classifier.fit(X_over, y_over)
#Predict
predictions = classifier.predict(X_test)
#evaluate 
y_pred = classifier.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.21      0.35      3338
           1       0.02      0.90      0.03        51

    accuracy                           0.22      3389
   macro avg       0.51      0.56      0.19      3389
weighted avg       0.98      0.22      0.34      3389

