In [6]:
# Import libraries and dependencies
import os
import pandas as pd
import alpaca_trade_api as tradeapi
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

from sklearn.preprocessing import MinMaxScaler
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense, Dropout, LSTM
#from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from yahoo_fin.stock_info import get_data
from sklearn.metrics import classification_report
import warnings
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


warnings.filterwarnings("ignore")


# Baseline

## Pull Data

In [None]:
# Fetch Disney's ('DIS') price history, from the first candle to the most recent one.
# index_as_date=False is requested so the frame is not indexed by date.
hist_df = get_data(
    'DIS',
    index_as_date=False,
)

# Preview the first 5 rows of the downloaded frame
hist_df.head()

## Create Prices DataFrame

In [None]:
# Derive the working frame from the raw history, leaving out the adjusted close
prices = hist_df.drop(columns=['adjclose'])
prices.head()

## Create Breakout Features

In [None]:

# Candle body length: close minus open (positive when the candle closes above its open)
candle_body = prices['close'] - prices['open']
# 20-candle rolling means of the body length and of the traded volume
body_mean_20 = candle_body.rolling(20).mean()
volume_mean_20 = prices['volume'].rolling(20).mean()

breakout_features = {
    'O-to-C': candle_body,
    'OC-20D-Mean': body_mean_20,
    # % deviation of the current body from its 20-candle rolling mean
    'OC-%-from-20D-Mean': 100*(candle_body - body_mean_20)/body_mean_20,
    # Largest body among the latest 10 candles (current candle included)
    'MaxOC_Prev10': candle_body.rolling(10).max(),
    'Volume-20D-Mean': volume_mean_20,
    # % deviation of the current volume from its 20-candle rolling mean
    'Volume-%-from-20D-Mean': 100*(prices['volume'] - volume_mean_20)/volume_mean_20,
}

# Final column layout; any column not listed here is dropped from the frame
column_order = [
    'ticker', 'date', 'open', 'high', 'low', 'close',
    'O-to-C', 'OC-20D-Mean', 'volume', 'Volume-20D-Mean',
    'MaxOC_Prev10', 'Volume-%-from-20D-Mean', 'OC-%-from-20D-Mean',
]

# Attach the features, reorder, and discard rows containing nulls
# (e.g. the first rows, where the rolling windows are not yet full)
prices = prices.assign(**breakout_features)[column_order].dropna()

prices.tail()

## Build the Breakout Condition 

In [None]:
# A candle counts as a breakout only when ALL four checks below are true.

# 1. The candle is green
is_green = prices['O-to-C'] >= 0.0
# 2. Its body is the longest in 10 days
is_longest_body_in_10 = prices['O-to-C'] == prices['MaxOC_Prev10']
# 3. Its body is at least 100% longer than the 20-candle average body
is_long_body = prices['OC-%-from-20D-Mean'] >= 100.0
# 4. Its volume is at least 50% higher than the 20-candle average volume
is_high_volume = prices['Volume-%-from-20D-Mean'] >= 50.0

# Putting it all together
condition = is_green & is_longest_body_in_10 & is_long_body & is_high_volume
breakouts = prices[condition]
breakouts.shape

##### Disney has only 202 breakout points going back to 1970

## Create Breakout Column for Prediction

In [None]:
# Build the boolean breakout mask: green candle, longest body in 10 candles,
# body >= 100% above its 20-candle mean, volume >= 50% above its 20-candle mean
body = prices['O-to-C']
condition = (
    body.ge(0.0)
    & body.eq(prices['MaxOC_Prev10'])
    & prices['OC-%-from-20D-Mean'].ge(100.0)
    & prices['Volume-%-from-20D-Mean'].ge(50.0)
)
# Keep only the rows flagged by the mask
breakouts = prices.loc[condition]
breakouts.head()

## Create Y - Breakout Signal

In [None]:
# Encode the boolean breakout mask as a binary target column (1 = breakout, 0 = not)
prices['breakout_signal'] = condition.astype(int)
prices.head()

## Clean Dataset Function for X

In [None]:
# gets rid of null and infinite values
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows holding NaN or +/-inf.

    The input frame is left untouched; a new frame is always returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. Every column must be convertible to ``float64``.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` (original index labels preserved) that contain
        no null and no infinite value, cast to ``np.float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() returns a new frame, so the caller's data is not mutated
    without_nulls = df.dropna()
    # NaN rows are already gone; only the infinities are left to filter out
    has_infinite = without_nulls.isin([np.inf, -np.inf]).any(axis=1)
    return without_nulls[~has_infinite].astype(np.float64)

## Prepare X and Y 

In [None]:
# Create raw X and y
# NOTE(review): breakout_signal is computed directly from several of these
# columns, so the models are asked to re-learn a known rule -- confirm this
# is the intended experiment.
feature_columns = ['O-to-C', 'OC-20D-Mean', 'Volume-20D-Mean', 'MaxOC_Prev10',
                   'Volume-%-from-20D-Mean', 'OC-%-from-20D-Mean']

# Clean data: turn +/-inf into NaN, then fill every NaN with the sentinel 999.
# Chained (non in-place) calls build a fresh frame instead of mutating a
# selection of `prices` in place.
# NOTE(review): 999 is an arbitrary placeholder that the models see as a real
# feature value -- confirm it is intended.
X = (
    prices[feature_columns]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(999)
)
y = prices['breakout_signal']

# Train/test split, stratified on y so both parts keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=1,
                                                    stratify=y)

# Scaling: fit the StandardScaler on the training data only, then apply the
# same fitted transform to both the training and the testing features
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Predict

In [None]:
# Model
classifier = LogisticRegression(solver='lbfgs', random_state=1)
# Fit on the standardized training features; the scaled arrays were prepared
# for this model and the raw features differ in magnitude by orders of magnitude
classifier.fit(X_train_scaled, y_train)
# Predict on the test features transformed with the same fitted scaler
predictions = classifier.predict(X_test_scaled)
# Evaluate
print(classification_report(y_test, predictions))

### Although our model has a high global accuracy, it's clear that it cannot predict breakouts for Disney

# Are we using the wrong model? Let's try XGBoost

In [None]:
# Encode the training labels with LabelEncoder before handing them to XGBoost
le = LabelEncoder()
y_train = le.fit_transform(y_train)

# Model
classifier = xgb.XGBClassifier() ## notice different classifier
# Fit
classifier.fit(X_train, y_train)
# Predict once; both names are kept so code that reads either one still works
predictions = classifier.predict(X_test)
y_pred = predictions
# Evaluate
print(classification_report(y_test, y_pred))

### We noticed a marked improvement simply by changing the classifier to XGBoost. (Presenter note: this is where you talk about the performance history of XGBoost in open data science competitions on Kaggle.)

# We know we have an imbalanced-class issue. Will under- or over-sampling help improve the XGBoost results?

## Create X and Y Over and Under Sampled Versions

In [None]:
# Build both resamplers up front, each with a fixed seed
random_oversampler = RandomOverSampler(random_state=1)
rus = RandomUnderSampler(random_state=1)

# Oversampled version of the training set
X_over, y_over = random_oversampler.fit_resample(X_train, y_train)

# Undersampled version of the training set
X_under, y_under = rus.fit_resample(X_train, y_train)

## Oversampling Result

In [None]:
# Model
classifier = xgb.XGBClassifier()
# Fit on the oversampled training set; y_train is not used in this cell, so it
# is not re-encoded here
classifier.fit(X_over, y_over)
# Predict once on the original test set; both names are kept for compatibility
predictions = classifier.predict(X_test)
y_pred = predictions
# Evaluate
print(classification_report(y_test, y_pred))

## Undersampling Result

In [None]:
# Model
classifier = xgb.XGBClassifier()
# Fit on the undersampled training set; y_train is not used in this cell, so it
# is not re-encoded here
classifier.fit(X_under, y_under)
# Predict once on the original test set; both names are kept for compatibility
predictions = classifier.predict(X_test)
y_pred = predictions
# Evaluate
print(classification_report(y_test, y_pred))

#### After testing different combinations of sampling and classifiers, here are the results: 
###### 1. XGB Over
###### 2. XGB
###### 3. XGB Under
###### 4. LogisticRegression

# Let's try a decision tree!

In [None]:
# Model -- seeded like the other stochastic steps in this notebook so that
# re-running the cell reproduces the same tree and the same report
classifier = DecisionTreeClassifier(random_state=1)
# Fit on the oversampled training set
classifier.fit(X_over, y_over)
# Predict once on the original test set; both names are kept for compatibility
predictions = classifier.predict(X_test)
y_pred = predictions
# Evaluate
print(classification_report(y_test, y_pred))

# Let's try a support vector classifier!

In [None]:
# Model
classifier = SVC()
# Fit on the oversampled training set, standardized with the scaler that was
# fitted on X_train: the raw features differ in magnitude by orders of
# magnitude, and the SVC is sensitive to feature scale
X_over_scaled = X_scaler.transform(X_over)
classifier.fit(X_over_scaled, y_over)
# Predict once on the test features transformed with the same scaler;
# both names are kept for compatibility
predictions = classifier.predict(X_test_scaled)
y_pred = predictions
# Evaluate
print(classification_report(y_test, y_pred))