In [1]:
#%pip install yahoo_fin

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import alpaca_trade_api as tradeapi
from yahoo_fin.stock_info import get_data

%matplotlib inline

In [3]:
hist=get_data('DIS', index_as_date=False)

hist.head()

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,1970-03-25,0.734737,0.757216,0.734737,0.750151,0.497616,2627471,DIS
1,1970-03-26,0.750151,0.75272,0.740517,0.75272,0.499319,2082514,DIS
2,1970-03-30,0.754005,0.765565,0.754005,0.762996,0.506136,1109377,DIS
3,1970-03-31,0.762996,0.764923,0.750151,0.757858,0.502728,1868424,DIS
4,1970-04-01,0.757858,0.77841,0.747582,0.775841,0.514657,2627471,DIS


In [4]:
# Remove the adjusted close column and rename our dataframe as "prices"

prices = hist.drop(['adjclose'], axis=1)
prices.head()


Unnamed: 0,date,open,high,low,close,volume,ticker
0,1970-03-25,0.734737,0.757216,0.734737,0.750151,2627471,DIS
1,1970-03-26,0.750151,0.75272,0.740517,0.75272,2082514,DIS
2,1970-03-30,0.754005,0.765565,0.754005,0.762996,1109377,DIS
3,1970-03-31,0.762996,0.764923,0.750151,0.757858,1868424,DIS
4,1970-04-01,0.757858,0.77841,0.747582,0.775841,2627471,DIS


In [5]:
# Add difference between closing price and opening price
# NOTE: O-to-C is the length of the candle's body

prices['O-to-C'] = prices['close'] - prices['open']

# Add 20-Day moving average for Open-to-Close column 

prices['OC-20D-Mean'] = prices['O-to-C'].rolling(20).mean()

# Calculate the % change of the current day's O-to-C relative to the moving average

prices['OC-%-from-20D-Mean'] = 100*(prices['O-to-C'] - prices['OC-20D-Mean'])/prices['OC-20D-Mean']

# Get the maximum OC compared to the recent 10 candles (including the current candle)

prices['MaxOC_Prev10'] = prices['O-to-C'].rolling(10).max()

In [6]:
# Add 20-Day moving average for volume 

prices['Volume-20D-Mean'] = prices['volume'].rolling(20).mean()

# Calculate the % change of the current volume relative to the moving average

prices['Volume-%-from-20D-Mean'] = 100*(prices['volume'] - prices['Volume-20D-Mean'])/prices['Volume-20D-Mean']

In [7]:
# Print the columns for easy copy-pasting
prices.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'ticker', 'O-to-C',
       'OC-20D-Mean', 'OC-%-from-20D-Mean', 'MaxOC_Prev10', 'Volume-20D-Mean',
       'Volume-%-from-20D-Mean'],
      dtype='object')

In [8]:
# Rearrange the columns for our dataframe

prices = prices[['ticker', 'date', 'open', 'high', 'low', 'close', 
                 'O-to-C', 'OC-20D-Mean', 'volume', 'Volume-20D-Mean', 
                 'MaxOC_Prev10', 'Volume-%-from-20D-Mean', 'OC-%-from-20D-Mean', 
                ]]

# Show the 10 most recent rows

prices.tail(10)

Unnamed: 0,ticker,date,open,high,low,close,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean
13562,DIS,2024-01-02,90.099998,91.480003,89.730003,90.709999,0.610001,0.092499,10587600,11012255.0,1.290001,-3.856204,559.466102
13563,DIS,2024-01-03,90.230003,92.080002,90.0,91.650002,1.419998,0.157999,11929800,10978605.0,1.419998,8.664079,798.738486
13564,DIS,2024-01-04,91.940002,91.940002,90.0,90.559998,-1.380005,0.129999,12087400,10965125.0,1.419998,10.234949,-1161.552186
13565,DIS,2024-01-05,90.410004,91.32,90.360001,90.900002,0.489998,0.134999,9084400,10896245.0,1.419998,-16.62816,262.964868
13566,DIS,2024-01-08,91.550003,91.940002,91.0,91.550003,0.0,0.107499,11103700,10938195.0,1.419998,1.513092,-100.0
13567,DIS,2024-01-09,91.050003,91.099998,89.599998,89.669998,-1.380005,0.000499,11255100,11026190.0,1.419998,2.076057,-276886.534047
13568,DIS,2024-01-10,89.82,89.82,88.879997,89.290001,-0.529999,0.000499,15091600,11240220.0,1.419998,34.264276,-106320.183486
13569,DIS,2024-01-11,89.389999,89.779999,88.690002,89.449997,0.059998,0.055999,9642200,11061410.0,1.419998,-12.830281,7.140424
13570,DIS,2024-01-12,89.650002,90.830002,89.650002,90.349998,0.699997,0.007999,11912800,11148390.0,1.419998,6.856685,8651.430752
13571,DIS,2024-01-16,90.290001,93.089996,89.455002,93.050003,2.760002,0.109499,16747813,11363815.65,2.760002,47.378429,2420.58221


In [9]:
# Remove rows with null values

prices = prices.dropna()
prices.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13551 entries, 19 to 13571
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   ticker                  13551 non-null  object        
 1   date                    13551 non-null  datetime64[ns]
 2   open                    13551 non-null  float64       
 3   high                    13551 non-null  float64       
 4   low                     13551 non-null  float64       
 5   close                   13551 non-null  float64       
 6   O-to-C                  13551 non-null  float64       
 7   OC-20D-Mean             13551 non-null  float64       
 8   volume                  13551 non-null  int64         
 9   Volume-20D-Mean         13551 non-null  float64       
 10  MaxOC_Prev10            13551 non-null  float64       
 11  Volume-%-from-20D-Mean  13551 non-null  float64       
 12  OC-%-from-20D-Mean      13551 non-null  float64   

In [10]:
#Isolating green candles
prices['O-to-C'] >= 0.0

19       False
20       False
21       False
22       False
23        True
         ...  
13567    False
13568    False
13569     True
13570     True
13571     True
Name: O-to-C, Length: 13551, dtype: bool

In [11]:
#Separating bodies that are longest in 10 days
prices['O-to-C'] == prices['MaxOC_Prev10']

19       False
20       False
21       False
22       False
23        True
         ...  
13567    False
13568    False
13569    False
13570    False
13571     True
Length: 13551, dtype: bool

In [12]:
#Seeing if body is at 100% longer than average of previous 20 candles
prices['OC-%-from-20D-Mean'] >= 100.0

19        True
20        True
21        True
22        True
23       False
         ...  
13567    False
13568    False
13569    False
13570     True
13571     True
Name: OC-%-from-20D-Mean, Length: 13551, dtype: bool

In [13]:
#Seeing if volume is at least 50% higher than average of previous 20 candles
prices['Volume-%-from-20D-Mean'] >= 50.0

19        True
20        True
21        True
22        True
23        True
         ...  
13567    False
13568    False
13569    False
13570    False
13571    False
Name: Volume-%-from-20D-Mean, Length: 13551, dtype: bool

In [14]:
#Making a separate dataset for breakouts

condition = (prices['O-to-C'] >= 0.0) & (prices['O-to-C'] == prices['MaxOC_Prev10']) & (prices['OC-%-from-20D-Mean'] >= 100.0) & (prices['Volume-%-from-20D-Mean'] >= 50.0) 

breakouts = prices[condition]

breakouts

Unnamed: 0,ticker,date,open,high,low,close,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean
160,DIS,1970-11-09,0.657667,0.692991,0.657667,0.685926,0.028259,0.003821,8875013,2111708.15,0.028259,320.276495,639.483478
169,DIS,1970-11-20,0.681430,0.719323,0.681430,0.714827,0.033397,0.005106,5644197,2952498.80,0.033397,91.166784,554.073741
237,DIS,1971-03-02,0.964663,1.008337,0.964663,1.001914,0.037251,0.005523,6198885,3335428.30,0.037251,85.849745,574.421290
270,DIS,1971-04-19,1.082838,1.127796,1.082838,1.114951,0.032113,0.001349,3396249,1900050.65,0.032113,78.745182,2280.949435
275,DIS,1971-04-26,1.172753,1.226702,1.172753,1.226702,0.053949,0.005523,3483832,1909295.50,0.053949,82.466884,876.744537
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12833,DIS,2021-02-08,183.850006,190.639999,183.300003,190.000000,6.149994,0.135499,16085700,9846580.00,6.149994,63.363320,4438.780750
12842,DIS,2021-02-22,181.740005,194.020004,181.529999,191.759995,10.019989,0.724001,18799600,12451720.00,10.019989,50.979945,1283.975333
12852,DIS,2021-03-08,197.309998,203.020004,193.789993,201.910004,4.600006,0.321000,25093200,14514370.00,4.600006,72.885217,1333.024117
12939,DIS,2021-07-12,177.710007,184.990005,177.369995,184.380005,6.669998,0.356001,21835500,9597300.00,6.669998,127.517114,1773.588246


In [15]:
prices['NewColumn']=np.where(condition, 1,0)

In [44]:
filtered_df=prices[prices['NewColumn']==1]
filtered_df.head()

Unnamed: 0,ticker,date,open,high,low,close,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean,NewColumn
160,DIS,1970-11-09,0.657667,0.692991,0.657667,0.685926,0.028259,0.003821,8875013,2111708.15,0.028259,320.276495,639.483478,1
169,DIS,1970-11-20,0.68143,0.719323,0.68143,0.714827,0.033397,0.005106,5644197,2952498.8,0.033397,91.166784,554.073741,1
237,DIS,1971-03-02,0.964663,1.008337,0.964663,1.001914,0.037251,0.005523,6198885,3335428.3,0.037251,85.849745,574.42129,1
270,DIS,1971-04-19,1.082838,1.127796,1.082838,1.114951,0.032113,0.001349,3396249,1900050.65,0.032113,78.745182,2280.949435,1
275,DIS,1971-04-26,1.172753,1.226702,1.172753,1.226702,0.053949,0.005523,3483832,1909295.5,0.053949,82.466884,876.744537,1


In [45]:
#Using isna fuction to find "infinity" values
filtered_df.isna()

Unnamed: 0,ticker,date,open,high,low,close,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean,NewColumn
160,False,False,False,False,False,False,False,False,False,False,False,False,False,False
169,False,False,False,False,False,False,False,False,False,False,False,False,False,False
237,False,False,False,False,False,False,False,False,False,False,False,False,False,False
270,False,False,False,False,False,False,False,False,False,False,False,False,False,False
275,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12833,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12842,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12852,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12939,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [46]:
import pandas as pd
import numpy as np

def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)

In [18]:
#Making variables for X and y
X=filtered_df[['O-to-C','OC-20D-Mean','volume','Volume-20D-Mean','MaxOC_Prev10','Volume-%-from-20D-Mean','OC-%-from-20D-Mean']]
y=filtered_df['NewColumn']

In [47]:
clean_dataset(X)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


Unnamed: 0,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean
160,0.028259,0.003821,8875013.0,2111708.15,0.028259,320.276495,639.483478
169,0.033397,0.005106,5644197.0,2952498.80,0.033397,91.166784,554.073741
237,0.037251,0.005523,6198885.0,3335428.30,0.037251,85.849745,574.421290
270,0.032113,0.001349,3396249.0,1900050.65,0.032113,78.745182,2280.949435
275,0.053949,0.005523,3483832.0,1909295.50,0.053949,82.466884,876.744537
...,...,...,...,...,...,...,...
12833,6.149994,0.135499,16085700.0,9846580.00,6.149994,63.363320,4438.780750
12842,10.019989,0.724001,18799600.0,12451720.00,10.019989,50.979945,1283.975333
12852,4.600006,0.321000,25093200.0,14514370.00,4.600006,72.885217,1333.024117
12939,6.669998,0.356001,21835500.0,9597300.00,6.669998,127.517114,1773.588246


In [48]:
#Using isna fuction to find "infinity" values
X.isna()

Unnamed: 0,O-to-C,OC-20D-Mean,volume,Volume-20D-Mean,MaxOC_Prev10,Volume-%-from-20D-Mean,OC-%-from-20D-Mean
160,False,False,False,False,False,False,False
169,False,False,False,False,False,False,False
237,False,False,False,False,False,False,False
270,False,False,False,False,False,False,False
275,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
12833,False,False,False,False,False,False,False
12842,False,False,False,False,False,False,False
12852,False,False,False,False,False,False,False
12939,False,False,False,False,False,False,False


In [49]:
#Using isnan function to find "infinity" values
np.isnan(X.any())

O-to-C                    False
OC-20D-Mean               False
volume                    False
Volume-20D-Mean           False
MaxOC_Prev10              False
Volume-%-from-20D-Mean    False
OC-%-from-20D-Mean        False
dtype: bool

In [50]:
#Using isfinite fuction to find "infinity" values
np.isfinite(X.all())

O-to-C                    True
OC-20D-Mean               True
volume                    True
Volume-20D-Mean           True
MaxOC_Prev10              True
Volume-%-from-20D-Mean    True
OC-%-from-20D-Mean        True
dtype: bool

In [88]:
y.value_counts()

NewColumn
1    202
Name: count, dtype: int64

In [51]:
#Still trying to remove infinite values
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(999, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(999, inplace=True)


In [52]:
#Importing library to split training and testing data
from sklearn.model_selection import train_test_split

In [53]:
# Create training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [58]:
#Importing SVM
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

In [84]:
import numpy as np
unique_labels = np.unique(y)
print("Unique Labels:", unique_labels)

Unique Labels: [1]


In [60]:
#Defining the classifier
classifier = xgb.XGBClassifier()


In [85]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [86]:
classifier.fit(X_train,y_train)

In [87]:
X_scaler = StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [81]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [83]:

y_pred = classifier.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00      51.0

    accuracy                           0.00      51.0
   macro avg       0.00      0.00      0.00      51.0
weighted avg       0.00      0.00      0.00      51.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
xgb_acc = accuracy_score(y_test, y_pred)
xgb_acc

NameError: name 'accuracy_score' is not defined