### Importing required libraries.

The data is collected from yahoo finance on a daily basis which can be altered depending on the requirement . 

In [1]:
#Installing Yahoo Finance
#!pip install yfinance
#!pip install pandas_datareader
#!pip install scikeras
#!pip install imbalanced-learn
#!pip install xgboost
#!pip install TA-Lib

In [126]:
import pandas as pd
import talib as ta
import time, datetime
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import alpaca_trade_api as tradeapi
from dotenv import load_dotenv
import os
import yfinance as yf
import pandas_datareader.data as pdr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.optimizers import SGD
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from itertools import product
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [127]:
# Load .env enviroment variables
load_dotenv()

True

In [159]:
# Set Alpaca API key and secret
alpaca_api_key = os.getenv("ALPACA_API_KEY")
alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")

# Create the Alpaca API object
api = tradeapi.REST(
    alpaca_api_key,
    alpaca_secret_key,
    api_version = "v2"
)

In [182]:
# Format current date as ISO format
start_date = pd.Timestamp("2014-01-01", tz="America/New_York").isoformat()
end_date = pd.Timestamp("2022-11-29", tz="America/New_York").isoformat()

# Set the tickers
ticker = ["SPY"]

# Set timeframe to "1Day" for Alpaca API
timeframe = "4Hour"

# Get current closing prices for SPY and AGG
df = api.get_bars(
    ticker,
    timeframe,
    start=start_date,
    end=end_date
).df

# Preview DataFrame
df.columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Trade_count', 'vwap', 'Symbol']
df.drop(["Trade_count", "vwap", "Symbol"], inplace = True, axis = 1)
df.head()


Unnamed: 0_level_0,Open,High,Low,Close,Volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-12-01 08:00:00+00:00,209.43,209.65,209.07,209.44,129269
2015-12-01 12:00:00+00:00,209.45,210.39,209.11,209.89,33764235
2015-12-01 16:00:00+00:00,209.9,210.2,209.47,210.045,35820667
2015-12-01 20:00:00+00:00,210.05,210.82,210.025,210.5,27992916
2015-12-02 00:00:00+00:00,210.51,210.63,210.49,210.62,80281


### Importing Data from Yahoo Finance.

#Collecting desired ticker from user
ticker = input("Please enter your desired ticker")
start_time = datetime.datetime(1980,1,1)
end_time = datetime.datetime.now().date()

datetime.datetime.strptime(
        my_obj['dttm_utc'],
        '%Y-%m-%s %h:%m:%s'
    ).isoformat()

def get_data(ticker):
    df = pdr.get_data_yahoo(ticker, start=start_time, end=end_time)         
    return df

In [183]:
def compute_ema(df):
    df['EMA50'] = df['Close'].ewm(span=50, adjust=False).mean()
    #Setting adjust to False to specify that recursive calculation mode is required.
    df['EMA100'] = df['Close'].ewm(span=100, adjust=False).mean()
    #df['EMA150'] = EMA(df['Close'].values, timeperiod=150)
   # df['EMA200'] = EMA(df['Close'].values, timeperiod=200)
    return df


In [184]:
def computeMACD(df, n_fast, n_slow, n_smooth):
    data = df['Close']
    fastEMA = data.ewm(span=n_fast, min_periods=n_slow).mean()
    slowEMA = data.ewm(span=n_slow, min_periods=n_slow).mean()
    MACD = pd.Series(fastEMA-slowEMA, name ='MACD')
    df = df.join(MACD)
    return df

In [185]:
def compute_rsi_bb(df):
    df['upperBB'], df['middleBB'], df['lowerBB'] = ta.BBANDS(df['Close'].values, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
        # we will normalize RSI
    df['RSI'] = ta.RSI(df['Close'].values, timeperiod=14)
    df['nor_RSI'] = ta.RSI(df['Close'].values, timeperiod=14)/100.0
    return df
    

### Creating the feature variables.

In [186]:
def calculated_features(df):
    df['aboveEMA50'] = np.where(df['Close'] > df['EMA50'], 1, 0)
    df['aboveEMA100'] = np.where(df['Close'] > df['EMA100'], 1, 0)
    df['aboveupperBB'] = np.where(df['Close'] > df['upperBB'], 1, 0)
    df['belowlowerBB'] = np.where(df['Close'] < df['lowerBB'], 1, 0)
    df['oversoldRSI'] = np.where(df['nor_RSI'] < 0.30, 1, 0)
    df['overboughtRSI'] = np.where(df['nor_RSI'] > 0.70, 1, 0)
    return df

In [187]:
# create a new column in the DF based on the conditions
def defined_conditions(df):
    # List of conditions
    condition = [(df['aboveEMA50'] == df['aboveEMA100']) & (df['aboveEMA50'] ==1) & (df['MACD']>0.5),
                 (df['aboveEMA50'] == df['aboveEMA100']) & (df['aboveEMA50'] ==0) & (df['MACD']<0.5),
                 (df['aboveEMA50'] != df['aboveEMA100'])]
    # List of values to return
    choice  = [1,0,2]

    df['Buy/Sell'] = np.select(condition, choice)
    
    return df
#Considering it as a buy if the price is over 50 ema and 100 ema.
#Considering it as a Sell if the price is under 50 ema and 100 ema. 
#Further analysis is required if the price is over 50 ema and under ema 100 or vice versa.


# create a new column in the DF based on the conditions
def defined_conditions(df):
    # List of conditions
    condition = [(df['aboveupperBB'] == 0) &(df['oversoldRSI']==1),
                 (df['aboveupperBB'] == 1) & (df['overboughtRSI']==1)]
    # List of values to return
    choice  = [1,0]

    df['Buy/Sell'] = np.select(condition, choice)
    
    return df

#NDAQ
def defined_conditions(df):
    # List of conditions
    condition = [(df['aboveEMA50'] == df['aboveEMA100']) & (df['aboveEMA50'] ==1) & (df['belowlowerBB'] == 0),
                 (df['aboveEMA50'] == df['aboveEMA100']) & (df['aboveEMA50'] ==0) & (df['aboveupperBB'] == 1)]
    # List of values to return
    choice  = [1,0]

    df['Buy/Sell'] = np.select(condition, choice)
    
    return df

### Creating DF for multiple stocks and Indices.

In [188]:


#Calling the get_data function definition. 
#df = get_data(ticker)
df = df.reset_index()
df = df.set_index('timestamp')
#Calculating EMA 50 and 100
df = compute_ema(df)
# Computing RSI and Bolinger Bands
df = compute_rsi_bb(df)
#Calculating if the signal is to buy or sell.
df = calculated_features(df)
#Computing the MACD indicator
df =computeMACD(df, 12, 26, 9)
# Rules for placind a trade.
df = defined_conditions(df)


for ticker in tickers:
    t_df = get_data(ticker)
    t_df = compute_ema(t_df)
    t_df = compute_rsi_bb(t_df)
    t_df = calculated_features(t_df)
    t_df = computeMACD(t_df, 12, 26, 9)
    t_df = defined_conditions(t_df)
    df = df.append(t_df, ignore_index=True)

In [189]:
#df.tail(10)
df = df.dropna()
dataMACD = df[['MACD']].to_numpy()
minmaxscaler = MinMaxScaler()
df['norm_MACD'] = minmaxscaler.fit_transform(dataMACD)
df.describe()

Unnamed: 0,Open,High,Low,Close,Volume,EMA50,EMA100,upperBB,middleBB,lowerBB,...,nor_RSI,aboveEMA50,aboveEMA100,aboveupperBB,belowlowerBB,oversoldRSI,overboughtRSI,MACD,Buy/Sell,norm_MACD
count,7988.0,7988.0,7988.0,7988.0,7988.0,7988.0,7988.0,7988.0,7988.0,7988.0,...,7988.0,7988.0,7988.0,7988.0,7988.0,7988.0,7988.0,7988.0,7988.0,7988.0
mean,313.907423,314.881304,312.873787,313.923055,19374030.0,313.34899,312.795283,318.470726,313.695362,308.919998,...,0.536321,0.660616,0.684402,0.061467,0.054206,0.040436,0.085503,0.166389,0.627191,0.662494
std,80.715025,80.954413,80.454328,80.71527,21708790.0,80.579671,80.483552,82.269876,80.68384,79.258058,...,0.126006,0.47353,0.464783,0.2402,0.226439,0.196991,0.279647,1.880827,0.652045,0.103592
min,181.31,182.79,180.91,181.33,1.0,186.922251,189.096363,187.470893,184.593,180.008422,...,0.100053,0.0,0.0,0.0,0.0,0.0,0.0,-11.861964,0.0,0.0
25%,249.077525,249.82875,247.6225,249.03,341806.0,249.596491,250.294034,252.662135,248.643379,245.398989,...,0.449881,0.0,0.0,0.0,0.0,0.0,0.0,-0.443503,0.0,0.628902
50%,290.24,291.145,289.36,290.175,15483390.0,289.955914,288.80072,294.501974,289.920725,286.638866,...,0.546635,1.0,1.0,0.0,0.0,0.0,0.0,0.397388,1.0,0.675216
75%,386.41,388.15,384.85,386.7175,29955770.0,385.157946,385.898672,393.068886,385.342663,377.899914,...,0.629952,1.0,1.0,0.0,0.0,0.0,0.0,1.120072,1.0,0.71502
max,479.4,479.98,478.69,479.41,224715900.0,474.892415,471.658426,482.909488,477.2475,475.666537,...,0.878286,1.0,1.0,1.0,1.0,1.0,1.0,6.294216,2.0,1.0


### Seperating Labels and Features from the Data frame and splitting the data

In [190]:
df = df.dropna()
X = df.drop(columns =['Buy/Sell'])
#Reviewing features Data Frame.
X.head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,EMA50,EMA100,upperBB,middleBB,lowerBB,RSI,nor_RSI,aboveEMA50,aboveEMA100,aboveupperBB,belowlowerBB,oversoldRSI,overboughtRSI,MACD,norm_MACD
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2015-12-08 08:00:00+00:00,207.96,207.96,207.29,207.38,116140,208.885729,209.124368,211.092561,208.312755,205.532949,42.88345,0.428835,0,0,0,0,0,0,-0.152043,0.644955
2015-12-08 12:00:00+00:00,207.37,208.14,205.78,207.21,39118172,208.820014,209.086459,210.752722,208.143255,205.533788,42.168565,0.421686,0,0,0,0,0,0,-0.217537,0.641348
2015-12-08 16:00:00+00:00,207.215,208.04,206.48,207.1472,39984002,208.754414,209.048058,210.723107,208.081115,205.439123,41.890747,0.418907,0,0,0,0,0,0,-0.270559,0.638427
2015-12-08 20:00:00+00:00,207.14,208.289,206.55,207.03,24001074,208.68679,209.008097,210.641768,207.988615,205.335462,41.343316,0.413433,0,0,0,0,0,0,-0.317231,0.635857
2015-12-09 00:00:00+00:00,207.0,207.25,206.93,207.2,152084,208.628484,208.972293,210.538337,207.901615,205.264893,42.51675,0.425167,0,0,0,0,0,0,-0.337638,0.634733


In [191]:
y = df['Buy/Sell']
#Reviewing Label.
y.head(5)
y.value_counts()
## Found class Imbalances in the data set. 
# Suggested to use SMOTE Upsampling to balane the class.


0    3742
1    3482
2     764
Name: Buy/Sell, dtype: int64

In [192]:
# Split the data into training and testing datasets
# Assign the function a random_state equal to 1
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state =1,stratify = y)

### Passing the Data to NN to establish a baseline accuracy.

#### Normalising the data using Standard Scaler

In [193]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Scale both the training and testing data from the features dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# encoding class labels as integers
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)

#### Adding Layers to Neural Network

In [201]:
def create_model():
	# create model
	model = Sequential()
	model.add(Dropout(0.2, input_shape=(20,)))
	model.add(Dense(11, activation='relu', kernel_constraint=MaxNorm(3)))
	model.add(Dense(6, activation='relu', kernel_constraint=MaxNorm(3)))
	model.add(Dense(1, activation='sigmoid'))
	# Compile model
	sgd = SGD(learning_rate=0.08, momentum=0.9)
	model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
	return model
 

In [202]:
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(model=create_model, epochs=30, batch_size=5, verbose=1)))
pipeline = Pipeline(estimators)

In [203]:
kfold = StratifiedKFold(n_splits=5, shuffle=True)

In [204]:
results = cross_val_score(pipeline, X, y, cv=kfold)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 2

In [198]:
print("Visible: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

### Additional tune up and Learning rate and Epochs needed. 

Visible: nan% (nan%)


### Creating Ensemble

In [199]:
from sklearn.metrics import log_loss
clf1 = LogisticRegression(random_state=1)
#clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
clf2 = XGBClassifier()
eclf = VotingClassifier(estimators=[('lr', clf1), ('xgb', clf2), ('gnb', clf3)],voting='hard')


eclf.fit(X_train, y_train)
 
# predicting the output on the test dataset
pred_final = eclf.predict(X_test)
 
# printing log loss between actual and predicted value
print(accuracy_score(y_test, pred_final))

0.8382573860791187


In [200]:
for clf, label in zip([clf1, clf2, clf3,eclf], ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, X_test, y_test, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.47 (+/- 0.00) [Logistic Regression]
Accuracy: 1.00 (+/- 0.00) [Random Forest]
Accuracy: 0.50 (+/- 0.02) [Naive Bayes]
Accuracy: 0.84 (+/- 0.01) [Ensemble]



-> Back TEsting
-> Add few more condition for the rest of the indices