### Importing required libraries.

The data is collected from yahoo finance on a daily basis which can be altered depending on the requirement . 

In [1]:
#Installing Yahoo Finance
#!pip install yfinance
#!pip install pandas_datareader
#!pip install scikeras
#!pip install imbalanced-learn
#!pip install xgboost

In [2]:
import pandas as pd
import talib as ta
import time, datetime
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import alpaca_trade_api as tradeapi
import os
import yfinance as yf
import pandas_datareader.data as pdr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.optimizers import SGD
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from itertools import product
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

2022-11-30 21:52:39.960790: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Importing Data from Yahoo Finance.

In [3]:
#Collecting desired ticker from user
ticker = input("Please enter your desired ticker")
start_time = datetime.datetime(1980,1,1)
end_time = datetime.datetime.now().date()

Please enter your desired ticker SPY


In [4]:
def get_data(ticker):
    df = pdr.get_data_yahoo(ticker, start=start_time, end=end_time)         
    return df

In [5]:
def compute_ema(df):
    df['EMA50'] = df['Close'].ewm(span=50, adjust=False).mean()
    #Setting adjust to False to specify that recursive calculation mode is required.
    df['EMA100'] = df['Close'].ewm(span=100, adjust=False).mean()
    #df['EMA150'] = EMA(df['Close'].values, timeperiod=150)
   # df['EMA200'] = EMA(df['Close'].values, timeperiod=200)
    return df


In [6]:
def computeMACD(df, n_fast, n_slow, n_smooth):
    data = df['Close']
    fastEMA = data.ewm(span=n_fast, min_periods=n_slow).mean()
    slowEMA = data.ewm(span=n_slow, min_periods=n_slow).mean()
    MACD = pd.Series(fastEMA-slowEMA, name ='MACD')
    df = df.join(MACD)
    return df

In [7]:
def compute_rsi_bb(df):
    df['upperBB'], df['middleBB'], df['lowerBB'] = ta.BBANDS(df['Close'].values, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
        # we will normalize RSI
    df['RSI'] = ta.RSI(df['Close'].values, timeperiod=14)
    df['nor_RSI'] = ta.RSI(df['Close'].values, timeperiod=14)/100.0
    return df
    

### Creating the feature variables.

In [8]:
def calculated_features(df):
    df['aboveEMA50'] = np.where(df['Close'] > df['EMA50'], 1, 0)
    df['aboveEMA100'] = np.where(df['Close'] > df['EMA100'], 1, 0)
    df['aboveupperBB'] = np.where(df['Close'] > df['upperBB'], 1, 0)
    df['belowlowerBB'] = np.where(df['Close'] < df['lowerBB'], 1, 0)
    df['oversoldRSI'] = np.where(df['nor_RSI'] < 0.30, 1, 0)
    df['overboughtRSI'] = np.where(df['nor_RSI'] > 0.70, 1, 0)
    return df

In [9]:
# create a new column in the DF based on the conditions
def defined_conditions(df):
    # List of conditions
    condition = [(df['aboveEMA50'] == df['aboveEMA100']) & (df['aboveEMA50'] ==1) & (df['MACD']>0.5),
                 (df['aboveEMA50'] == df['aboveEMA100']) & (df['aboveEMA50'] ==0) & (df['MACD']<0.5),
                 (df['aboveEMA50'] != df['aboveEMA100'])]
    # List of values to return
    choice  = [1,0,2]

    df['Buy/Sell'] = np.select(condition, choice)
    
    return df
#Considering it as a buy if the price is over 50 ema and 100 ema.
#Considering it as a Sell if the price is under 50 ema and 100 ema. 
#Further analysis is required if the price is over 50 ema and under ema 100 or vice versa.

# create a new column in the DF based on the conditions
def defined_conditions(df):
    # List of conditions
    condition = [(df['aboveupperBB'] == 0) &(df['oversoldRSI']==1),
                 (df['aboveupperBB'] == 1) & (df['overboughtRSI']==1)]
    # List of values to return
    choice  = [1,0]

    df['Buy/Sell'] = np.select(condition, choice)
    
    return df

#NDAQ
def defined_conditions(df):
    # List of conditions
    condition = [(df['aboveEMA50'] == df['aboveEMA100']) & (df['aboveEMA50'] ==1) & (df['belowlowerBB'] == 0),
                 (df['aboveEMA50'] == df['aboveEMA100']) & (df['aboveEMA50'] ==0) & (df['aboveupperBB'] == 1)]
    # List of values to return
    choice  = [1,0]

    df['Buy/Sell'] = np.select(condition, choice)
    
    return df

### Creating DF for Indices.

In [10]:


#Calling the get_data function definition. 
df = get_data(ticker)
df = df.reset_index()
df = df.set_index('Date')
#Calculating EMA 50 and 100
df = compute_ema(df)
# Computing RSI and Bolinger Bands
df = compute_rsi_bb(df)
#Calculating if the signal is to buy or sell.
df = calculated_features(df)
#Computing the MACD indicator
df =computeMACD(df, 12, 26, 9)
# Rules for placind a trade.
df = defined_conditions(df)


In [11]:
#df.tail(10)
df = df.dropna()
dataMACD = df[['MACD']].to_numpy()
minmaxscaler = MinMaxScaler()
df['norm_MACD'] = minmaxscaler.fit_transform(dataMACD)
df.describe()

Unnamed: 0,High,Low,Open,Close,Volume,Adj Close,EMA50,EMA100,upperBB,middleBB,...,nor_RSI,aboveEMA50,aboveEMA100,aboveupperBB,belowlowerBB,oversoldRSI,overboughtRSI,MACD,Buy/Sell,norm_MACD
count,7489.0,7489.0,7489.0,7489.0,7489.0,7489.0,7489.0,7489.0,7489.0,7489.0,...,7489.0,7489.0,7489.0,7489.0,7489.0,7489.0,7489.0,7489.0,7489.0,7489.0
mean,162.632282,160.569753,161.655044,161.655216,84840910.0,134.965753,160.533135,159.362678,166.426875,161.210244,...,0.542411,0.684471,0.717452,0.053545,0.053812,0.015356,0.067032,0.31888,0.648818,0.728055
std,97.132332,96.021127,96.606055,96.61388,93929180.0,102.81326,95.741841,94.786778,99.644558,96.274012,...,0.111669,0.464757,0.450268,0.225133,0.225662,0.122972,0.250093,2.154964,0.649251,0.065086
min,43.875,43.28125,43.34375,43.40625,5200.0,25.145924,44.281551,44.144006,44.814379,44.167187,...,0.167009,0.0,0.0,0.0,0.0,0.0,0.0,-23.786753,0.0,0.0
25%,104.875,103.150002,103.90625,104.050003,9339700.0,71.314568,104.038576,103.020707,107.999051,104.035156,...,0.462702,0.0,0.0,0.0,0.0,0.0,0.0,-0.340062,0.0,0.708153
50%,131.25,129.550003,130.5,130.46875,61659900.0,94.020088,130.038912,128.890409,134.63111,130.582499,...,0.54894,1.0,1.0,0.0,0.0,0.0,0.0,0.452521,1.0,0.732092
75%,205.979996,203.910004,204.710007,204.910004,118939000.0,179.224457,203.205243,202.360367,210.414113,204.621,...,0.624392,1.0,1.0,0.0,0.0,0.0,0.0,1.333886,1.0,0.758711
max,479.980011,476.059998,479.220001,477.709991,871026300.0,472.24231,465.032744,456.76181,490.640946,469.996501,...,0.870309,1.0,1.0,1.0,1.0,1.0,1.0,9.322867,2.0,1.0


### Seperating Labels and Features from the Data frame and splitting the data

In [47]:
df = df.dropna()
X = df.drop(columns =['Buy/Sell'])
#Reviewing features Data Frame.
X.head()

Index(['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close', 'EMA50',
       'EMA100', 'upperBB', 'middleBB', 'lowerBB', 'RSI', 'nor_RSI',
       'aboveEMA50', 'aboveEMA100', 'aboveupperBB', 'belowlowerBB',
       'oversoldRSI', 'overboughtRSI', 'MACD', 'norm_MACD'],
      dtype='object')

In [13]:
y = df['Buy/Sell']
#Reviewing Label.
y.head(5)
y.value_counts()
## Found class Imbalances in the data set. 
# Suggested to use SMOTE Upsampling to balane the class.


1    3409
0    3355
2     725
Name: Buy/Sell, dtype: int64

In [14]:
# Split the data into training and testing datasets
# Assign the function a random_state equal to 1
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state =1,stratify = y)

### Passing the Data to NN to establish a baseline accuracy.

#### Normalising the data using Standard Scaler

In [15]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Scale both the training and testing data from the features dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# encoding class labels as integers
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)

#### Adding Layers to Neural Network

In [20]:
def create_model():
	# create model
	model = Sequential()
	model.add(Dropout(0.2, input_shape=(21,)))
	model.add(Dense(10, activation='relu', kernel_constraint=MaxNorm(3)))
	model.add(Dense(5, activation='relu', kernel_constraint=MaxNorm(3)))
	model.add(Dense(1, activation='sigmoid'))
	# Compile model
	sgd = SGD(learning_rate=0.1, momentum=0.9)
	model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
	return model
 

In [21]:
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(model=create_model, epochs=30, batch_size=1, verbose=1)))
pipeline = Pipeline(estimators)

In [22]:
kfold = StratifiedKFold(n_splits=5, shuffle=True)

In [23]:
results = cross_val_score(pipeline, X, y, cv=kfold)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 2

In [24]:
print("Visible: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

### Additional tune up and Learning rate and Epochs needed. 

Visible: 44.80% (0.01%)


### Creating Ensemble

In [52]:
from sklearn.metrics import log_loss
clf1 = LogisticRegression(random_state=1)
#clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
clf2 = XGBClassifier()
eclf = VotingClassifier(estimators=[('lr', clf1), ('xgb', clf2), ('gnb', clf3)],voting='hard')


eclf.fit(X_train, y_train)
 
# predicting the output on the test dataset
pred_final = eclf.predict(X_test)
 
# printing log loss between actual and predicted value
print("The accuracy of the model in percentage is",(accuracy_score(y_test, pred_final)*100))

The accuracy of the model in percentage is 85.42445274959958


In [27]:
for clf, label in zip([clf1, clf2, clf3,eclf], ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, X_test, y_test, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.45 (+/- 0.00) [Logistic Regression]
Accuracy: 1.00 (+/- 0.00) [Random Forest]
Accuracy: 0.50 (+/- 0.02) [Naive Bayes]
Accuracy: 0.85 (+/- 0.01) [Ensemble]


In [30]:
from sklearn.metrics import classification_report
testing_signal_predictions = eclf.predict(X_test)
 # Evaluate the model's ability to predict the trading signal for the testing data
ensemble_classification_report = classification_report(y_test, testing_signal_predictions)
print(ensemble_classification_report)

              precision    recall  f1-score   support

           0       0.75      1.00      0.86       839
           1       1.00      0.89      0.94       853
           2       0.00      0.00      0.00       181

    accuracy                           0.85      1873
   macro avg       0.58      0.63      0.60      1873
weighted avg       0.79      0.85      0.81      1873



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [95]:
 # Create a new empty predictions DataFrame using code provided below.
predictions_df = pd.DataFrame(index=X_test.index)
predictions_df["predicted_signal"] = testing_signal_predictions
predictions_df["actual_returns"] = df["Close"].pct_change()
predictions_df["trading_algorithm_returns"] = predictions_df["actual_returns"] * predictions_df["predicted_signal"]
predictions_df.tail(20)

Unnamed: 0_level_0,predicted_signal,actual_returns,trading_algorithm_returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-02-16,1,-0.000851,-0.000851
2022-04-29,0,-0.036956,-0.0
2002-06-11,0,-0.017158,-0.0
2008-03-19,0,-0.02477,-0.0
2017-06-16,1,-0.004636,-0.004636
2009-02-05,0,0.014881,0.0
2021-03-29,1,-0.000505,-0.000505
2014-03-24,1,-0.004135,-0.004135
1995-10-12,0,0.006188,0.0
2013-05-20,1,-6e-05,-6e-05


In [42]:
import hvplot.pandas
(((1 + predictions_df[["actual_returns"]]).cumprod()).hvplot(label="Actual Returns", title = ('Cumulative Product Returns of Actual vs Trading Algorithm Returns'))) * (((1 + predictions_df[["trading_algorithm_returns"]]).cumprod()).hvplot(label="Trading Algorithm Returns"))

def predict_timeseries(df):
    for i in range(len(df)):
        prediction = eclf.predict(X_test)
        #####print('prediction', prediction)
        model_df['Buy'][i] = prediction
    print(df.head())    
        
    return df

In [167]:
from matplotlib.axis import Axis
from matplotlib.widgets import Slider, Button, RadioButtons 
def plot_prediction(df, ticker):
    plt.title('Backtesting of the model')
    plt.plot(df.index, df['Close'], label='Close', alpha=0.2)
    plt.plot(df.index, df['EMA50'], label='EMA50', alpha=0.2)
    plt.plot(df.index, df['EMA100'], label='EMA100', alpha=0.2)
    plt.plot(df.index, df['upperBB'], label='Upper BB', alpha=0.3)
    plt.plot(df.index, df['lowerBB'], label='Lower BB', alpha=0.3)
    plt.plot(df.index, df['RSI'], label='RSI', alpha=0.2)
    plt.plot(df.index, df['MACD'], label='MACD', alpha=0.2)


    plt.scatter(df.index, df['Buy/Sell']*df['Close'], label='Buy', marker='^', alpha=0.5)
#    plt.scatter(df.index, df['Buy/Sell']*df['Close'], label='Sell', marker='v', alpha=0.5)
    plt.legend()
    plt.show()

    return None  

In [168]:
##Creating a new DF

In [171]:
plot_prediction(df, ticker)

In [170]:
def plot_prediction(df, ticker):
    df_plot = df.copy()
    close_plot =df_plot.hvplot(x='Date',y='Close', label='Close', alpha=0.2)
    ema50_plot = df_plot.hvplot(x='Date',y= 'EMA50', label='EMA50', alpha=0.2)
    ema100_plot = df_plot.hvplot(x='Date',y='EMA100', label='EMA100', alpha=0.2)
    upperbb_plot =df_plot.hvplot(x='Date',y='upperBB', label='Upper BB', alpha=0.3)
    lowerbb_plot=df_plot.hvplot(x='Date',y='lowerBB', label='Lower BB', alpha=0.3)
    rsi_plot=df_plot.hvplot(x='Date',y='RSI', label='RSI', alpha=0.2)
    macd_plot =df_plot.hvplot(x='Date',y='MACD', label='MACD', alpha=0.2)
    scatter_plot = df_plot.hvplot.scatter(x='Date', y=['Buy/Sell','Close'], label='Buy', color ='black',marker='^', alpha=0.5)
    a = (close_plot*ema50_plot *ema100_plot*upperbb_plot*lowerbb_plot*rsi_plot*macd_plot*scatter_plot)

    return a  
## Calculate Boolean Indexing in HVPLOT.