### Importing required libraries.

The data is collected from yahoo finance on a daily basis which can be altered depending on the requirement . 

In [1]:
#Installing Yahoo Finance
!pip install yfinance
!pip install pandas_datareader
!pip install scikeras
!pip install imbalanced-learn
!pip install xgboost



In [2]:
import pandas as pd
import time, datetime
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import alpaca_trade_api as tradeapi
import os
import yfinance as yf
import pandas_datareader.data as pdr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.optimizers import SGD
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from itertools import product
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

### Importing Data from Yahoo Finance.

In [3]:
#Collecting desired ticker from user
ticker = input("Please enter your desired ticker")
start_time = datetime.datetime(1980,1,1)
end_time = datetime.datetime.now().date()

Please enter your desired ticker SPY


In [4]:
def get_data(ticker):
    df = pdr.get_data_yahoo(ticker, start=start_time, end=end_time)         
    return df

In [5]:
def compute_ema(df):
    df['EMA50'] = df['Close'].ewm(span=50, adjust=False).mean()
    #Setting adjust to False to specify that recursive calculation mode is required.
    df['EMA100'] = df['Close'].ewm(span=100, adjust=False).mean()
    #df['EMA150'] = EMA(df['Close'].values, timeperiod=150)
   # df['EMA200'] = EMA(df['Close'].values, timeperiod=200)
    return df


In [6]:
#Computing the MACD indicator
#df =computeMACD(df, 12, 26, 9)
#The values can be altered along with the model. 
#The values are normalised between -1 to 1 which is not suitable for the current model
#As we are looking for the intersection of price and MACD.

### Creating the feature variables.

In [7]:
def calculated_features(df):
    df['aboveEMA50'] = np.where(df['Close'] > df['EMA50'], 1, 0)
    df['aboveEMA100'] = np.where(df['Close'] > df['EMA100'], 1, 0)
    return df

In [8]:
# create a new column in the DF based on the conditions
def defined_conditions(df):
    # List of conditions
    condition = [(df['aboveEMA50'] == df['aboveEMA100']) & (df['aboveEMA50'] ==1),
                 (df['aboveEMA50'] == df['aboveEMA100']) & (df['aboveEMA50'] ==0),
                 (df['aboveEMA50'] != df['aboveEMA100'])]
    # List of values to return
    choice  = [1,0,2]

    df['Buy/Sell'] = np.select(condition, choice, "ERROR")
    
    return df
#Considering it as a buy if the price is over 50 ema and 100 ema.
#Considering it as a Sell if the price is under 50 ema and 100 ema. 
#Further analysis is required if the price is over 50 ema and under ema 100 or vice versa.

### Creating DF for multiple stocks and Indices.

In [9]:
# Requesting the stock Data from the user. 
#tickers = ['AAPL','GOOG','AMD','AMZN']

In [10]:


#Calling the get_data function definition. 
df = get_data(ticker)
df = df.reset_index()
df = df.set_index('Date')
#Calculating EMA 50 and 100
df = compute_ema(df)
#Calculating if the signal is to buy or sell.
df = calculated_features(df)
# Rules for placind a trade.
df = defined_conditions(df)


### Seperating Labels and Features from the Data frame and splitting the data

In [11]:
X = df.drop(columns =['Buy/Sell'])
#Reviewing features Data Frame.
X.head(5)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,EMA50,EMA100,aboveEMA50,aboveEMA100
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1993-01-29,43.96875,43.75,43.96875,43.9375,1003200.0,25.33403,43.9375,43.9375,0,0
1993-02-01,44.25,43.96875,43.96875,44.25,480500.0,25.514206,43.949755,43.943688,1,1
1993-02-02,44.375,44.125,44.21875,44.34375,201300.0,25.568258,43.965206,43.95161,1,1
1993-02-03,44.84375,44.375,44.40625,44.8125,529400.0,25.838543,43.998433,43.968657,1,1
1993-02-04,45.09375,44.46875,44.96875,45.0,531500.0,25.946659,44.03771,43.98908,1,1


In [12]:
y = df['Buy/Sell']
#Reviewing Label.
y.head(5)
#y.value_counts()
## Found class Imbalances in the data set. 
# Suggested to use SMOTE Upsampling to balane the class.


Date
1993-01-29    0
1993-02-01    1
1993-02-02    1
1993-02-03    1
1993-02-04    1
Name: Buy/Sell, dtype: object

In [13]:
# Split the data into training and testing datasets
# Assign the function a random_state equal to 1
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state =1)

### Passing the Data to NN to establish a baseline accuracy.

#### Normalising the data using Standard Scaler

In [14]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Scale both the training and testing data from the features dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# encoding class labels as integers
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)

#### Adding Layers to Neural Network

In [15]:
def create_model():
	# create model
	model = Sequential()
	model.add(Dropout(0.2, input_shape=(10,)))
	model.add(Dense(10, activation='relu', kernel_constraint=MaxNorm(3)))
	model.add(Dense(5, activation='relu', kernel_constraint=MaxNorm(3)))
	model.add(Dense(1, activation='sigmoid'))
	# Compile model
	sgd = SGD(learning_rate=0.1, momentum=0.9)
	model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
	return model
 

In [16]:
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(model=create_model, epochs=100, batch_size=16, verbose=0)))
pipeline = Pipeline(estimators)

In [17]:
kfold = StratifiedKFold(n_splits=5, shuffle=True)

In [18]:
results = cross_val_score(pipeline, X, y, cv=kfold)

In [19]:
print("Visible: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

### Additional tune up and Learning rate and Epochs needed. 

Visible: 25.08% (0.02%)


### Creating Ensemble

In [20]:
from sklearn.metrics import log_loss
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=500, random_state=1)
clf3 = GaussianNB()
#clf4 = XGBClassifier()
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],voting='hard')


eclf.fit(X_train, y_train)
 
# predicting the output on the test dataset
pred_final = eclf.predict(X_test)
 
# printing log loss between actual and predicted value
print(accuracy_score(y_test, pred_final))

0.7342917997870074


In [21]:
for clf, label in zip([clf1, clf2, clf3,eclf], ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=9)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.65 (+/- 0.00) [Logistic Regression]
Accuracy: 1.00 (+/- 0.00) [Random Forest]
Accuracy: 0.68 (+/- 0.01) [Naive Bayes]
Accuracy: 0.72 (+/- 0.01) [Ensemble]


In [22]:
## Models to be tuned up for better accuracy. 