In [36]:
import yfinance as yf 
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,TimeSeriesSplit
from sklearn.metrics import accuracy_score,f1_score,make_scorer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier,BaggingClassifier,AdaBoostClassifier,StackingClassifier

# Objectives

The main objective of this project is to study the impact of applying Ensemble methods to predicting price movement in the stock market. Over the past two decades, many researchers have shifted their attention to using Machine Learning algorithms to predict the stock market. This has become more common as advances in Deep Learning techniques such as RNNs have shown considerable success in predicting the stock market. However, Ensemble methods have also gained a reputation in recent years due to their success in improving model accuracy in various sectors such as health, agriculture, energy and finance. Besides that, they are regularly adopted by Machine Learning teams in Kaggle competitions, where they often achieve impressive results. For example, the top two teams in the Netflix competition also utilised Ensemble methods to improve their models. Hence, in this project, we will look at how Ensemble methods can improve the accuracy of stock market prediction.

### Project Framework

The project is separated into the following parts:
- Feature Construction
- Data Pre-processing
- Strategy for cross validation
- Building of Machine Learning and Ensemble methods
- Comparing the accuracy and error metrics of models

### Feature Construction

In analysing the stock market, the two most common methods are fundamental analysis and technical analysis. Fundamental analysts use financial statements and industry trends to see if an asset is under- or overvalued. Technical analysis, by contrast, uses historical market data, primarily price and volume, to forecast the direction of the market price.

For this project, we will adopt the strategy of using 9 different technical indicators from technical analysis to construct our feature input to Machine Learning models and Ensemble method.

### Fetch Dataset

The Yahoo finance API provides financial information such as financial news, price data and financial reports of a stock. In this project, we will retrieve the opening price, closing price and volume for a certain period.

Instead of making individual stock predictions, we will use the Nifty 50 and S&P 500 for stock index prediction, as each of these stock indices consists of a list of top companies in its market. Besides that, using stock indices from different continents will also help us build a more robust model.

In [7]:
# Date window passed as `start` / `end` to the price-history downloads below.
start_date = "2000-01-01"
end_date = "2021-01-31"

In [8]:
# Download price history for both markets over the configured date window.
# NOTE(review): "SPY" is presumably the SPDR S&P 500 ETF, used here as a proxy
# for the index itself (whose Yahoo ticker would be "^GSPC") - confirm this is intended.
sp500 = yf.download("SPY",start= start_date,end=end_date)
nifty50 = yf.download("^NSEI",start= start_date,end=end_date)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


### Simple Moving Average (SMA) - 10 days

- Simple moving average is a technical analysis tools that smooths out price data by creating a constantly updated average price
- However, simple moving averages don't make prediction about the future value of a stock, it simply reveal the trend of the stock over a period of time

Formula for Simple Moving Averages:<br><br>
$SMA = \frac {C _{1} + C _{2} + ... + C _{n}}{n}$<br><br>
where:<br> 
$C$ = The closing price of a stock<br>
$n$ = the number of total periods



In [5]:
def SMA(data, period=10):
    """Simple moving average of the closing price.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing a 'Close' column.
    period : int, default 10
        Number of consecutive rows averaged in each window. The default
        reproduces the 10-day average used throughout this notebook.

    Returns
    -------
    pandas.Series
        Rolling mean of 'Close'. Rows whose window is not yet full
        (the first ``period - 1`` rows) are NaN.
    """
    return data['Close'].rolling(window=period).mean()

### Weighted Moving Average (WMA) - 10 days

- Weighted moving average is very similar to SMA as it also smooths out the price data but WMA gives more weight to the recent price and gradually less as we look back in time
- For example the closing price of stock Z for the past 3 days are 20, 30 and 50 usd. Since it is a three-day period, the most recent day(50 usd) will multiply with the largest weight(3) and the second day(30 usd) will multiply with the second largest weight(2) and so on
- The sum of the value after multiplying with the weight will be divide by the sum of the periods which is 1+2+3 = 6

Formula for Weighted Moving Average:<br><br>
$WMA = \frac {C_{1}W_{1} + C_{2}W_{2} + ... + C_{n}W_{n}}{W_{n} + W_{n-1} + ... + 1}$<br><br>
where:<br>
$C$ = The closing price of a stock<br>
$W$ = The weighting factor<br>
$n$ = Number of periods in the weighting group

In [9]:
def WMA(data, period=10):
    """Linearly weighted moving average of the closing price.

    The most recent close in each window gets weight ``period``, the one
    before it ``period - 1``, and so on down to weight 1 for the oldest.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing a 'Close' column.
    period : int, default 10
        Number of rows in each window (and therefore the largest weight).

    Returns
    -------
    pandas.Series
        Weighted average of 'Close'; NaN until a full window is available.
    """
    weights = np.arange(1, period + 1)
    weight_total = weights.sum()
    # The lambda argument must be the name used in its body; the earlier
    # mismatch ("price" vs "prices") raised NameError on the first window.
    return data['Close'].rolling(period).apply(
        lambda prices: np.dot(prices, weights) / weight_total, raw=True
    )

### Relative Strength Index (RSI)

- RSI measures the extent of recent price changes to determine overbought or oversold conditions in an instruments price.
- It is one of the most trusted indicators for anyone planning to use oscillators to determine buy and sell points

The formula for RSI:<br><br>
$RS = \frac{Avg.Gain}{Avg.Loss}$ <br><br>
$RSI = 100 - \frac {100} {1+RS}$



In [10]:
def RSI(data,period):
    """Relative Strength Index of the closing price, rounded to whole numbers.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing a 'Close' column.
    period : int
        Number of rows over which gains and losses are averaged.

    Returns
    -------
    pandas.Series
        Series named 'rsi'. Rows without a full window of price changes are NaN.
    """
    change = data['Close'].diff()

    # Split each change into its gain part and its (positive) loss part.
    gains = change.clip(lower=0)
    losses = change.clip(upper=0).abs()

    mean_gain = gains.rolling(window=period).mean()
    mean_loss = losses.rolling(window=period).mean()

    relative_strength = mean_gain / mean_loss
    index_value = 100 - (100 / (1 + relative_strength))
    return index_value.round().rename('rsi')
    

### Momentum 

- Momentum is a simple technical indicator used in the financial field
- It simply calculate the rate of change of the closing price in a certain period
- For example, a ten-day momentum is calculated by subtracting the closing price nine trading days earlier (the 10th day of the window) from the most recent closing price<br><br>

The formula for Momentum:

$mom = C_{t} - C_{t-9}$<br>
where:<br>
$C$ = The closing price<br>
$t$ = The current time period (trading day)

In [11]:
def mom(data):
    """Momentum: today's close minus the close nine rows earlier.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing a 'Close' column.

    Returns
    -------
    pandas.Series
        Difference ``Close[t] - Close[t-9]``; the first nine rows are NaN.
    """
    return data['Close'].diff(periods=9)

### Williams %R

- William %R is a momentum indicator developed by Larry Williams that moves between 0 and -100
- It is used to measure overbought and oversold levels to allow traders to find entry and exit point in the stock market<br><br>

The formula for William %R:

$WilliamsR = \frac {H_{n} - C_{t}}{H_{n} - L_{n}} * -100$<br><br>
where:<br>
$H_{n}$ = The highest high over the last $n$ periods<br>
$L_{n}$ = The lowest low over the last $n$ periods<br>
$C_{t}$ = The closing price at time t


In [17]:
def williamR(data):
    """Williams %R over a 14-row look-back window.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing 'High', 'Low' and 'Close' columns.

    Returns
    -------
    pandas.Series
        -100 * (highest high - close) / (highest high - lowest low);
        NaN until 14 rows are available.
    """
    lookback = 14
    highest_high = data['High'].rolling(window=lookback).max()
    lowest_low = data['Low'].rolling(window=lookback).min()

    gap_from_top = highest_high - data['Close']
    trading_range = highest_high - lowest_low
    return (gap_from_top / trading_range) * -100
    

### Commodity Channel Index (CCI)
- CCI is a momentum-based oscillator that developed by Donald Lambert
- It is used to examine the price trend direction and strength to allow traders to decide the entry and exit point in the stock market
- CCI was originally developed to spot long-term trend changes, however traders have been use it on all different timeframes.<br><br>

The formula for CCI:<br>
$CCI = \frac {TP - MA} {0.015 * MD}$<br><br>
where:<br>
TP = Typical Price<br>
Typical Price = $(High + Low + Close) \div 3$<br>
MA = Moving Average<br>
Moving Average = $(\sum \limits _{i=1} ^{P} Typical Price_{i}) \div P $<br>
MD = Mean Deviation<br>
Mean Deviation = $ (\sum \limits _{i=1} ^{P} |Typical Price_{i} - MA|) \div P $<br>
P = Number of periods


In [19]:
def CCI(data, period=12):
    """Commodity Channel Index.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing 'High', 'Low' and 'Close' columns.
        NOTE: this function writes the helper columns 'tp', 'sma_tp',
        'mad' and 'CCI' into ``data`` itself.
    period : int, default 12
        Number of rows in the moving-average / mean-deviation window.

    Returns
    -------
    pandas.Series
        The 'CCI' column; NaN until a full window is available.
    """
    data['tp'] = (data['High'] + data['Low'] + data['Close']) / 3
    data['sma_tp'] = data['tp'].rolling(window=period).mean()
    # Mean absolute deviation around the window mean. Computed with numpy
    # because Series.mad() no longer exists in pandas 2.x.
    data['mad'] = data['tp'].rolling(window=period).apply(
        lambda window: np.abs(window - window.mean()).mean(), raw=True
    )
    data['CCI'] = (data['tp'] - data['sma_tp']) / (0.015 * data['mad'])
    return data['CCI']

### Moving Average Convergence Divergence (MACD)

- Moving average convergence divergence is also another famous trend-based momentum indicator that shows the relationship between two different period of moving averages of a security's price
- It is calculated by subtracting the 26-period exponential moving average from the 12-period exponential moving average
- It is often used to determine whether the market is in bullish or bearish




The formula for MACD:<br><br>
$MACD = EMA _{12period} - EMA _{26period}$

In [21]:
def MACD(data,period1,period2,period3):
    """MACD line and its signal line, both from exponential moving averages.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing a 'Close' column.
    period1 : int
        Span of the first EMA (the one subtracted from).
    period2 : int
        Span of the second EMA (the one that is subtracted).
    period3 : int
        Span of the EMA applied to the MACD line to form the signal line.

    Returns
    -------
    tuple of pandas.Series
        ``(macd, signal)``. Each EMA is NaN until it has seen as many
        observations as its span.
    """
    def _ema(series, span):
        # Recursive (adjust=False) EMA, hidden until `span` observations exist.
        return series.ewm(span=span, adjust=False, min_periods=span).mean()

    closing = data['Close']
    macd_line = _ema(closing, period1) - _ema(closing, period2)
    signal_line = _ema(macd_line, period3)

    return macd_line, signal_line

### Stochastic Oscillator

 - Stochastic Oscillator measures the momentum of price movement 
 - Stochastic Oscillator is range-bound, which means it is always between 0 and 100 and this can be used to identify overbought and oversold levels
 - There are two different stochastic oscillator, namely %K and %D where %D is the 3-day simple moving average of %K

### Stochastic %K
The formula for Stochastic K:<br><br>

$ K = \frac {(C - L _{period})} {(H _{period} - L _{period})} * 100$ <br><br>

where: <br>
$ C $ = The most recent closing price  <br>
$ L _{period} $ = The lowest price traded of the trading period  <br>
$ H _{period} $ = The highest price traded of the trading period  <br>

In [24]:
def stochastic_k(data,period):
    """Stochastic %K: where the close sits inside the recent high-low range.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing 'High', 'Low' and 'Close' columns.
    period : int
        Number of rows used for the rolling highest high / lowest low.

    Returns
    -------
    pandas.Series
        Series named 'percentage_k'; NaN until a full window is available.
    """
    range_top = data['High'].rolling(period).max()
    range_bottom = data['Low'].rolling(period).min()
    k_line = (data['Close'] - range_bottom) * 100 / (range_top - range_bottom)

    return k_line.rename('percentage_k')

### Stochastic %D
The formula for Stochastic D:<br><br>

$ D = \frac {K _{1} + K _{2} .... + K _{n}} {n}$
<br><br>
where:<br>
$ K $ = Fast Stochastic indicator <br>
$ n $ = Trading period

In [23]:
def stochastic_d(data,period1,period2):
    """Stochastic %D: simple moving average of Stochastic %K.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing 'High', 'Low' and 'Close' columns.
    period1 : int
        Window used for the highest high / lowest low behind %K.
    period2 : int
        Window of the moving average applied to %K.

    Returns
    -------
    pandas.Series
        Series named 'percen_d'; NaN until both windows are full.
    """
    range_top = data['High'].rolling(period1).max()
    range_bottom = data['Low'].rolling(period1).min()
    k_line = (data['Close'] - range_bottom) * 100 / (range_top - range_bottom)
    d_line = k_line.rolling(window=period2).mean()

    return d_line.rename('percen_d')

### Data Pre-processing

Research by Pajan shows that an alternative approach to using technical indicators does improve the model. This approach is to transform the continuous values obtained from technical indicators into discrete form. In general, every technical indicator has its own way of interpreting the market. For example, with RSI, traders often sell their stock when the RSI value reaches 70 or above and buy when the RSI value falls to 30 or below. In this part, we will first transform each technical indicator that we constructed into discrete form.

### Simple Moving Average 10-days (SMA)

SMA and WMA are both used to smooth out the price data by constantly updating the average price. Here, if the SMA or WMA is higher than the current price, the trend is 'down' and is denoted as '-1'. If the SMA or WMA is lower than the current price, the trend is 'up' and is denoted as '+1'.

In [25]:
#Discrete 
def SMA_discrete(data,period,column='Close'):
    """Discrete trend signal from a simple moving average.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing ``column``.
        NOTE: the helper columns 'sma' and 'sma_discrete' are written
        into ``data`` itself.
    period : int
        Number of rows in the averaging window.
    column : str, default 'Close'
        Price column that is averaged and that the average is compared with.

    Returns
    -------
    pandas.Series
        -1 where the moving average is above the price (down trend),
        otherwise +1. Rows whose window is not yet full have a NaN average,
        fail the comparison and therefore also come out as +1.
    """
    prices = data[column]
    data['sma'] = prices.rolling(window=period).mean()
    # Compare against the same column that was averaged; comparing against a
    # hard-coded 'Close' mixed two price series whenever `column` was overridden.
    data['sma_discrete'] = np.where(data['sma'] > prices, -1, 1)
    return data['sma_discrete']

### Weighted Moving Average 10-days (WMA)

In [27]:
def WMA_discrete(data,period,column="Close"):
    """Discrete trend signal from a linearly weighted moving average.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing ``column``.
        NOTE: the helper columns 'wma' and 'wma_discrete' are written
        into ``data`` itself.
    period : int
        Number of rows in each window; the newest row gets weight ``period``
        and the oldest weight 1.
    column : str, default "Close"
        Price column that is averaged and that the average is compared with.

    Returns
    -------
    pandas.Series
        -1 where the weighted average is above the price (down trend),
        otherwise +1. Rows whose window is not yet full have a NaN average,
        fail the comparison and therefore also come out as +1.
    """
    weights = np.arange(1, period + 1)
    weight_total = weights.sum()
    prices = data[column]
    # The lambda argument must match the name used in its body; the earlier
    # mismatch raised NameError. `period` and `column` are now honoured
    # instead of being silently replaced by 10 and 'Close'.
    data['wma'] = prices.rolling(period).apply(
        lambda window: np.dot(window, weights) / weight_total, raw=True
    )
    data['wma_discrete'] = np.where(data['wma'] > prices, -1, 1)
    return data['wma_discrete']

### Relative Strength Index (RSI)

For RSI, if the value of RSI is higher than 70, the stock is overbought and we denote it as '-1'; if the value of RSI is lower than 30, the stock is oversold and we denote it as '+1'. When the value of RSI is between 30 and 70, we denote it as '+1' if the value of RSI at time 't' is higher than at time 't-1', and as '-1' otherwise.

In [28]:
def cond_rsi(data):
    """Map one row's RSI reading to a discrete -1 / +1 signal.

    Parameters
    ----------
    data : row-like object
        Must expose ``rsi`` (current value) and ``previous`` (prior value).

    Returns
    -------
    int
        -1 when ``rsi`` is 70 or above (overbought);
        +1 when ``rsi`` is 30 or below (oversold);
        strictly between 30 and 70: +1 if ``rsi`` rose versus ``previous``,
        otherwise -1.
        A NaN ``rsi`` fails every comparison and falls through to +1.
    """
    rsi = data.rsi
    # Exactly 70 used to fall through to the oversold branch and return +1;
    # the overbought side now owns its boundary.
    if rsi >= 70:
        return -1
    if 30 < rsi < 70:
        return 1 if rsi > data.previous else -1
    return 1




def rsi_discrete(data,period):
    """Discrete -1 / +1 signal derived from the rounded RSI of 'Close'.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing a 'Close' column.
    period : int
        Number of rows over which gains and losses are averaged.

    Returns
    -------
    pandas.Series
        Series named 'discrete_rsi' holding the result of ``cond_rsi``
        applied to every row's current and previous RSI.
    """
    change = data['Close'].diff()

    # Gains keep positive moves; losses keep the size of negative moves.
    gains = change.clip(lower=0)
    losses = change.clip(upper=0).abs()

    mean_gain = gains.rolling(window=period).mean()
    mean_loss = losses.rolling(window=period).mean()

    relative_strength = mean_gain / mean_loss
    rsi_line = (100 - (100 / (1 + relative_strength))).round()

    frame = pd.DataFrame({'rsi': rsi_line, 'previous': rsi_line.shift(1)})
    frame['discrete_rsi'] = frame.apply(cond_rsi, axis=1)
    return frame['discrete_rsi']

### Momentum

Momentum measures the change in the stock's closing price over a period. A positive momentum value indicates an 'up' trend, which we denote as '+1', and a negative momentum value indicates a 'down' trend, which we denote as '-1'.

In [29]:
def mom_discrete(data):
    """Discrete trend signal from 9-row momentum of the closing price.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing a 'Close' column.
        NOTE: the helper columns 'close_9', 'momentum' and
        'discrete_momentum' are written into ``data`` itself.

    Returns
    -------
    pandas.Series
        +1 where momentum is positive, otherwise -1. The first nine rows
        have NaN momentum, fail the comparison and come out as -1.
    """
    data['close_9'] = data['Close'].shift(9)
    data['momentum'] = data['Close'] - data['close_9']
    # A down trend is encoded as -1 (not 0) so this feature uses the same
    # -1 / +1 alphabet as every other discrete indicator in the notebook.
    data['discrete_momentum'] = np.where(data['momentum'] > 0, 1, -1)
    return data['discrete_momentum']
    

### Commodity Channel Index (CCI)

In this part, we set 200 as the overbought level and -200 as the oversold level. Hence, if the value of CCI is higher than 200, we denote it as '-1', and if it is lower than -200, we denote it as '+1'. When the value of CCI is between -200 and 200, the trend is 'up' and we denote it as '+1' if the value of CCI at time 't' is higher than at time 't-1'; otherwise the trend is 'down' and we denote it as '-1'.

In [31]:
def cond_CCI(row):
    """Map one row's CCI reading to a discrete -1 / +1 signal.

    ``row`` must expose ``CCI`` (current value) and ``dif`` (prior value).
    Above 200 returns -1; inside [-200, 200] returns +1 when CCI rose versus
    ``dif`` and -1 otherwise; everything else (below -200, or NaN) returns +1.
    """
    reading = row.CCI
    inside_band = -200 <= reading <= 200
    if not inside_band:
        # Outside the band: overbought above 200, otherwise treated as oversold.
        return -1 if reading > 200 else 1
    return 1 if reading > row.dif else -1


def CCI_discrete(data, period=9):
    """Discrete -1 / +1 signal derived from the Commodity Channel Index.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing 'High', 'Low' and 'Close' columns.
        It is only read; intermediates live in a local frame.
    period : int, default 9
        Number of rows in the moving-average / mean-deviation window.
        NOTE(review): the continuous ``CCI`` helper in this notebook uses a
        12-row window - confirm the difference is intentional.

    Returns
    -------
    pandas.Series
        Series named 'CCI_discrete' holding the result of ``cond_CCI``
        applied to every row's current and previous CCI.
    """
    df = pd.DataFrame()
    df['tp'] = (data['High'] + data['Low'] + data['Close']) / 3
    df['sma_tp'] = df['tp'].rolling(window=period).mean()
    # Mean absolute deviation around the window mean. Computed with numpy
    # because Series.mad() no longer exists in pandas 2.x.
    df['mad'] = df['tp'].rolling(window=period).apply(
        lambda window: np.abs(window - window.mean()).mean(), raw=True
    )
    df['CCI'] = (df['tp'] - df['sma_tp']) / (0.015 * df['mad'])
    df['dif'] = df['CCI'].shift(1)
    df['CCI_discrete'] = df.apply(cond_CCI, axis=1)

    return df['CCI_discrete']

### Moving Average Convergence Divergence (MACD)

MACD follows the trend of the stock price, which means that if the value of MACD goes up then the stock price is also rising. Hence, in this project, if the value of MACD at time 't' is greater than its value at time 't-1', the trend is 'up' and we denote it as '+1'; otherwise the trend is 'down' and we denote it as '-1'.

In [32]:
def cond_MACD(data):
    """Return +1 when the row's ``macd`` exceeds its ``dif`` (prior value), else -1.

    A NaN on either side fails the comparison and yields -1.
    """
    return 1 if data.macd > data.dif else -1
    

def MACD_discrete(data, fast_period=12, slow_period=26):
    """Discrete -1 / +1 signal from the direction of the MACD line.

    MACD is the fast EMA of 'Close' minus the slow EMA. The previous
    implementation computed EMA(26) - EMA(9), i.e. slow minus fast with the
    wrong fast span, which flips the sign of the line and therefore inverts
    every rising/falling signal.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing a 'Close' column.
    fast_period : int, default 12
        Span of the fast EMA.
    slow_period : int, default 26
        Span of the slow EMA.

    Returns
    -------
    pandas.Series
        Series named 'discrete_macd' holding the result of ``cond_MACD``
        applied to every row's current and previous MACD value.
    """
    closing = data['Close']
    fast_ema = closing.ewm(span=fast_period, adjust=False, min_periods=fast_period).mean()
    slow_ema = closing.ewm(span=slow_period, adjust=False, min_periods=slow_period).mean()

    df = pd.DataFrame()
    df['macd'] = fast_ema - slow_ema
    df['dif'] = df['macd'].shift(1)
    df['discrete_macd'] = df.apply(cond_MACD, axis=1)

    return df['discrete_macd']

Stochastic %K, %D and Williams %R are all stochastic oscillators: each expresses the closing price relative to the recent high-low range as a percentage (0 to 100 for %K and %D, 0 to -100 for Williams %R). Hence, we use the same interpretation for all three technical indicators: when the indicator at time 't' is higher than its value at time 't-1', the trend is 'up' and we denote it as '+1'; otherwise the trend is 'down' and we denote it as '-1'.

### Stochastic Oscillator

In [33]:
#Stochastic %K
def cond_sto_k(data):
    """Return +1 when the row's ``percen_k`` exceeds its ``previous`` value, else -1.

    A NaN on either side fails the comparison and yields -1.
    """
    return 1 if data.percen_k > data.previous else -1

def discrete_sto_k(data,period):
    """Discrete -1 / +1 signal from the direction of Stochastic %K.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing 'High', 'Low' and 'Close' columns.
    period : int
        Number of rows used for the rolling highest high / lowest low.

    Returns
    -------
    pandas.Series
        Series named 'discrete_k' holding the result of ``cond_sto_k``
        applied to every row's current and previous %K.
    """
    range_top = data['High'].rolling(period).max()
    range_bottom = data['Low'].rolling(period).min()
    k_line = (data['Close'] - range_bottom) * 100 / (range_top - range_bottom)

    frame = pd.DataFrame({'percen_k': k_line, 'previous': k_line.shift(1)})
    frame['discrete_k'] = frame.apply(cond_sto_k, axis=1)

    return frame['discrete_k']

In [34]:
#Stochastic %D
def cond_sto_d(data):
    """Return +1 when the row's ``percen_d`` exceeds its ``previous`` value, else -1.

    A NaN on either side fails the comparison and yields -1.
    """
    return 1 if data.percen_d > data.previous else -1


def discrete_sto_d(data,period1,period2):
    """Discrete -1 / +1 signal from the direction of Stochastic %D.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing 'High', 'Low' and 'Close' columns.
    period1 : int
        Window used for the highest high / lowest low behind %K.
    period2 : int
        Window of the moving average that turns %K into %D.

    Returns
    -------
    pandas.Series
        Series named 'discrete_d' holding the result of ``cond_sto_d``
        applied to every row's current and previous %D.
    """
    range_top = data['High'].rolling(period1).max()
    range_bottom = data['Low'].rolling(period1).min()
    k_line = (data['Close'] - range_bottom) * 100 / (range_top - range_bottom)
    d_line = k_line.rolling(window=period2).mean()

    frame = pd.DataFrame({'percen_d': d_line, 'previous': d_line.shift(1)})
    frame['discrete_d'] = frame.apply(cond_sto_d, axis=1)

    return frame['discrete_d']

### Williams %R

In [None]:
def cond_wr(data):
    """Return +1 when the row's ``wr`` exceeds its ``previous`` value, else -1.

    A NaN on either side fails the comparison and yields -1.
    """
    return 1 if data.wr > data.previous else -1



def discrete_wr(data,period):
    """Discrete -1 / +1 signal from the direction of Williams %R.

    Parameters
    ----------
    data : pandas.DataFrame
        Price history containing 'High', 'Low' and 'Close' columns.
    period : int
        Number of rows used for the rolling highest high / lowest low.

    Returns
    -------
    pandas.Series
        Series named 'discrete_wr' holding the result of ``cond_wr``
        applied to every row's current and previous %R.
    """
    range_top = data['High'].rolling(window=period).max()
    range_bottom = data['Low'].rolling(window=period).min()
    wr_line = -100 * ((range_top - data['Close']) / (range_top - range_bottom))

    frame = pd.DataFrame({'wr': wr_line, 'previous': wr_line.shift(1)})
    frame['discrete_wr'] = frame.apply(cond_wr, axis=1)

    return frame['discrete_wr']