In [2]:
import yfinance as yf 
import pandas as pd
from scipy.stats import skew
import talib
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score,f1_score,make_scorer
from sklearn.svm import SVC

In [3]:
start_date = "2000-01-01"
end_date = "2021-01-31"

In [4]:
aapl = yf.download("AAPL",start=start_date,end = end_date)

[*********************100%***********************]  1 of 1 completed


In [5]:
aapl.head(26)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03,0.936384,1.004464,0.907924,0.999442,0.855796,535796800
2000-01-04,0.966518,0.987723,0.90346,0.915179,0.783644,512377600
2000-01-05,0.926339,0.987165,0.919643,0.928571,0.795111,778321600
2000-01-06,0.947545,0.955357,0.848214,0.848214,0.726304,767972800
2000-01-07,0.861607,0.901786,0.852679,0.888393,0.760708,460734400
2000-01-10,0.910714,0.912946,0.845982,0.872768,0.747329,505064000
2000-01-11,0.856585,0.887277,0.808036,0.828125,0.709102,441548800
2000-01-12,0.848214,0.852679,0.772321,0.77846,0.666575,976068800
2000-01-13,0.84361,0.881696,0.825893,0.863839,0.739683,1032684800
2000-01-14,0.892857,0.912946,0.887277,0.896763,0.767875,390376000


# Technical Indicators
- Technical indicators are heuristic or pattern-based signals produced by the price,volume, and/or open interest of a security used by traders who follow technical analysis<br>
- There are four major types of technical indicators that are commonly used by traders, which are:
    - Trend Indicators
    - Momentum Indicators
    - Volatility Indicators 
    - Volume Indicators
- These types of indicators can also be subdivided into leading or lagging
    - Lagging indicators offer a historical report of background conditions that resulted in the current price being where it is
    - Leading indicators attempt to predict where the price is headed
- The technical indicators that I created in this project will be used as the input features for Machine Learning models.

In this section, I will separate the types of indicators and provides further explanation and formula for each indicators. 

## Trend Indicators

Trend indicators (lagging) analyze whether a market is moving up,down or sideways over time



### Simple Moving Averages (SMA)
- Simple moving average is a technical analysis tools that smooths out price data by creating a constantly updated average price
- Most of the time traders will look at the angle of moving average, if it is mostly moving horizontally, then the price isn't trending, it is ranging
- If the simple moving average line is angled up, an uptrend is underway
- However, simple moving averages don't make prediction about the future value of a stock, it simply reveal the trend of the stock over a period of time

Formula for Simple Moving Averages:<br>
$SMA = \frac {A _{1} + A _{2} + ... + A _{n-1} + A _{n}}{n}$<br><br>
where:<br> 
$A _{n}$ = the price of an asset at period n<br>
$n$ = the number of total periods



In [6]:
#Simple moving average
def SMA(data,period,column="Close"):
    return data[column].rolling(window=period).mean()

In [7]:
#Discrete 
def SMA_discrete(data,period,column='Close'):
    data['sma'] = data[column].rolling(window=period).mean()
    data['sma_discrete'] = np.where(data['sma'] > data['Close'],-1,1)
    return data['sma_discrete']

### Exponential Moving Averages (EMA)
- Exponential Moving Averages is very similar to Simple Moving Averages, the only difference is that EMA reacts more significantly to recent price changes than a simple moving average, which applies an equal weight to all observations in the period<br>

Formula for Exponential Moving Averages:<br>

$EMA _{t} = \alpha x _{t} + (1 - \alpha)EMA _{t-1} $ <br><br>

The smoothing factor alpha is defined as:<br>
$\alpha = \frac {2} {n+1} $ 

where n is the time period




In [8]:
def EMA(data,period,column="Close"):
    return data[column].ewm(span=period, adjust=False,min_periods=period).mean()

In [9]:
def EMA_discrete(data,period,column="Close"):
    data['ema'] = data[column].ewm(span=period,adjust=False,min_periods=period).mean()
    data['ema_discrete'] = np.where(data['ema']>data['Close'],-1,1)
    return data['ema_discrete']

## Momentum Indicators

- Oscillators are chart indicators that can assist a trader in determining overbought or oversolf conditions in ranging markets.
- A technician will rely on oscillators when the charts are not showing a definite trend in either direction

### Relative Strength Index (RSI)

- RSI measures the extent of recent price changes to determine overbought or oversold conditions in an instruments price.
- It is one of the most trusted indicators for anyone planning to use oscillators to determine buy and sell points
- If the RSI value is over 70, the security is considered overbought, if the value is lower than 30, it is considered oversolf.

The formula for RSI:<br><br>
$RS = \frac{Avg.Gain}{Avg.Loss}$ <br><br>
$RSI = 100 - \frac {100} {1+RS}$


In [10]:
#RSI 
def RSI(data,period):
    df = pd.DataFrame()
    df['Diff'] = data['Close'].transform(lambda x: x.diff())

    df['Up'] = df['Diff']
    df.loc[(df['Up']<0),'Up'] = 0

    df['Down'] = df['Diff']
    df.loc[(df['Down']>0),'Down'] = 0
    df['Down'] = abs(df['Down'])

    df['rsi_up'] = df['Up'].rolling(window=period).mean()
    df['rsi_down'] = df['Down'].rolling(window=period).mean()
    
    df['rs'] = df['rsi_up'] / df['rsi_down']
    df['rsi'] = round(100 - (100/(1+df['rs'])))
    return df['rsi']
    

In [None]:
def cond_rsi(data):
    if (data.rsi < 70) & (data.rsi > 30):
        if (data.rsi > data.previous):
            return 1
        else :
            return -1
    elif (data.rsi > 70):
        return -1
    else :
        return 1 




def rsi_discrete(data,period):
    df = pd.DataFrame()
    df['Diff'] = data['Close'].transform(lambda x: x.diff())

    df['Up'] = df['Diff']
    df.loc[(df['Up']<0),'Up'] = 0

    df['Down'] = df['Diff']
    df.loc[(df['Down']>0),'Down'] = 0
    df['Down'] = abs(df['Down'])

    df['rsi_up'] = df['Up'].rolling(window=period).mean()
    df['rsi_down'] = df['Down'].rolling(window=period).mean()
    
    df['rs'] = df['rsi_up'] / df['rsi_down']
    df['rsi'] = round(100 - (100/(1+df['rs'])))
    df['previous'] = df['rsi'].shift(1)
    
    df['discrete_rsi'] = df.apply(cond_rsi,axis=1)
    return df['discrete_rsi']

### Momentum

In [11]:
def mom(data):
    close_9 = data['Close'].shift(9)
    momentum = data['Close'] - close_9
    return momentum

In [None]:
def mom_discrete(data):
    data['close_9'] = data['Close'].shift(9)
    data['momentum'] = data['Close'] - data['close_9']
    data['discrete_momentum'] = np.where(data['momentum'] > 0 , 1, 0)
    return data['discrete_momentum']
    

### William %R

In [12]:
def williamR(data,period):
    high = data['High'].rolling(window=period).max()
    low = data['Low'].rolling(window=period).min()
    wr = -100 * ((high-data['Close']) / (high-low))
    return wr
    

In [None]:
def cond_wr(data):
    if (data.wr > data.previous):
        return 1
    else:
        return -1



def discrete_wr(data,period):
    df = pd.DataFrame()
    df['high'] = data['High'].rolling(window=period).max()
    df['low'] = data['Low'].rolling(window=period).min()
    df['wr'] = -100 * ((df['high']-data['Close']) / (df['high'] - df['low']))
    df['previous'] = df['wr'].shift(1)
    df['discrete_wr'] = df.apply(cond_wr,axis=1)
    
    return df['discrete_wr']

### CCI

In [13]:
def CCI(data):
    data['tp'] = (data['High'] + data['Low'] + data['Close']) / 3
    data['sma_tp'] = data['tp'].rolling(window=9).mean()
    data['mad'] = data['tp'].rolling(window=9).apply(lambda x: pd.Series(x).mad())
    data['CCI'] = (data['tp'] - data['sma_tp']) / (0.015 * data['mad'])
    return data['CCI']

In [None]:
def cond_CCI(row):
    if (row.CCI <= 200) & (row.CCI >= -200):
        if (row.CCI > row.dif):
            return 1
        else:
            return -1
    elif (row.CCI > 200):
        return -1
    else:
        return 1


def CCI_discrete(data):
    df = pd.DataFrame()
    df['tp'] = (data['High'] + data['Low'] + data['Close']) / 3
    df['sma_tp'] = df['tp'].rolling(window=9).mean()
    df['mad'] = df['tp'].rolling(window=9).apply(lambda x: pd.Series(x).mad())
    df['CCI'] = (df['tp'] - df['sma_tp']) / (0.015 * df['mad'])
    df['dif'] = df['CCI'].shift(1)
    df['CCI_discrete'] = df.apply(cond_CCI,axis=1)
    
    
    return df['CCI_discrete']

### Moving Average Convergence Divergence (MACD)

 - MACD is a trend-following momentum indicator that shows the relationship between two moving averages of a security's price
 - A nine-day EMA of the MACD called 'signal line' is used as a trigger for buy and sell signals
 - When the MACD line is crosses below the signal line, traders should sell
 - When the MACD line crosses above the signal line, traders should buy

The formula for MACD:<br><br>
$MACD = EMA _{12period} - EMA _{26period}$

The formula for Signal:<br><br>
$Signal $= 9-days EMA of MACD Line


In [14]:
def MACD(data,period1,period2,period3):
    macd_1 = data['Close'].ewm(span=period1,adjust=False,min_periods=period1).mean()
    macd_2 = data['Close'].ewm(span=period2,adjust=False,min_periods=period2).mean()
    macd = macd_1 - macd_2
    signal = macd.ewm(span=period3,adjust=False,min_periods=period3).mean()
    
    return macd,signal

In [None]:
def cond_MACD(data):
    if (data.macd > data.dif):
        return 1
    else:
        return -1
    

def MACD_discrete(data):
    df = pd.DataFrame()
    df['macd_1'] = data['Close'].ewm(span=26,adjust=False,min_periods=26).mean()
    df['macd_2'] = data['Close'].ewm(span=9,adjust=False,min_periods=9).mean()
    df['macd'] = df['macd_1'] - df['macd_2']
    df['dif'] = df['macd'].shift(1)
    df['discrete_macd'] = df.apply(cond_MACD,axis=1)
    
    
    return df['discrete_macd']

### Stochastic Oscillator

 - Stochastic Oscillator measures the momentum of price movement and can be used to predict trend reversals 
 - Stochastic Oscillator is range-bound, which means it is always between 0 and 100.
 - It can be used to identify overbought and oversold readings

Stochastic Oscillator display two lines: %K and %D.
 - The %K line compares the lowest low and the highest high of a given period to define a price range
 - The %D line is a moving average of %K
 - If the %K line crosses below the %D line, a possible sell signal is generated
 - If %D crosses below the %K line, a possible buy signal is generated
 - These crossover may appear anywhere, but signals above the lines at 20 and 80 are considered to be stronger


The formula for Stochastic K:<br><br>

$ K = \frac {(C - L _{period})} {(H _{period} - L _{period})} * 100$ <br><br>

where: <br>
$ C $ = The most recent closing price  <br>
$ L _{period} $ = The lowest price traded of the trading period  <br>
$ L _{period} $ = The highest price traded of the trading period  <br>


In [15]:
def stochastic_k(data,period):
    df = pd.DataFrame()
    df['high'] = data['High'].rolling(period).max()
    df['low'] = data['Low'].rolling(period).min()
    df['percentage_k'] = (data['Close'] - df['low']) * 100 / (df['high'] - df['low'])
    
    return df['percentage_k']

The formula for Stochastic D:<br><br>

$ D = \frac {K _{1} + K _{2} .... + K _{n}} {n}$
<br><br>
where:<br>
$ K $ = Fast Stochastic indicator <br>
$ n $ = Trading period


In [16]:
def stochastic_d(data,period1,period2):
    df = pd.DataFrame()
    df['high'] = data['High'].rolling(period1).max()
    df['low'] = data['Low'].rolling(period1).min()
    df['percen_k'] = (data['Close'] - df['low']) * 100 / (df['high'] - df['low'])
    df['percen_d'] = df['percen_k'].rolling(window=period2).mean()
    
    return df['percentage_d']

In [None]:
def cond_sto_d(data):
    if (data.percen_d > data.previous):
        return 1
    else:
        return -1


def discrete_sto_d(data,period1,period2):
    df = pd.DataFrame()
    df['high'] = data['High'].rolling(period1).max()
    df['low'] = data['Low'].rolling(period1).min()
    df['percen_k'] = (data['Close'] - df['low']) * 100 / (df['high'] - df['low'])
    df['percen_d'] = df['percen_k'].rolling(window=period2).mean()
    df['previous'] = df['precen_d'].shift(1)
    df['discrete_d'] = np.apply(cond_sto_d,axis=1)
    
    return df['discrete_d']

### Target Variable
Since technical indicators are mainly used for short term trading purpose, hence I will be looking at a short term trading period. In this case, I will be using 3 days prediction interval. The idea is when traders look at the technical indicators on today, traders will buy the stock on the next day and hold it for 3 trading days including the day he/she bought. If the closing price on the third day is higher than the opening price on the first day, the value will 1 while the closing price on the third day is lower than the opening price on the first day, the value will be 0

To give a more comprehensive view, if a trader looking at technical indicators on 5th of Janurary, the trader will buy the stock on 6th of January and hold the stock until 8th of January. 

In [17]:
def pred_var(data):
    var = pd.DataFrame()
    var['Close_in_3'] = data['Close'].transform(lambda x: x.shift(-2))
    var['diff'] = (var['Close_in_3'] - data['Open']).shift(-1)
    var['act_pred'] = np.where(var['diff']>0,1,0)
    var['act_pred'].mask(var['diff'].isna(),np.nan,inplace=True)
    return var['act_pred']

### Data Cleaning and Pre-processing

The first step of data pre-processing to ensure there is no imbalanced dataset issue occur. when looking at the value counts, it shows that target variable 1 and 0 have very similar amount of value counts. Hence, imbalanced dataset does not occur in this case

In [18]:
y = pred_var(aapl)

In [19]:
x = pd.DataFrame()
x["sma_10"] = SMA(aapl,10)
x["ema_10"] = EMA(aapl,10)
x["rsi_14"] = RSI(aapl,14)
x["macd"] = MACD(aapl,12,26,9)[0]
x["signal"] = MACD(aapl,12,26,9)[1]
x['%K'] = stochastic_k(aapl,14)
x['%D'] = stochastic_d(aapl,14,3)
x['mom'] = mom(aapl)
x['william%r'] = williamR(aapl,14)
x['cci'] = CCI(aapl)
x.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5303 entries, 2000-01-03 to 2021-01-29
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sma_10     5294 non-null   float64
 1   ema_10     5294 non-null   float64
 2   rsi_14     5289 non-null   float64
 3   macd       5278 non-null   float64
 4   signal     5270 non-null   float64
 5   %K         5290 non-null   float64
 6   %D         5288 non-null   float64
 7   mom        5294 non-null   float64
 8   william%r  5290 non-null   float64
 9   cci        5295 non-null   float64
dtypes: float64(10)
memory usage: 455.7 KB


In [20]:
y = y.loc["2010-01-01":"2020-12-31"]
print(y.value_counts())


1.0    1555
0.0    1214
Name: act_pred, dtype: int64


In [21]:
print(np.sum(y.isnull()))

0


In [22]:
x = x.loc["2010-01-01":"2020-12-31"]
print(np.sum(x.isnull()))

sma_10       0
ema_10       0
rsi_14       0
macd         0
signal       0
%K           0
%D           0
mom          0
william%r    0
cci          0
dtype: int64


In [23]:
x.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2769 entries, 2010-01-04 to 2020-12-31
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sma_10     2769 non-null   float64
 1   ema_10     2769 non-null   float64
 2   rsi_14     2769 non-null   float64
 3   macd       2769 non-null   float64
 4   signal     2769 non-null   float64
 5   %K         2769 non-null   float64
 6   %D         2769 non-null   float64
 7   mom        2769 non-null   float64
 8   william%r  2769 non-null   float64
 9   cci        2769 non-null   float64
dtypes: float64(10)
memory usage: 238.0 KB


In [24]:
#Split dataset into train and test
x_train = x.loc["2010-01-01":"2017-12-31"]
x_test = x.loc["2018-01-01":"2020-12-31"]
y_train = y.loc["2010-01-01":"2017-12-31"]
y_test = y.loc["2018-01-01":"2020-12-31"]
print("Number of Training dataset:", x_train.shape[0])
print("Number of Testing dataset:",x_test.shape[0])
print("Train dataset percentage:", x_train.shape[0]/(x_train.shape[0] + x_test.shape[0]))
print("Test dataset percentage:", x_test.shape[0]/(x_train.shape[0] + x_test.shape[0]))

Number of Training dataset: 2013
Number of Testing dataset: 756
Train dataset percentage: 0.7269772481040087
Test dataset percentage: 0.27302275189599134


In [25]:
y_train.value_counts()

1.0    1105
0.0     908
Name: act_pred, dtype: int64

In [26]:
y_test.value_counts()

1.0    450
0.0    306
Name: act_pred, dtype: int64

In [27]:
print("Value for 1.0:",(((y_test.value_counts()[1]) / (len(y_test)))*100))
print("Value for 0.0:",(((y_test.value_counts()[0]) / (len(y_test)))*100))

Value for 1.0: 59.523809523809526
Value for 0.0: 40.476190476190474


In [28]:
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_scaled = pd.DataFrame(scaler.transform(x_train),index = x_train.index, columns = x_train.columns)
x_test_scaled = pd.DataFrame(scaler.transform(x_test),index = x_test.index, columns = x_test.columns)

# Classification

### Logistic Regression

In [29]:
time = TimeSeriesSplit(n_splits = 3)
scorer = make_scorer(f1_score)

In [30]:
svc = SVC()
kernel = ['rbf','poly']
degree = [3,5,7]
C = [10,1.0,0.5]
gamma = [10,1.0,0.5]
params_grid = {'C':C,'gamma':gamma,'kernel':kernel,'degree':degree}
svc_grid = GridSearchCV(svc,params_grid,cv=time,scoring='accuracy',n_jobs = -1,verbose = 3)
svc_grid.fit(x_train_scaled,y_train)
print(svc_grid.best_params_)

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 129.1min


KeyboardInterrupt: 

In [42]:
svc = SVC(kernel='poly')
C = [10,1.0,0.5]
gamma = [10,1.0,0.5]
params_grid = {'C':C,'gamma':gamma}
svc_grid = GridSearchCV(svc,params_grid,cv=time,scoring='accuracy',n_jobs = -1,verbose = 3)
svc_grid.fit(x_train_scaled,y_train)
print(svc_grid.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  22 out of  27 | elapsed:   28.1s remaining:    6.4s


{'C': 0.5, 'gamma': 0.5}


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  5.9min finished


### Support Vector Machine

In [43]:
pred = svc_grid.predict(x_test_scaled)
unique,counts = np.unique(pred,return_counts=True)
print(dict(zip(unique,counts)))

{0.0: 150, 1.0: 606}
