## STOCK UP/DOWN PREDICTION

In [1]:
import pandas as pd
import pandas_datareader.data as web
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import YearLocator
from hmm import viterbi, baum_welch

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### UTILS

In [2]:
map_bit = {
    'L': 0,
    'M': 1,
    'H': 2
}

def convert(s: str):
    assert len(s) == 3
    val = 0
    for i in range(3):
        val += map_bit[s[i]] * (3**i)
    return val

def build_transition_matrix(df: pd.DataFrame):
    transition_matrix = np.zeros((2, 2), dtype=np.float32)
    for i in range(2):
        cnt = [0 for _ in range(2)]
        for idx in range(len(df)-1):
            if df['Outcome_Next_Day_Direction'][idx] == i:
                cnt[df['Outcome_Next_Day_Direction'][idx+1]] += 1 
        for j in range(2):
            transition_matrix[i][j] = cnt[j]/sum(cnt)
    return transition_matrix

def build_emission_matrix(df: pd.DataFrame, limit: int):
    emission_matrix = np.zeros((2, limit), dtype=np.float32)
    for j in range(2):
        cnt = [0 for _ in range(limit)]
        for idx in range(len(df)):
            if df['Outcome_Next_Day_Direction'][idx] == j:
                cnt[df['Encode_Event'][idx]] += 1
        for i in range(limit):
            emission_matrix[j, i] = cnt[i]/sum(cnt)
    return emission_matrix

### READ DATA
Using AMAZON stock data.

In [3]:
df = pd.read_csv('data/AMZ_data.csv')
df.head()

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2017-01-03,758.76001,747.700012,757.919983,753.669983,3521100,753.669983
1,2017-01-04,759.679993,754.200012,758.390015,757.179993,2510500,757.179993
2,2017-01-05,782.400024,760.26001,761.549988,780.450012,5830100,780.450012
3,2017-01-06,799.440002,778.47998,782.359985,795.98999,5986200,795.98999
4,2017-01-09,801.77002,791.77002,798.0,796.919983,3446100,796.919983


| Key                   | Definition    |
|-------                |--------       |
| **High** and **Low**  | Refer to the maximum and minimum prices in a given time period |
| **Open** and **Close**| The prices at which a stock began and ended trading in the same period |
| **Volume**            | The total amount of trading activity |
| **Adj Close**         | The stock's value after accounting for any corporate actions |

### PROCESS DATA
- Binning values into 3 buckets, split the value into 3 groups - Low (L), Medium (M), High (H).  
- Creating two markov chains, 1 for volume jumps, and 0 for volume drops.

In [4]:
def process_df(df):
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
    df['Close_gap'] = df['Close'].pct_change()
    df['High_gap'] = df['High'].pct_change()
    df['Low_gap'] = df['Low'].pct_change()
    df['Volume_gap'] = df['Volume'].pct_change()
    df['Daily_change'] = (df['Close'] - df['Open']) / df['Open']
    df['Outcome_Next_Day_Direction'] = (df['Volume'].shift(-1) - df['Volume'])
    df = df[1:-1]

    df['Close_gap_LMH'] = pd.qcut(df['Close_gap'], 3, labels=["L", "M", "H"])

    # High_Gap - not used in this example
    df['High_gap_LMH'] = pd.qcut(df['High_gap'], 3, labels=["L", "M", "H"])

    # Low_Gap - not used in this example
    df['Low_gap_LMH'] = pd.qcut(df['Low_gap'], 3, labels=["L", "M", "H"])

    # Volume_Gap
    df['Volume_gap_LMH'] = pd.qcut(df['Volume_gap'], 3, labels=["L", "M", "H"])
    
    # Daily_Change
    df['Daily_change_LMH'] = pd.qcut(df['Daily_change'], 3, labels=["L", "M", "H"])

    # compressed_set = df[abs(df['Outcome_Next_Day_Direction']) > 10000000]
    df['Outcome_Next_Day_Direction'] = np.where((df['Outcome_Next_Day_Direction'] > 0), 1, 0)

    df['Event_Pattern'] = df['Close_gap_LMH'].astype(str) + df['Volume_gap_LMH'].astype(str) + df['Daily_change_LMH'].astype(str)
    df = df[['Date', 'Event_Pattern', 'Outcome_Next_Day_Direction']]

    df['Encode_Event'] = df['Event_Pattern'].map(lambda x: convert(x))
    df = df.reset_index()
    return df

### TRAIN/TEST SPLIT

In [5]:
train_df = df[:-500]
val_df = df[-500:]

train_df = process_df(train_df)
val_df = process_df(val_df)

train_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Close_gap'] = df['Close'].pct_change()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['High_gap'] = df['High'].pct_change()
A value is trying to be set on a copy of a slice from a DataFr

Unnamed: 0,index,Date,Event_Pattern,Outcome_Next_Day_Direction,Encode_Event
0,1,2017-01-04,MLM,1,10
1,2,2017-01-05,HHH,1,26
2,3,2017-01-06,HMH,0,23
3,4,2017-01-09,MLM,0,10
4,5,2017-01-10,MLM,1,10
5,6,2017-01-11,MHH,1,25
6,7,2017-01-12,HHH,0,26
7,8,2017-01-13,MLM,0,10
8,9,2017-01-17,LML,0,3
9,10,2017-01-18,MLM,1,10


### BUILD TRANSITION/EMISSION MATRIX

In [15]:
A = build_transition_matrix(train_df)
B = build_emission_matrix(train_df, 27)
initial_distribution = np.array((0.5, 0.5))

### RUN VITERBI AND ESTIMATE ACCURACY

In [7]:
V = val_df['Encode_Event'].to_numpy()
pred = viterbi(V, A, B, initial_distribution)
label = val_df['Outcome_Next_Day_Direction'].to_numpy()

print(f'Acc: {(pred == label).sum() / len(label) * 100} %')

Acc: 65.26104417670683 %
