In [1]:
import os
import glob
import pickle
from tabulate import tabulate
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Adding different features into the model to check on the different accuracy

In [1]:
# Squeeze Momentum indicator

In [None]:
# This function encompass everything from the data engineering to model training to give an evaluation of the results
# Input different parameters values to test out the optimize the model with different parameters.
def run()

In [139]:
# We should seperate the DF feed into the model out of the model training itself so we can customize the DF. 
def data_preprocessing(file_path, n_days = -1, n_pred = 1):
    """
    n_days: if n_days is smaller than 0, it will read in the whole dataset. If not, it read in the last n_days value
    n_pred: last n_pred days of resulting df to make prediction in stock prices trend
    
    """
    if n_days < 0:
        df = pd.read_csv(file_path)
    else:
        df = pd.read_csv(file_path)[-n_days:]
    
    # Number of days to construct RSI
    window_length = 14
    df['Date'] = pd.to_datetime(df['Date'])
#     df['Year'] = df['Year'].dt.year
     
    ## Data Preprocessing
    # Prediction variable y
    df[f'{n_pred}-day-Returns'] = df['Close'].pct_change(n_pred)
    df['Returns'] = df['Close'].pct_change(1)
    
    # Gains/loss/RSI
    price_change = df['Close'].diff()
    gain = price_change.clip(lower=0)
    loss = -1 * price_change.clip(upper=0)

    # RSI
    avg_gain = gain.rolling(window=window_length).mean()
    avg_loss = loss.rolling(window=window_length).mean()
    rs = avg_gain / avg_loss
    df['RSI'] = 100 - (100 / (1 + rs))
    
    # MACD computation
    ema_12 = df["Close"].ewm(span=12, adjust=False).mean()
    ema_26 = df["Close"].ewm(span=26, adjust=False).mean()
    macd = ema_12 - ema_26
    signal_line = macd.ewm(span= 9, adjust = False).mean()
    df['MACD Indicator'] = macd - signal_line 
    
    return df

# df = data_preprocessing("Data/ge.us.txt", n_pred = 5)    
# df.dropna(inplace = True)
# df

In [140]:
# FEATURE SELECTION 
# Summarization of features excluding the last n_days data point to prevent data leakage 
n_pred = 5
volatility = lambda df: df['Returns'][:-n_pred].std()
last_rsi = lambda df: df['RSI'].values[-n_pred - 1]
rsi_std = lambda df: df['RSI'][:-n_pred].std()
last_macd = lambda df: df["MACD Indicator"].values[- n_pred - 1]

# Increment the mean volume by 0.5 to prevent the occurence of logging 0 resulting in -infinity value
log_mean_volume = lambda df: np.log10(df['Volume'].mean() + 0.5)

features = {
    f'{n_pred}-day-Trend': n_day_trend,
    'Volatility': volatility,
    'Last RSI': last_rsi,
    'RSI std': rsi_std,
    'Log Mean Volume': log_mean_volume,
    'Last MACD': last_macd
}

In [141]:
## Data Engineering (Using domain knowledge to summarize the data)
def data_engineering(df, features, n_pred = 1, params = None):
    """
    Utilizing RSI mean and RSI std should be more careful about its context, using the single latest RSI value should be more
    representative of the current stock's bullish / beamish state. But maybe to represent stocks in a long term state (how
    volatile it is or how has it been performing in the past year) would be more helpful for us to cluster the stocks together

    Feature documentation:
    RSI mean: RSI (ranges from 0 - 100) indicates if a particular asset is overbought (RSI > 70) / oversold ( < 30), taking the
              mean over a period of time meaning the tendency of this stocks being overbought/oversold over a long period of time
    RSI     : Usually we can just consider the immediate RSI, which is more representative of current stocks
    RSI std : Help us to gauge how this stocks will go over to overbought/oversold condition

    """
    trend = np.sign(df[f"{n_pred}-day-Returns"].values[-1])
    if trend == 0:
        return None
    else:
        res = {f'{n_pred}-day-Trend': trend}
        for name, func in features.items():
            res[name] = func(df)
        return res

# feat = data_engineering(df, features, n_pred = 5)
# feat

In [167]:
# Ulterior motivation: To check if running the classification model using different time frame of a stock has better  
# prediction on the trend than running the classificadtion model accross different assets 
def single_asset_summarization(df, stride, features, n_days = 22, n_pred = 1):
    
    """
    Going through the datasets with a step size of stride to produce a summarization of the datasets within a fixed window
    size defined by n_days 
    
    df    : The dataframe contains of a single stock prices data (Take in dataframe produce by data_preprocessing)
    stride: The step size to move downwards from each data point. Similar to the stride in CNN
    n_days: Number of days for each sliced dataframe
    n_pred: Number of days to predict the trend
    """
    res = []
    n = len(df)
    for i in range(0, n - n_days - n_pred + 1, stride):
        df_slice = df[i:i + n_days + n_pred]
        feat = data_engineering(df_slice, features, n_pred = n_pred)
        if feat is None:
            continue
        else:
            res.append(feat)
    df_summ = pd.DataFrame(res)
    return df_summ

# single_df = single_asset_summarization(df, 1, features, n_days = 27, n_pred = 5)
# single_df

In [143]:
# Produce a dataframe which summarize each assets last n_days to predict trend
def all_asset_summarization(features, n_days = 22, n_pred = 1):
    res = []
    for file in glob.glob("Data/*.txt"): 
        df = data_preprocessing(file, n_days, n_pred)
        feat = data_engineering(df, features, n_pred)
        if feat is None:
            continue
        else:
            res.append(feat)
    df_summ = pd.DataFrame(res)
    return df_summ

all_df = all_asset_summarization(features, n_days = 27, n_pred = 5)
all_df

Unnamed: 0,5-day-Trend,Volatility,Last RSI,RSI std,Log Mean Volume,Last MACD
0,-1.0,0.006034,66.294643,4.048011,6.151373,0.089824
1,-1.0,0.016305,43.721973,5.157569,6.560383,-0.098955
2,-1.0,0.035857,69.177289,9.215017,6.036254,1.030656
3,1.0,0.013064,60.398860,10.964631,6.766597,0.414258
4,1.0,0.083218,70.791075,12.619119,5.281896,0.205592
...,...,...,...,...,...,...
7402,1.0,0.004531,64.364641,7.519305,4.961603,0.023851
7403,1.0,0.012209,65.686275,7.850533,6.429631,0.274435
7404,1.0,0.022752,62.162162,8.789578,5.580780,0.037808
7405,1.0,0.018891,22.288262,5.683562,4.421997,-0.012094


In [144]:
def run_svm(X_train, X_test, y_train, y_test):
    svm = make_pipeline(StandardScaler(), SVC())
    svm.fit(X_train, y_train)
    pred = svm.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    ## Note: The first row and first column corresponds to all the ground truth of -1 and predicted value of -1
    cm = confusion_matrix(y_test, pred)
    return accuracy, cm

In [162]:
def res_summary(H0, alt):
    """
    H0 : A dictionary where it has the key value pair of "Evaluation Metrics": list 
         where each list contains all the results of all the run
    alt: Same as H0 but with alternative hypothesis
    """
    full_headers = []
    full_res = []
    mean_headers = ["Eval Metrics","H0", "alt"]
    mean_res = []
    for metric in H0.keys():
        full_res.append(H0[metric])
        full_res.append(alt[metric])
        full_headers.append(f"H0_{metric}")
        full_headers.append(f"alt_{metric}")
        
        if isinstance(H0[metric][0], int):
            mean_res.append([metric, np.mean(H0[metric]), np.mean(alt[metric])])
        else:
            # Convert arrays to strings for tabulate compatibility
            mean_H0 = np.mean(H0[metric], axis=0)
            mean_alt = np.mean(alt[metric], axis=0)
            mean_res.append([metric, str(mean_H0), str(mean_alt)])

    mean_res = [list(map(row, list)) if isinstance(row, np.ndarray) else row for row in mean_res]
    print(mean_res)
    transposed_full_res = list(map(list, zip(*full_res))) 
    print(transposed_full_res)
    print(tabulate(transposed_full_res, headers = full_headers, tablefmt = "fancy_grid"))
    print(tabulate(mean_res, headers = mean_headers, tablefmt = "fancy_grid"))
        

In [164]:
# H0: Single asset model will have better performance in terms of predicting its own price trend in the future

# FEATURE SELECTION 
n_pred = 5
volatility = lambda df: df['Returns'][:-n_pred].std()
last_rsi = lambda df: df['RSI'].values[-n_pred - 1]
rsi_std = lambda df: df['RSI'][:-n_pred].std()
last_macd = lambda df: df["MACD Indicator"].values[- n_pred - 1]

# Increment the mean volume by 0.5 to prevent the occurence of logging 0 resulting in -infinity value
log_mean_volume = lambda df: np.log10(df['Volume'].mean() + 0.5)

features = {
    f'{n_pred}-day-Trend': n_day_trend,
    'Volatility': volatility,
    'Last RSI': last_rsi,
    'RSI std': rsi_std,
    'Log Mean Volume': log_mean_volume,
    'Last MACD': last_macd
}

# Running the model on different stock prices datasets to ensure the rigorisity of the testing
counter = 1
H0 = {"Accuracy":[], "Confusion Matrix":[]}
alt = {"Accuracy":[], "Confusion Matrix":[]}

for file in glob.glob("Data/*.txt"):
    if counter > 10:
        break
    ## Set up for model running
    all_df = all_asset_summarization(features, n_days = 27, n_pred = n_pred)
    df = data_preprocessing(file, n_pred = 5)
    single_df = single_asset_summarization(df, 1, features, n_days = 27, n_pred = n_pred)
    
    
    ## Running the SVM model using the stock prices summarization from a single dataset 
    X = single_df.drop([f"{n_pred}-day-Trend"], axis = 1)
    y = single_df[f"{n_pred}-day-Trend"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, shuffle = False)
    single_accuracy, single_cm = run_svm(X_train, X_test, y_train, y_test)
    H0["Accuracy"].append(single_accuracy)
    H0["Confusion Matrix"].append(single_cm)

    ## Running the SVM model using stock prices summarization from all the assets
    X_train = all_df.drop([f"{n_pred}-day-Trend"], axis = 1)
    y_train = all_df[f"{n_pred}-day-Trend"]
    all_accuracy, all_cm = run_svm(X_train, X_test, y_train, y_test)
    alt["Accuracy"].append(all_accuracy)
    alt["Confusion Matrix"].append(all_cm)
    counter += 1 
    
    
res_summary(H0, alt)

KeyboardInterrupt: 

In [None]:
# H0: Single asset model will have better performance in terms of predicting its own price trend in the future

# FEATURE SELECTION 
n_pred = 3
volatility = lambda df: df['Returns'][:-n_pred].std()
last_rsi = lambda df: df['RSI'].values[-n_pred - 1]
rsi_std = lambda df: df['RSI'][:-n_pred].std()
last_macd = lambda df: df["MACD Indicator"].values[- n_pred - 1]

# Increment the mean volume by 0.5 to prevent the occurence of logging 0 resulting in -infinity value
log_mean_volume = lambda df: np.log10(df['Volume'].mean() + 0.5)

features = {
    f'{n_pred}-day-Trend': n_day_trend,
    'Volatility': volatility,
    'Last RSI': last_rsi,
    'RSI std': rsi_std,
    'Log Mean Volume': log_mean_volume,
    'Last MACD': last_macd
}

# Running the model on different stock prices datasets to ensure the rigorisity of the testing
counter = 1
H0 = {"Accuracy":[], "Confusion Matrix":[]}
alt = {"Accuracy":[], "Confusion Matrix":[]}

for file in glob.glob("Data/*.txt"):
    if counter > 10:
        break
    ## Set up for model running
    all_df = all_asset_summarization(features, n_days = 27, n_pred = n_pred)
    df = data_preprocessing(file, n_pred = 5)
    single_df = single_asset_summarization(df, 1, features, n_days = 27, n_pred = n_pred)
    
    
    ## Running the SVM model using the stock prices summarization from a single dataset 
    X = single_df.drop([f"{n_pred}-day-Trend"], axis = 1)
    y = single_df[f"{n_pred}-day-Trend"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, shuffle = False)
    single_accuracy, single_cm = run_svm(X_train, X_test, y_train, y_test)
    H0["Accuracy"].append(single_accuracy)
    H0["Confusion Matrix"].append(single_cm)

    ## Running the SVM model using stock prices summarization from all the assets
    X_train = all_df.drop([f"{n_pred}-day-Trend"], axis = 1)
    y_train = all_df[f"{n_pred}-day-Trend"]
    all_accuracy, all_cm = run_svm(X_train, X_test, y_train, y_test)
    alt["Accuracy"].append(all_accuracy)
    alt["Confusion Matrix"].append(all_cm)
    counter += 1 
    
    
res_summary(H0, alt)

In [168]:
for i in range(1, 8):
    n_pred = i
    volatility = lambda df: df['Returns'][:-n_pred].std()
    last_rsi = lambda df: df['RSI'].values[-n_pred - 1]
    rsi_std = lambda df: df['RSI'][:-n_pred].std()
    last_macd = lambda df: df["MACD Indicator"].values[- n_pred - 1]

    # Increment the mean volume by 0.5 to prevent the occurence of logging 0 resulting in -infinity value
    log_mean_volume = lambda df: np.log10(df['Volume'].mean() + 0.5)

    features = {
        f'{n_pred}-day-Trend': n_day_trend,
        'Volatility': volatility,
        'Last RSI': last_rsi,
        'RSI std': rsi_std,
        'Log Mean Volume': log_mean_volume,
        'Last MACD': last_macd
    }
    counter = 1
    H0 = {"Accuracy":[], "Confusion Matrix":[]}
    alt = {"Accuracy":[], "Confusion Matrix":[]}

    for file in glob.glob("Data/*.txt"):
        if counter > 5:
            break
        ## Set up for model running
        all_df = all_asset_summarization(features, n_days = 27, n_pred = n_pred)
        df = data_preprocessing(file, n_pred = n_pred)
        single_df = single_asset_summarization(df, 1, features, n_days = 27, n_pred = n_pred)


        ## Running the SVM model using the stock prices summarization from a single dataset 
        X = single_df.drop([f"{n_pred}-day-Trend"], axis = 1)
        y = single_df[f"{n_pred}-day-Trend"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, shuffle = False)
        single_accuracy, single_cm = run_svm(X_train, X_test, y_train, y_test)
        H0["Accuracy"].append(single_accuracy)
        H0["Confusion Matrix"].append(single_cm)

        ## Running the SVM model using stock prices summarization from all the assets
        X_train = all_df.drop([f"{n_pred}-day-Trend"], axis = 1)
        y_train = all_df[f"{n_pred}-day-Trend"]
        all_accuracy, all_cm = run_svm(X_train, X_test, y_train, y_test)
        alt["Accuracy"].append(all_accuracy)
        alt["Confusion Matrix"].append(all_cm)
        counter += 1 

    print(f"----------------------------------------n_pred = {i}-----------------------------------------")
    print(res_summary(H0, alt))

----------------------------------------n_pred = 1-----------------------------------------
[['Accuracy', '0.5205765524234793', '0.5047426784404683'], ['Confusion Matrix', '[[142.  284.2]\n [142.  311. ]]', '[[311.  115.2]\n [325.8 127.2]]']]
[[0.5157303370786517, 0.4797752808988764, array([[ 36, 383],
       [ 48, 423]], dtype=int64), array([[376,  43],
       [420,  51]], dtype=int64)], [0.5195632393084623, 0.5077343039126478, array([[441, 656],
       [400, 701]], dtype=int64), array([[697, 400],
       [682, 419]], dtype=int64)], [0.5638297872340425, 0.48936170212765956, array([[11, 27],
       [14, 42]], dtype=int64), array([[31,  7],
       [41, 15]], dtype=int64)], [0.5037593984962406, 0.4868421052631579, array([[177, 323],
       [205, 359]], dtype=int64), array([[399, 101],
       [445, 119]], dtype=int64)], [0.5, 0.56, array([[45, 32],
       [43, 30]], dtype=int64), array([[52, 25],
       [41, 32]], dtype=int64)]]
╒═══════════════╤════════════════╤═══════════════════════╤══

----------------------------------------n_pred = 4-----------------------------------------
[['Accuracy', '0.5124371137663661', '0.42943323113778964'], ['Confusion Matrix', '[[126.  294.8]\n [148.  337.6]]', '[[417.8   3. ]\n [484.    1.6]]']]
[[0.5536912751677853, 0.42841163310961966, array([[ 12, 371],
       [ 28, 483]], dtype=int64), array([[383,   0],
       [511,   0]], dtype=int64)], [0.5159620362381363, 0.48921484037963764, array([[470, 674],
       [448, 726]], dtype=int64), array([[1131,   13],
       [1171,    3]], dtype=int64)], [0.5473684210526316, 0.2631578947368421, array([[ 6, 19],
       [24, 46]], dtype=int64), array([[25,  0],
       [70,  0]], dtype=int64)], [0.46834264432029793, 0.44320297951582865, array([[111, 365],
       [206, 392]], dtype=int64), array([[476,   0],
       [598,   0]], dtype=int64)], [0.4768211920529801, 0.5231788079470199, array([[31, 45],
       [34, 41]], dtype=int64), array([[74,  2],
       [70,  5]], dtype=int64)]]
╒═══════════════╤══════

----------------------------------------n_pred = 7-----------------------------------------
[['Accuracy', '0.5428967992494784', '0.43497667719436856'], ['Confusion Matrix', '[[101.2 322.2]\n [123.6 364.4]]', '[[ 89.8 333.6]\n [135.  353. ]]']]
[[0.5776536312849162, 0.45027932960893857, array([[ 51, 335],
       [ 43, 466]], dtype=int64), array([[119, 267],
       [225, 284]], dtype=int64)], [0.4957374254049446, 0.49701619778346123, array([[306, 864],
       [319, 857]], dtype=int64), array([[174, 996],
       [184, 992]], dtype=int64)], [0.5957446808510638, 0.20212765957446807, array([[10,  9],
       [29, 46]], dtype=int64), array([[19,  0],
       [75,  0]], dtype=int64)], [0.47201492537313433, 0.5121268656716418, array([[ 93, 372],
       [194, 413]], dtype=int64), array([[ 60, 405],
       [118, 489]], dtype=int64)], [0.5733333333333334, 0.5133333333333333, array([[46, 31],
       [33, 40]], dtype=int64), array([[77,  0],
       [73,  0]], dtype=int64)]]
╒═══════════════╤══════════

In [93]:
(alt["Confusion Matrix"])

166.21428571428572

1


In [145]:
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt,5-day-Returns,Returns,RSI,MACD Indicator
0,2005-02-25,12.920,13.470,12.920,13.295,933743,0,,,,0.000000
1,2005-02-28,13.371,13.470,13.118,13.381,683911,0,,0.006469,,0.005488
2,2005-03-01,13.371,13.794,13.342,13.734,624585,0,,0.026381,,0.031156
3,2005-03-02,13.607,14.038,13.568,13.822,387289,0,,0.006407,,0.051143
4,2005-03-03,13.891,13.891,13.450,13.627,362061,0,,-0.014108,,0.048345
...,...,...,...,...,...,...,...,...,...,...,...
3196,2017-11-06,36.280,36.570,35.570,35.590,1029506,0,-0.039924,-0.023058,22.304833,-0.663002
3197,2017-11-07,35.720,35.720,34.290,34.440,1149793,0,-0.064130,-0.032312,20.636285,-0.671422
3198,2017-11-08,35.510,36.090,34.800,35.990,1805497,0,0.001670,0.045006,22.890560,-0.531374
3199,2017-11-09,35.680,35.860,35.230,35.620,979951,0,-0.005028,-0.010281,20.529801,-0.426931


In [151]:
print(type([5]) == list)

True
