# Predicting Parabolic Trend Breaks in Stocks

Overview: The samples contain stock and cryptocurrency prices during parabolic upward trends. The goal is to predict how many days remain in the parabolic phase of a sample. This number is called the "Remaining Useful Life" (RUL). Besides this regression approach, we can also derive labels from the RUL and try to classify the samples.
* Label 0: Parabolic trend remains during the next 14 timestamps (days)
* Label 1: Parabolic trend breaks during the next 14 timestamps (days)

A sample includes: symbol, timestamp, open, high, low, close, volume
* Train Data: 16 stock samples (total of 37867 rows) during parabolic uptrend until trend break. 
* Test Data: 20 stock and cryptocurrency samples (total of 20 rows, one per sample) during parabolic uptrend with known RUL for the last timestamp.


## Step 1: Loading and Plotting Data Samples

### Imports:

In [68]:
import pandas as pd
import os
import plotly.graph_objects as go
import numpy as np
from sklearn import preprocessing

### Define helper functions:

In [69]:
# function to load a dict of dataframes for files in 'service/data', also adds a %_change column for the stock
def build_dataframes(filenames):
    """Load the given CSV files into dataframes and return them in a dict.

    Every file is read from ``<current working directory>/../service/data``.
    The files are read without a header row and their columns are named
    symbol, timestamp, open, high, low, close, volume.

    Args:
        filenames: Iterable of file names (e.g. ``'GE.csv'``) located in the
            data directory.

    Returns:
        Dict that maps each file name without its extension to its dataframe.
        Each dataframe carries an extra '%_change' column holding the
        percentage change from open to close of the row.

    Raises:
        ValueError: If ``filenames`` is None.
    """
    if filenames is None:
        raise ValueError('Argument can not be None')

    base_data_path = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, 'service/data')
    )
    column_names = ['symbol', 'timestamp', 'open', 'high', 'low', 'close', 'volume']

    datasets = {}
    for f in filenames:
        # load data set
        df = pd.read_csv(os.path.join(base_data_path, f), sep=',', header=None)
        df.columns = column_names
        # add price change of candle (open -> close, in percent)
        df['%_change'] = ((df.close / df.open) - 1) * 100
        # splitext removes only the final extension, so names that contain
        # dots (e.g. 'BRK.B.csv') keep distinct keys instead of colliding
        datasets[os.path.splitext(f)[0]] = df

    return datasets

In [70]:
# function to plot the closing price of a sample over time using plotly
def plot_series(dataframe):
    """Plot the closing price of a sample against its timestamps.

    Args:
        dataframe: Dataframe with the columns 'timestamp', 'close' and
            'symbol'. The timestamps are converted with ``unit='s'`` and the
            first 'symbol' value is used as the figure title.

    Raises:
        ValueError: If ``dataframe`` is None or lacks the 'timestamp' or
            'close' column.
    """
    if dataframe is None:
        raise ValueError('Argument can not be None')
    if 'timestamp' not in dataframe.columns or 'close' not in dataframe.columns:
        raise ValueError('Dataframe misses columns [\'timestamp\', \'close\']')

    # pick the first symbol by position: ['symbol'][0] is a label lookup and
    # raises KeyError when the index has no label 0 (e.g. a sliced frame)
    title = str(dataframe['symbol'].iloc[0])

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=pd.to_datetime(dataframe['timestamp'], unit='s'),
            y=dataframe['close'],
        )
    )
    fig.update_layout(
        title=title,
        xaxis_rangeslider_visible=True,
        xaxis_title="Date",
        yaxis_title="Closing Price",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="#7f7f7f"
        )
    )
    fig.show()

In [71]:
# files that make up the training samples
train_files = [
    'GE.csv', 'F.csv', 'BA.csv', 'T.csv',
    'C.csv', 'MGM.csv', 'MRO.csv', 'APA.csv',
    'DVN.csv', 'EOG.csv', 'CHK.csv', 'CHK2.csv',
    'SWN.csv', 'RRC.csv', 'EQT.csv', 'COG.csv',
]

# the ordering of these files is important, since the separately stored RULs are in this order
# NOTE(review): 'RRC.csv' is listed in both the train and the test files
test_files = [
    'BTCUSDT.csv', 'ETHUSDT.csv', 'MS.csv', 'VZ.csv',
    'KO.csv', 'LLY.csv', 'AMGN.csv', 'BIIB.csv',
    'ALXN.csv', 'INCY.csv', 'BMRN.csv', 'IMGN.csv',
    'ALNY.csv', 'RRC.csv', 'BP.csv', 'FCX.csv',
    'VALE.csv', 'RIO.csv', 'MT.csv', 'PKX.csv',
]

In [72]:
# read every training file into a dict of dataframes
datasets_train = build_dataframes(train_files)

# draw one chart per training sample
for sample in datasets_train.values():
    plot_series(sample)

In [73]:
# read every test file into a dict of dataframes
datasets_test = build_dataframes(test_files)

# draw one chart per test sample
for sample in datasets_test.values():
    plot_series(sample)

## Step 2: Preparing the Data

### Feature Engineering:
#### Train data:

In [74]:
# number of rows covered by each rolling window
win = 7
# columns for which a rolling mean and a rolling standard deviation are computed
cols = [
    'open',
    'high',
    'low',
    'close',
    'volume',
    '%_change',
]

In [75]:
# A training sample ends on its last day of useful life, so the final row has RUL = 0
# and every earlier row counts the rows that remain until that final row.

# for every training sample: add RUL values, rolling mean, rolling standard deviation and label
for name, sample in datasets_train.items():
    # RUL counts down from n_rows - 1 in the first row to 0 in the last row
    n_rows = len(sample)
    sample['RUL'] = list(range(n_rows - 1, -1, -1))

    # rolling statistics over `win` rows; min_periods=1 yields a value from the first row on
    rolling = sample[cols].rolling(window=win, min_periods=1)
    means = rolling.mean()
    # NaN results of the rolling std (e.g. a window holding a single row) become 0
    stds = rolling.std().fillna(0)

    # suffix the column names so they do not clash with the raw columns
    means.columns = [f'{col}_rm' for col in cols]
    stds.columns = [f'{col}_rstd' for col in cols]

    # attach the rolling features next to the raw columns
    enriched = pd.concat(
        [sample, means.reset_index(drop=True), stds.reset_index(drop=True)],
        axis=1,
    )

    # label 1 when at most 14 rows remain, otherwise label 0
    enriched['label'] = [1 if rul <= 14 else 0 for rul in enriched['RUL']]

    # store the extended dataframe under the same key
    datasets_train[name] = enriched

In [76]:
# Scaling: the samples differ strongly in value ranges, so each sample is scaled on its own

# for every training sample: standardize all features (zero mean, unit variance)
for name, sample in datasets_train.items():
    # everything except the symbol and the targets is scaled (this includes 'timestamp')
    excluded = ('symbol', 'RUL', 'label')
    feature_cols = [col for col in sample.columns if col not in excluded]

    # fit a fresh scaler on this sample only and wrap the result in a dataframe
    scaled = pd.DataFrame(
        preprocessing.StandardScaler().fit_transform(sample[feature_cols]),
        columns=feature_cols,
    )

    # put the unscaled targets back next to the scaled features
    datasets_train[name] = pd.concat([scaled, sample['RUL'], sample['label']], axis=1)

In [77]:
# join datasets together in one training set
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; it also builds the result in one step instead of
# copying the growing frame on every iteration
train_frames = list(datasets_train.values())
# an empty dict still yields an empty dataframe, as the append loop did
train = pd.concat(train_frames, ignore_index=True) if train_frames else pd.DataFrame()

# save training set
train.to_csv('train.csv')

#### Test data:

In [78]:
# Load RULs for test samples (one integer per line of 'RULs.txt')
cur_path = os.getcwd()
base_data_path = os.path.abspath(os.path.join(cur_path, os.pardir, 'service/data'))
file_path = os.path.join(base_data_path, 'RULs.txt')
# the context manager closes the file handle once all lines are read
with open(file_path, "r") as f:
    # skip blank lines (e.g. an empty last line) instead of failing in int('')
    RULs = [int(row) for row in f if row.strip()]

In [79]:
# for every test sample: add rolling mean and rolling standard deviation
for name, sample in datasets_test.items():
    # rolling statistics over `win` rows; min_periods=1 yields a value from the first row on
    rolling = sample[cols].rolling(window=win, min_periods=1)
    means = rolling.mean()
    # NaN results of the rolling std (e.g. a window holding a single row) become 0
    stds = rolling.std().fillna(0)

    # suffix the column names so they do not clash with the raw columns
    means.columns = [f'{col}_rm' for col in cols]
    stds.columns = [f'{col}_rstd' for col in cols]

    # attach the rolling features and store the extended dataframe under the same key
    datasets_test[name] = pd.concat(
        [sample, means.reset_index(drop=True), stds.reset_index(drop=True)],
        axis=1,
    )

In [80]:
# for every test sample: scale features by standardizing, add RUL values, add label
# enumerate supplies the position of each sample, which selects its entry in RULs
for idx, key in enumerate(datasets_test):
    data = datasets_test[key]
    # define features to be scaled
    to_be_scaled = [x for x in data.columns if x not in ['symbol', 'RUL', 'label']]

    # define a scaler for standardizing (-> remove mean and scale to unit variance)
    scaler = preprocessing.StandardScaler()

    # fit and transform the data set
    scaled_features = scaler.fit_transform(data[to_be_scaled])

    # transform scaled data to dataframe
    df = pd.DataFrame(scaled_features, columns=to_be_scaled)

    # extract the last row for the known RUL; copy it so the column assignments
    # below write to an independent frame and not to a slice of `df`
    # (avoids pandas' SettingWithCopyWarning)
    df = df.tail(1).copy()

    # add RUL and label to scaled dataframe
    df['RUL'] = RULs[idx]
    df['label'] = [0 if val > 14 else 1 for val in df['RUL']]

    # reassign changed dataframe
    datasets_test[key] = df

In [81]:
# join datasets together in one test set
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; it also builds the result in one step instead of
# copying the growing frame on every iteration
test_frames = list(datasets_test.values())
# an empty dict still yields an empty dataframe, as the append loop did
test = pd.concat(test_frames, ignore_index=True) if test_frames else pd.DataFrame()

# save test set
test.to_csv('test.csv')

## Step 3: Train and test models

### Imports

In [112]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import LinearRegression

### Data Loading

In [83]:
# both files were written by DataFrame.to_csv: the first row holds the column
# names and the first column holds the row index
read_options = dict(header=0, index_col=0, sep=",", low_memory=False)
train = pd.read_csv('train.csv', **read_options)
test = pd.read_csv('test.csv', **read_options)

In [84]:
# show the loaded training set as the cell output
train

Unnamed: 0,timestamp,open,high,low,close,volume,%_change,open_rm,high_rm,low_rm,...,volume_rm,%_change_rm,open_rstd,high_rstd,low_rstd,close_rstd,volume_rstd,%_change_rstd,RUL,label
0,-1.729347,-0.896124,-0.894863,-0.897456,-0.896025,-1.353518,-0.064687,-0.895426,-0.894076,-0.896771,...,-2.038523,-0.181335,-0.772262,-0.765185,-0.752773,-0.768926,-1.615922,-2.056196,2956,0
1,-1.728537,-0.896830,-0.894863,-0.896739,-0.895319,-0.694639,0.330486,-0.895780,-0.894076,-0.896411,...,-1.541658,0.367420,-0.753179,-0.765185,-0.733846,-0.750199,-0.519517,-1.421798,2955,0
2,-1.727726,-0.896124,-0.894167,-0.898180,-0.896730,-0.482245,-0.261727,-0.895662,-0.893844,-0.896773,...,-1.269258,0.002089,-0.756680,-0.748012,-0.725872,-0.742442,-0.546802,-1.371509,2954,0
3,-1.726916,-0.897537,-0.895558,-0.898898,-0.895319,-0.141486,0.529721,-0.895957,-0.894076,-0.897134,...,-1.004573,0.368944,-0.746423,-0.740899,-0.718079,-0.743569,-0.414141,-1.236277,2953,0
4,-1.726105,-0.891172,-0.885806,-0.892429,-0.885428,2.487850,1.483588,-0.894857,-0.892260,-0.896053,...,-0.052642,1.118889,-0.675967,-0.590725,-0.657930,-0.592619,1.864632,-0.512379,2952,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37860,1.725632,3.009594,2.982472,2.875766,2.883683,2.313813,-1.426019,3.241107,3.252201,3.173267,...,3.264041,-1.541419,3.208348,4.485728,3.590762,3.718949,0.509628,1.095048,4,1
37861,1.727100,2.804093,2.763568,2.511588,2.782265,5.310443,-0.282006,3.194883,3.205784,3.094621,...,3.729432,-1.557606,4.863791,6.089195,6.825880,5.083758,1.735959,1.090257,3,1
37862,1.728569,2.792677,2.821616,2.650983,2.598715,3.079850,-2.289318,3.153488,3.162589,3.025320,...,3.865624,-2.669883,5.946707,6.917594,7.754744,6.948028,1.709510,1.080860,2,1
37863,1.730037,2.601447,2.759372,2.678716,2.842259,1.823507,2.910129,3.052720,3.083045,2.931448,...,3.866999,-1.672021,7.246839,7.352417,7.276411,6.268866,1.707359,2.797473,1,1


### Classification
Classify test data: Predict label to determine if parabolic trend remains during the next 14 timestamps (days) or not

In [85]:
# features: every column except the two targets; classification target: the label
X_train = train.drop(columns=['RUL', 'label'])
y_train = train['label']

In [86]:
# features: every column except the two targets; classification target: the label
X_test = test.drop(columns=['RUL', 'label'])
y_test = test['label']

In [87]:
# show the test labels as the cell output
y_test

0     1
1     0
2     0
3     1
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    1
14    1
15    1
16    1
17    1
18    1
19    1
Name: label, dtype: int64

#### K-Nearest Neighbors (KNN)

In [101]:
# fit a k-nearest-neighbors classifier that looks at the 5 nearest training rows
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)

# evaluate model on the test set
prediction = knn_clf.predict(X_test)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value
# is unchanged, but the order now matches the documented signature
knn_score = accuracy_score(y_test, prediction)
print(f"Accuracy for knn classifier: {knn_score}")

Accuracy for knn classifier: 0.6


#### C-Support Vector Classification (SVC)

In [102]:
# fit a support vector classifier with its default parameters
svc_clf = SVC()
svc_clf.fit(X_train, y_train)

# evaluate model on the test set
prediction = svc_clf.predict(X_test)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value
# is unchanged, but the order now matches the documented signature
svc_score = accuracy_score(y_test, prediction)
print(f"Accuracy for svc classifier: {svc_score}")

Accuracy for svc classifier: 0.55


#### Neural Network: MLPClassifier (MLP)

In [103]:
# fit a multi-layer perceptron classifier (alpha=1, at most 1000 iterations)
# NOTE(review): no random_state is passed, so the score presumably varies
# between runs - confirm before comparing results
mlp_clf = MLPClassifier(alpha=1, max_iter=1000)
mlp_clf.fit(X_train, y_train)

# evaluate model on the test set
prediction = mlp_clf.predict(X_test)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value
# is unchanged, but the order now matches the documented signature
mlp_score = accuracy_score(y_test, prediction)
print(f"Accuracy for mlp classifier: {mlp_score}")

Accuracy for mlp classifier: 0.55


### Regression
Regression for test data: Predict RUL to determine how long parabolic trend remains

In [113]:
# regression targets: the RUL replaces the label used for classification above
y_train, y_test = train['RUL'], test['RUL']

In [116]:
# fit an ordinary least squares model with intercept
linear_reg = LinearRegression(fit_intercept=True)
linear_reg.fit(X_train, y_train)

# evaluate model (root-mean-square error)
prediction = linear_reg.predict(X_test)
# mean_squared_error expects (y_true, y_pred); the squared error is symmetric,
# so the value is unchanged, but the order now matches the documented signature
linear_reg_rmse = sqrt(mean_squared_error(y_test, prediction))
print(f"RMSE for linear regressor: {linear_reg_rmse}")

RMSE for linear regressor: 315.3246908830436
