### This notebook prepares the downloaded historical stock data and calculates the features necessary to run predictions

Import libraries

In [None]:
import sys
import pandas as pd
sys.path.append('/helpers')
from dates import month_end

Import the widgets

In [None]:
from notebook_widgets.data_prep import selectors
# Render every selector's widget so the inputs can be chosen interactively.
for selector_name in selectors:
    selector_widget = selectors[selector_name]['display']
    display(selector_widget)

Import the selected download

In [None]:
# Read the widget selections once so they can be referenced by name below.
selected_file = selectors['file']['display'].value
# Coerce to str to keep the comparison semantics of the previous
# string-formatted query (the value was always compared as a string literal).
selected_stock_code = str(selectors['stock_code']['display'].value)

# Load the chosen download and keep only the rows of the chosen stock.
# `@selected_stock_code` makes pandas bind the value itself instead of
# splicing it into the expression text, so a stock code containing quotes or
# other special characters can no longer break or alter the query.
selected_download = (
    pd.read_csv('/data/' + selected_file + '.csv', parse_dates=['Date'])
    .query('stock_code == @selected_stock_code')
    .reset_index(drop=True)
)

# display number of records
print('Number of records: {}'.format(selected_download.shape[0]))

Let's reshape the dataset so that it contains a record for every calendar day, i.e. weekends and public holidays included

In [None]:
# Upsample each subset to one row per calendar day and stitch the subsets
# back together. Days without a quote (weekends, holidays, month padding) are
# filled from the nearest available row of the SAME subset.
_data = []

for subset in ['training', 'validation']:

    # keep only the rows that belong to the current subset
    subset_df = (
        selected_download
        .query(f'subset == "{subset}"')
        .reset_index(drop=True)
    )

    # widen the subset's date range to whole calendar months
    # (month_end presumably returns the last day of the given date's month -
    # TODO confirm against helpers/dates)
    min_date = pd.to_datetime(subset_df['Date']).min().replace(day=1)
    max_date = month_end(pd.to_datetime(subset_df['Date']).max())

    # one row per calendar day in the widened range
    new_dates = pd.DataFrame(
        {
            'Date': pd.date_range(start=min_date, end=max_date, freq='D')
        })

    # Right-join the subset's OWN rows onto the daily calendar. Joining the
    # whole download here would pull in rows of the other subset whenever the
    # padded month ranges overlap, and the assign below would then relabel
    # them - leaking validation data into training and vice versa.
    subset_df = (
        subset_df
        .merge(new_dates, how='right', on='Date')
        .assign(subset=subset)
        # forward-fill gaps, then back-fill any leading gap at the range start
        .ffill(axis=0)
        .bfill(axis=0)
    )
    _data.append(subset_df)

# replace the downloaded dataframe with the upsampled one
selected_download = pd.concat(_data, ignore_index=True)

# display number of records
print('Number of records: {}'.format(selected_download.shape[0]))

Model target and features

| Target | Description |
| :--- | :--- |
| `adj_close_next_day` | ratio of the next day's adjusted closing price to the current day's (a value above 1 means the price rises on the next day) |


| Feature | Description |
| :--- | :--- |
| `highest_5`| Is the price highest in 5 days? |
| `highest_10`| Is the price highest in 10 days? |
| `highest_30`| Is the price highest in 30 days? |
| `highest_ever`| Is the price highest ever seen? |
| `is_monday`| Is it a Monday? |
| `is_friday`| Is it a Friday? |
| `decline_duration`| How many days has the price been in decline? Can be <0 if the price has been increasing |
| `previous_swing`| Difference between the high and low price for the previous day|

We start by calculating the target `adj_close_next_day` first

In [None]:
# Target column: next day's adjusted close relative to today's.
# pct_change(periods=-1) compares each row with the row that follows it
# (current / next - 1); adding 1 and inverting yields next / current.
selected_download = selected_download.assign(
    adj_close_next_day=lambda frame: 1 / (frame['Adj Close'].pct_change(periods=-1) + 1)
)

Calculate `1 or 0` flag if the price is the highest in 5, 10 and 30 days, e.g. `highest_5`, `highest_10` and `highest_30`

Calculate `1 or 0` flag if the price is the highest ever or cumulative max, e.g. `highest_ever`

In [None]:
from dataframes import rolling_max, cumulative_max

# Run the rolling-max helper once per look-back window (5, 10 and 30 days),
# then the cumulative-max helper, each on the adjusted closing price.
# Each helper is expected to return the frame with its result added.
for window_days in (5, 10, 30):
    selected_download = rolling_max(selected_download, 'Adj Close', window_days)
selected_download = cumulative_max(selected_download, 'Adj Close')

Calculate date features `is_monday` and `is_friday`

In [None]:
# Flag Mondays and Fridays with 1/0 columns. `dt.weekday` numbers the days
# Monday=0 ... Sunday=6. Comparing the whole column at once replaces the
# per-row Python lambda and yields the same int64 flags.
selected_download = selected_download.assign(
    is_monday=lambda frame: frame['Date'].dt.weekday.eq(0).astype('int64'),
    is_friday=lambda frame: frame['Date'].dt.weekday.eq(4).astype('int64'),
)

Calculate price increase and decline streaks, i.e. the number of days the price has been increasing or declining.

In [None]:
from dataframes import increase_decline_streak
# Run the streak helper on the adjusted closing price; it is expected to
# return the frame with the increase/decline streak added.
selected_download = increase_decline_streak(selected_download, 'Adj Close')