### This notebook prepares the downloaded historical stock data and calculates the features necessary to run predictions

Import libraries

In [1]:
import pandas as pd

Select the file 

In [123]:
# this will be refactored in favour of the dropdown menu
from os import listdir
from os.path import join

# single place to change the download location (the literal used to be repeated below)
DATA_DIR = '/data/'

# os.listdir returns entries in arbitrary order; sort so the printed listing is reproducible
downloaded_files = sorted(listdir(DATA_DIR))
print(downloaded_files)
selected_file = 'download_20221125_133248.csv'

# read the file into a pandas DataFrame
df_downloaded = pd.read_csv(join(DATA_DIR, selected_file))
df_downloaded.shape

['download_20221125_133248.csv', 'download_20221125_124002.csv', 'download_20221125_125436.csv', 'download_20221125_125439.csv', 'download_20221125_125442.csv']


(503, 9)

Model target and features

| Target | Description |
| :--- | :--- |
| `adj_close_diff` | difference between the stock's adjusted closing price and its adjusted closing price on the previous day |


| Feature | Description |
| :--- | :--- |
| `highest_5`| Is the price highest in 5 days? |
| `highest_10`| Is the price highest in 10 days? |
| `highest_ever`| Is the price highest ever seen? |
| `is_monday`| Is it a Monday? |
| `is_friday`| Is it a Friday? |
| `decline_duration`| For how many days has the price been in decline? Can be <0 if the price has been increasing|
| `previous_swing`| Difference between the high and low price for the previous day|

First, we need to reshape the dataset so that it contains a record for every calendar day, i.e. weekends and public holidays included

In [124]:
# we need to do this separately for training and validation
# NOTE: month_end() is defined in a later cell of this notebook, so that cell
# has to be executed before this one
df_downloaded['date'] = pd.to_datetime(df_downloaded['Date'])
_data = []

for subset in ['training', 'validation']:

    # keep only the rows that belong to the current subset
    subset_df = (
        df_downloaded
        .query(f'subset == "{subset}"')
        .reset_index(drop=True)
    )

    # pad the range to whole calendar months: first day of the earliest month
    # up to the last day of the latest month
    min_date = subset_df['date'].min().replace(day=1)
    max_date = month_end(subset_df['date'].max())

    # one row per calendar day in the padded range
    new_dates = pd.DataFrame(
        {
            'date': pd.date_range(start=min_date, end=max_date, freq='D')
        })

    # Right-join the *filtered* subset onto the daily calendar. Joining the full
    # download here would pull in rows of the other subset whenever they fall
    # inside the padded window, and .assign() would then relabel them.
    subset_df = (
        subset_df
        .merge(new_dates, how='right', on='date')
        .assign(subset=subset)
        # days without a record inherit the previous record ...
        .ffill(axis=0)
        # ... and days before the first record take the first record
        .bfill(axis=0)
    )
    _data.append(subset_df)

# replace the downloaded dataframe with the upsampled one
df_downloaded = pd.concat(_data, ignore_index=True)

In [126]:
# preview the first five rows of the reshaped frame
df_downloaded.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,stock_code,subset,date
0,2020-01-02,324.980011,326.320007,323.950012,326.320007,312.237305,4070500.0,IVV,training,2020-01-01
1,2020-01-02,324.980011,326.320007,323.950012,326.320007,312.237305,4070500.0,IVV,training,2020-01-02
2,2020-01-03,322.529999,325.059998,322.51001,323.809998,309.835541,4290400.0,IVV,training,2020-01-03
3,2020-01-03,322.529999,325.059998,322.51001,323.809998,309.835541,4290400.0,IVV,training,2020-01-04
4,2020-01-03,322.529999,325.059998,322.51001,323.809998,309.835541,4290400.0,IVV,training,2020-01-05


In [72]:
def month_end(dt):
    '''Convert the date to end of the month.

    Parameters
    ----------
    dt : an object that supports ``replace(day=...)`` and ``timedelta``
        arithmetic, e.g. ``datetime.date``, ``datetime.datetime`` or
        ``pandas.Timestamp``.

    Returns
    -------
    The last day of the month ``dt`` falls in. Only the day (and, through the
    arithmetic, possibly nothing else) is changed, so a time-of-day component
    is carried over as is.
    '''
    # imported locally: no visible cell of the notebook imports timedelta
    from datetime import timedelta

    # day 28 exists in every month and 28 + 4 days always lands in the next month
    in_next_month = dt.replace(day=28) + timedelta(days=4)
    # first day of the next month minus one day == last day of this month
    return in_next_month.replace(day=1) - timedelta(days=1)

In [2]:
from os import getcwd, listdir

# report the directory the kernel is currently running in
current_dir = getcwd()
print(current_dir)

/notebooks


In [8]:
# list the files available in the data directory
data_dir = '/data'
listdir(data_dir)

['download_20221125_124002.csv',
 'download_20221125_125436.csv',
 'download_20221125_125439.csv',
 'download_20221125_125442.csv']

In [9]:
# load one of the downloads and look at its first rows
sample_path = '/data/download_20221125_125442.csv'
test = pd.read_csv(sample_path)
test.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,stock_code,subset
0,2010-01-04,63.0,63.32,62.919998,62.93,62.93,24802,VAS.AX,training
1,2010-01-05,63.700001,63.75,63.470001,63.580002,63.580002,10083,VAS.AX,training
2,2010-01-06,63.77,63.77,63.400002,63.490002,63.490002,8573,VAS.AX,training
3,2010-01-07,63.73,63.73,63.299999,63.299999,63.299999,4932,VAS.AX,training
4,2010-01-08,63.5,63.689999,63.349998,63.389999,63.389999,5223,VAS.AX,training


In [43]:
from datetime import datetime

# first and last calendar day of 2010, parsed into plain date objects
date1, date2 = (
    datetime.strptime(text, '%Y-%m-%d').date()
    for text in ('2010-01-01', '2010-12-31')
)

# one timestamp per calendar day, renumbered from 0 rather than indexed by the dates
daily_range = pd.date_range(start=date1, end=date2, freq='D')
index = daily_range.to_series().reset_index(drop=True)
print(index)

0     2010-01-01
1     2010-01-02
2     2010-01-03
3     2010-01-04
4     2010-01-05
         ...    
360   2010-12-27
361   2010-12-28
362   2010-12-29
363   2010-12-30
364   2010-12-31
Length: 365, dtype: datetime64[ns]
