In [1]:
pip install -r requirements.txt


Collecting utils (from -r requirements.txt (line 3))
  Downloading utils-1.0.1-py2.py3-none-any.whl (21 kB)
Installing collected packages: utils
Successfully installed utils-1.0.1
Note: you may need to restart the kernel to use updated packages.


Explore stock market dataset from Yahoo Finance

In [None]:
import yfinance as yf

# Ticker handle for Microsoft; the cells below query it for company info,
# price history, dividends and splits.
msft = yf.Ticker("MSFT")
print(msft)

# Sample output recorded with the original notebook (kept as a comment so the
# cell does not echo a string literal as its Out[] value):
#   <yfinance.Ticker object at 0x1a1715e898>


In [None]:
import json

# Pretty-print the ticker's info mapping as indented JSON.
print(json.dumps(msft.info, indent=2))

# Abridged, illustrative sample of the fields (kept as a comment so the cell
# does not echo a string literal as its Out[] value). Note that json.dumps
# prints double-quoted keys and values, unlike the dict-style quoting below:
#   {
#    'quoteType': 'EQUITY',
#    'quoteSourceName': 'Nasdaq Real Time Price',
#    'currency': 'USD',
#    'shortName': 'Microsoft Corporation',
#    'exchangeTimezoneName': 'America/New_York',
#     ...
#    'symbol': 'MSFT'
#   }


In [None]:

# Download the full available price history; `msft_hist` is reused by the
# cells that follow.
msft_hist = msft.history(period="max")
print(msft_hist)

# Sample output recorded with the original notebook (kept as a comment so the
# cell does not echo a string literal as its Out[] value):
#               Open    High    Low    Close      Volume  Dividends  Splits
# Date
# 1986-03-13    0.06    0.07    0.06    0.07  1031788800        0.0     0.0
# 1986-03-14    0.07    0.07    0.07    0.07   308160000        0.0     0.0
# ...
# 2019-04-15  120.94  121.58  120.57  121.05    15792600        0.0     0.0
# 2019-04-16  121.64  121.65  120.10  120.77    14059700        0.0     0.0


In [None]:
# Show corporate actions (dividends and splits).
print(msft.actions)

# Sample output recorded with the original notebook (kept as a comment so the
# cell does not echo a string literal as its Out[] value):
#             Dividends  Splits
# Date
# 1987-09-21       0.00     2.0
# 1990-04-16       0.00     2.0
# ...
# 2018-11-14       0.46     0.0
# 2019-02-20       0.46     0.0

In [None]:

# Show the dividend payments only.
print(msft.dividends)

# Sample output recorded with the original notebook (kept as a comment so the
# cell does not echo a string literal as its Out[] value):
# Date
# 2003-02-19    0.08
# 2003-10-15    0.16
# ...
# 2018-11-14    0.46
# 2019-02-20    0.46

In [None]:
# Show the stock splits only.
print(msft.splits)

# Sample output recorded with the original notebook (kept as a comment so the
# cell does not echo a string literal as its Out[] value):
# Date
# 1987-09-21    2.0
# 1990-04-16    2.0
# ...
# 1999-03-29    2.0
# 2003-02-18    2.0

In [None]:
# Peek at the first five rows of the price history.
msft_hist.head(n=5)

In [None]:
# Peek at the last five rows of the price history.
msft_hist.tail(n=5)

In [None]:
# Shorter working name for the price history (same object, not a copy).
df = msft_hist

# Report the row count, then preview the first five rows.
print(df.shape[0])
df.head(n=5)

Ingest data into darts timeseries


In [None]:
# Number of rows holding at least one missing value: the total row count minus
# the rows that survive dropna().
# https://stackoverflow.com/questions/28199524/best-way-to-count-the-number-of-rows-with-missing-values-in-a-pandas-dataframe
len(df) - len(df.dropna())



In [None]:
# Total number of individual cells (across all columns) that are missing.
df.isna().to_numpy().sum()

Convert data to timeseries format that models can work with

The models need regular time intervals between data points and no missing values.

In [None]:
import pandas as pd
from darts import TimeSeries

# Build a single-column series from the closing price, indexed by the
# DataFrame's own index. fill_missing_dates=True together with freq='B'
# requests a regular business-day calendar; a later cell inspects the NaN
# rows this introduces.
series = TimeSeries.from_dataframe(
    df,
    value_cols=['Close'],
    fill_missing_dates=True,
    freq='B',
)


In [None]:
# Export the regularized series back to a pandas DataFrame; dates that were
# added while regularizing appear as NaN rows.
# Newer darts releases name this export `to_dataframe()`; fall back to the
# older `pd_dataframe()` when the new name is not available.
if hasattr(series, "to_dataframe"):
    reg_df = series.to_dataframe()
else:
    reg_df = series.pd_dataframe()

In [None]:
# Rows of the regularized frame that contain at least one missing value.
len(reg_df) - len(reg_df.dropna())

In [None]:
# Fill the gaps by linear interpolation between neighbouring values
# (pandas' default method, spelled out here for clarity).
reg_df = reg_df.interpolate(method="linear")


In [None]:
# Re-count the rows with missing values after interpolation; expected to be 0.
len(reg_df) - len(reg_df.dropna())

In [None]:
# Rebuild the series from the gap-filled frame so it carries the regularized
# dates together with the interpolated values.
series = TimeSeries.from_dataframe(df=reg_df)

Save prepared timeseries data to local csv for model training

In [None]:
from pathlib import Path

# Make sure the target folder exists first; writing the CSV fails on a fresh
# checkout when `data/` has not been created yet.
Path('data/market_data.csv').parent.mkdir(parents=True, exist_ok=True)

# Persist the prepared series for the model-training step.
series.to_csv('data/market_data.csv')

Make sure the saved data loads back into a TimeSeries

In [None]:
# Read the saved CSV back, using its 'Date' column as the time index.
csv_path = 'data/market_data.csv'
series = TimeSeries.from_csv(csv_path, time_col='Date')

In [None]:
# Display the reloaded series as the cell's output to confirm the round trip.
series