# Download and store STOOQ data

This notebook contains information on downloading the STOOQ stock and ETF price data that we use for a long-short strategy using Random Forest return predictions.

## Imports & Settings

In [5]:
!pip install ipython-autotime
!pip install pandas_datareader

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.1
Collecting pandas_datareader
  Downloading pandas_datareader-0.9.0-py3-none-any.whl (107 kB)
[K     |████████████████████████████████| 107 kB 7.8 MB/s eta 0:00:01
Collecting lxml
  Downloading lxml-4.6.3-cp37-cp37m-manylinux1_x86_64.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 12.9 MB/s eta 0:00:01
Installing collected packages: lxml, pandas-datareader
Successfully installed lxml-4.6.3 pandas-datareader-0.9.0


In [6]:
import warnings
# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation/parser warnings raised by pandas in the cells below.
warnings.filterwarnings('ignore')

In [7]:
import shutil
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile, BadZipFile

import numpy as np
import pandas as pd
import pandas_datareader.data as web
import requests
from sklearn.datasets import fetch_openml

pd.set_option('display.expand_frame_repr', False)

## Set Data Store path

Modify path if you would like to store the data elsewhere and change the notebooks accordingly

In [8]:
%load_ext autotime
### set path
import os
from pathlib import Path
path_home = os.getcwd()
path_home = Path(path_home)
path_data = path_home / 'data'
if not path_data.exists():
    path_data.mkdir()

time: 866 µs (started: 2021-04-28 00:46:02 +00:00)


## Stooq Historical Market Data
##### Note: the notebook freezes if the number of asset classes increases

In [9]:
### Download price data
# 1. Download **price data** for the selected combination of asset class,
#    market and frequency from [the Stooq website](https://stooq.com/db/h/)
# 2. Store the result under `stooq` using the preferred folder structure outlined on the website.
#    It has the structure: `/data/freq/market/asset_class`, such as `/data/daily/us/nasdaq etfs`.

# temp dir for downloading
stooq_path = path_home / 'stooq'
stooq_path.mkdir(parents=True, exist_ok=True)

STOOQ_URL = 'https://static.stooq.com/db/h/'

market = 'jp'
data_url = f'd_{market}_txt.zip'

# The (connect, read) timeout keeps a stalled connection from hanging the
# kernel; raise_for_status() surfaces HTTP errors directly instead of letting
# an error page reach ZipFile and fail there as a confusing BadZipFile.
reply = requests.get(STOOQ_URL + data_url, timeout=(10, 120))
reply.raise_for_status()
response = reply.content

with ZipFile(BytesIO(response)) as zip_file:
    for file in zip_file.namelist():
        # Only the price files are needed; skip directory entries and anything else.
        if not file.endswith('.txt'):
            continue
        # extract() creates the parent directories, closes the member handle
        # and sanitises the member name (absolute paths and `..` components
        # are stripped), so an archive cannot write outside `stooq_path`.
        zip_file.extract(file, path=stooq_path)

### Add symbols
# Add the corresponding **symbols**, i.e., tickers and names by following the directory tree on the same site.
# You can also adapt the following code snippet using the appropriate asset code that you find by inspecting the url
metadata_dict = {
    ('jp', 'tse etfs'): 34,
    ('jp', 'tse stocks'): 32,
}

for (market, asset_class), code in metadata_dict.items():
    # The listing separates ticker and name with a run of eight spaces. A
    # multi-character separator is only supported by the Python engine, so it
    # is requested explicitly. dtype=str keeps every column textual so that
    # `.str.strip()` cannot fail on a column inferred as numeric.
    df = (pd.read_csv(f'https://stooq.com/db/l/?g={code}',
                      sep='        ',
                      engine='python',
                      dtype=str)
          .apply(lambda x: x.str.strip()))
    df.columns = ['ticker', 'name']
    df = df.drop_duplicates('ticker').dropna()
    print(market, asset_class, f'# tickers: {df.shape[0]:,.0f}')
    # One CSV per asset class: stooq/tickers/<market>/<asset_class>.csv
    path = stooq_path / 'tickers' / market
    path.mkdir(parents=True, exist_ok=True)
    df.to_csv(path / f'{asset_class}.csv', index=False)
    
    
### Store price data
# Loading also removes price files that do not have data or do not appear in the corresponding list of symbols.
# The function `get_stooq_prices_and_tickers` loads data assuming the directory structure described above and takes the following arguments:
# - frequency (see the Stooq website for options as these may change; default is `daily`),
# - market (default: `us`), and
# - asset class (default: `nasdaq etfs`).

def get_stooq_prices_and_tickers(frequency='daily',
                                 market='us',
                                 asset_class='nasdaq etfs'):
    prices = []
    
    tickers = (pd.read_csv(stooq_path / 'tickers' / market / f'{asset_class}.csv'))

    if frequency in ['5 min', 'hourly']:
        parse_dates = [['date', 'time']]
        date_label = 'date_time'
    else:
        parse_dates = ['date']
        date_label = 'date'
    names = ['ticker', 'freq', 'date', 'time', 
             'open', 'high', 'low', 'close','volume', 'openint']
    
    usecols = ['ticker', 'open', 'high', 'low', 'close', 'volume'] + parse_dates
    path = stooq_path / 'data' / frequency / market / asset_class
    print(path.as_posix())
    files = path.glob('**/*.txt')
    for i, file in enumerate(files, 1):
        if i % 500 == 0:
            print(i)
        if file.stem not in set(tickers.ticker.str.lower()):
            print(file.stem, 'not available')
            file.unlink()
        else:
            try:
                df = (pd.read_csv(
                    file,
                    names=names,
                    usecols=usecols,
                    header=0,
                    parse_dates=parse_dates))
                prices.append(df)
            except pd.errors.EmptyDataError:
                print('\tdata missing', file.stem)
                file.unlink()

    prices = (pd.concat(prices, ignore_index=True)
              .rename(columns=str.lower)
              .set_index(['ticker', date_label])
              .apply(lambda x: pd.to_numeric(x, errors='coerce')))
    return prices, tickers


### We'll be using Japanese equities.
# The following code collects the price data for the period 2000-2019 and stores it with the corresponding symbols
market = 'jp'
asset_class = 'tse stocks'
frequency = 'daily'

idx = pd.IndexSlice

print(f'\n{asset_class}')
prices, tickers = get_stooq_prices_and_tickers(frequency=frequency,
                                               market=market,
                                               asset_class=asset_class)

# Sort first so the date level can be sliced by year strings.
prices = prices.sort_index().loc[idx[:, '2000': '2019'], :]
names = prices.index.names
# Drop rows that are exact duplicates (identical index values and columns).
prices = (prices
          .reset_index()
          .drop_duplicates()
          .set_index(names)
          .sort_index())

print('\nNo. of observations per asset')
print(prices.groupby('ticker').size().describe())
# Common file-name prefix, e.g. `stooq_jp_tse_stocks_`.
key = f'stooq_{market}_{asset_class.replace(" ", "_")}_'

# DataFrame.info() writes its report itself and returns None, so it is not
# wrapped in print(). `show_counts` replaces the deprecated `null_counts`.
prices.info(show_counts=True)
fn = key + 'prices.csv'
prices.to_csv(path_data / fn)

tickers.info()
fn = key + 'tickers.csv'
tickers.to_csv(path_data / fn)


### remove the temporary stooq download dir, which makes the notebook freeze
# Uses the same path object the files were written to rather than a shell
# command that depends on the current working directory. Best effort, like
# `rm` from a shell magic: a failure here does not abort the cell.
shutil.rmtree(stooq_path, ignore_errors=True)

jp tse etfs # tickers: 341
jp tse stocks # tickers: 3,773

tse stocks
/home/jupyter/random_forests/stooq/data/daily/jp/tse stocks
500
1000
1500
2000
2500
3000
3500

No. of observations per asset
count    3623.000000
mean     2805.231852
std      1177.353004
min         1.000000
25%      2147.500000
50%      3041.000000
75%      3621.000000
max      4905.000000
dtype: float64
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10163355 entries, ('1301.JP', Timestamp('2005-03-22 00:00:00')) to ('9997.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 5 columns):
 #   Column  Non-Null Count     Dtype  
---  ------  --------------     -----  
 0   open    10163355 non-null  float64
 1   high    10163355 non-null  float64
 2   low     10163355 non-null  float64
 3   close   10163355 non-null  float64
 4   volume  10163355 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 426.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3773 entries, 0 to 3772
Data columns (t

In [11]:
# Bundle the entries directly under data/ into a single archive.
!zip stooq_jp_data.zip data/*

  adding: data/stooq_jp_tse_stocks_prices.csv (deflated 74%)
  adding: data/stooq_jp_tse_stocks_tickers.csv (deflated 58%)
time: 35.6 s (started: 2021-04-28 01:04:55 +00:00)


In [None]:
# Copy the archive to the Cloud Storage location below.
!gsutil cp stooq_jp_data.zip gs://finance_data_bucket/lbk/rf/