# dataloader

> Functions that facilitate retrieval of datasets generated by other modules in the package

In [None]:
#| default_exp dataloader

In [None]:
#| export
from typing import List
import requests
from io import StringIO
from importlib import import_module

import pandas as pd

from finsets import metadata 
from finsets.storage import BaseStorage, LocalStorage
from finsets import PACKAGE_DIR

In [None]:
# Sample file used by the `get_text_file_from_url` examples further down (read there with delimiter='\t')
TEST_URL = 'https://www.dropbox.com/s/96xo9f1twlu3525/firmquarter_2022q1.csv?raw=1'

In [None]:
#| export 
def fetch(finsets_func: str, # Which finsets function creates the dataset we want; must include full path to function (e.g. 'wrds.compa.clean')
          storage: BaseStorage=None, # Storage object indicating where the data should be retrieved from / saved to. If None, the dataset is always built and nothing is cached
          save: bool=False, # If dataset needs to be downloaded, this specifies if it should be saved to `storage`
          force_fetch: bool=False, # Whether to redownload/reprocess the dataset even if it exists in storage
          extension: str='.pkl', # Extension that will be automatically suffixed to `finsets_func` to generate the dataset name
          **func_kwargs # Arguments to be passed to `finsets_func`
          ):
    "Return the dataset created by `finsets_func`: loaded from `storage` if a non-stale copy exists there (and `force_fetch` is False), otherwise built by calling the function."

    # Fail before the (potentially slow) build rather than after it
    if save and storage is None:
        raise ValueError("`save=True` requires a `storage` object to save the dataset to.")

    dataset_name = finsets_func + extension

    # Split `finsets_func` into module name and function name; a name without
    # any dot is looked up directly in the top-level `finsets` package
    module, _, func_name = finsets_func.rpartition('.')
    module_path = f'finsets.{module}' if module else 'finsets'

    # Cheap checks first so storage is only queried when it can actually be used
    use_stored = (storage is not None
                  and not force_fetch
                  and storage.exists(dataset_name)
                  and not storage.is_stale(dataset_name))
    if use_stored:
        print(f"Dataset {dataset_name} loaded from storage.")
        return storage.load(dataset_name)

    # `func_kwargs` are only used on this path; they are ignored when the dataset is loaded from storage
    print(f"Building {dataset_name} using {module_path}.{func_name}()")
    builder = getattr(import_module(module_path), func_name)
    df = builder(**func_kwargs)

    if save: storage.save(df, dataset_name)

    return df

In [None]:
# Storage object passed as `storage=` in all the examples below; built on the `data` folder under PACKAGE_DIR
local_data_repo = LocalStorage(PACKAGE_DIR/'data')

In [None]:
#| eval: false
# Per the output below, the dataset is built here (not loaded); `save=True` then writes it to `local_data_repo`.
# `obs_limit=10` is not a `fetch` parameter, so it is forwarded to the finsets function.
df = fetch('wrds.compa.download', save=True, storage=local_data_repo, obs_limit=10)
df.head(1)

Building wrds.compa.download.pkl using finsets.wrds.compa.download()


Unnamed: 0,permno,permco,iid,gvkey,datadate,cusip,cik,tic,fyear,fyr,...,intan,pi,txfo,pifo,xpp,drc,drlt,ap,xacc,itcb
0,25881.0,23369.0,1,1000,1970-12-31,32102,,AE.2,1970.0,12.0,...,0.226,3.62,,,0.579,,,6.114,0.763,0.0


In [None]:
#| eval: false
# Same dataset requested again: this time it is loaded from storage (see output below)
df = fetch('wrds.compa.download', storage=local_data_repo)
df.head(1)

Dataset wrds.compa.download.pkl loaded from storage.


Unnamed: 0,permno,permco,iid,gvkey,datadate,cusip,cik,tic,fyear,fyr,...,intan,pi,txfo,pifo,xpp,drc,drlt,ap,xacc,itcb
0,25881.0,23369.0,1,1000,1970-12-31,32102,,AE.2,1970.0,12.0,...,0.226,3.62,,,0.579,,,6.114,0.763,0.0


In [None]:
#| eval: false
# `df=df` is forwarded as a keyword argument to `wrds.compa.clean`; the result is saved to storage
df = fetch('wrds.compa.clean', save=True, storage=local_data_repo, df=df)
df.head(1)

Building wrds.compa.clean.pkl using finsets.wrds.compa.clean()


Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,dtdate,permco,iid,gvkey,cusip,cik,tic,fyear,fyr,...,intan,pi,txfo,pifo,xpp,drc,drlt,ap,xacc,itcb
permno,Adate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10015,1983,1983-12-31,1983-12-31,6398.0,1,1001,165100,723576,AMFD.,1983.0,12.0,...,0.612,1.574,0.0,,0.084,,,0.65,0.689,0.0


In [None]:
#| eval: false
# The dataset is loaded from storage here (see output), so `df=df` is not used: function kwargs only matter when the dataset is built
df = fetch('wrds.compa.clean', storage=local_data_repo, df=df)
df.head(1)

Dataset wrds.compa.clean.pkl loaded from storage.


Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,dtdate,permco,iid,gvkey,cusip,cik,tic,fyear,fyr,...,intan,pi,txfo,pifo,xpp,drc,drlt,ap,xacc,itcb
permno,Adate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10015,1983,1983-12-31,1983-12-31,6398.0,1,1001,165100,723576,AMFD.,1983.0,12.0,...,0.612,1.574,0.0,,0.084,,,0.65,0.689,0.0


In [None]:
#| eval: false
# `force_fetch=True` rebuilds the dataset even though a stored copy exists.
# `save` is left at its default (False), so the rebuilt version is not written to storage.
df = fetch('wrds.compa.clean', storage=local_data_repo, force_fetch=True, vars=['at'], obs_limit=10)
df.head(1)

Building wrds.compa.clean.pkl using finsets.wrds.compa.clean()


Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,dtdate,permco,iid,gvkey,at
permno,Adate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10015,1983,1983-12-31,1983-12-31,6398.0,1,1001,14.08


In [None]:
#| eval: false
# Loads the previously stored copy: the output has the full set of columns, not just `at` from the forced rebuild above
df = fetch('wrds.compa.clean', storage=local_data_repo)
df.head(1)

Dataset wrds.compa.clean.pkl loaded from storage.


Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,dtdate,permco,iid,gvkey,cusip,cik,tic,fyear,fyr,...,intan,pi,txfo,pifo,xpp,drc,drlt,ap,xacc,itcb
permno,Adate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10015,1983,1983-12-31,1983-12-31,6398.0,1,1001,165100,723576,AMFD.,1983.0,12.0,...,0.612,1.574,0.0,,0.084,,,0.65,0.689,0.0


In [None]:
#| eval: false
# `fetch` calls can be nested: the inner call supplies the `df` argument of `wrds.compa.book_equity`.
# The inner call runs first, which is why its message appears first in the output.
be = fetch('wrds.compa.book_equity',storage=local_data_repo, df=fetch('wrds.compa.clean', storage=local_data_repo))
be.head(1)

Dataset wrds.compa.clean.pkl loaded from storage.
Building wrds.compa.book_equity.pkl using finsets.wrds.compa.book_equity()


Unnamed: 0_level_0,Unnamed: 1_level_0,bookeq,shreq,pref_stock
permno,Adate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10015,1983,7.823,7.823,0.0


In [None]:
#| eval: false
# Same call pattern with a shorter function path; `fred.clean` is built and saved to storage
macro = fetch('fred.clean',save=True, storage=local_data_repo)

Building fred.clean.pkl using finsets.fred.clean()


In [None]:
#| eval: false
# What `fetch` returned for `fred.clean` exposes `.keys()` (a dict, per the output below) rather than a single DataFrame
macro.keys()

dict_keys(['M', 'D', 'Q'])

In [None]:
#| eval: false
# Last two rows of the entry stored under key 'M' (indexed by `Mdate` in the output below)
macro['M'].tail(2)

Unnamed: 0_level_0,dtdate,yield_3mt,yield_10yt,yield_1yt,yield_aaa,yield_baa,yield_fedf,cpi,cpi_nsa,indprod,unemp_rate,rec_dum,rec_prob,cfnai,sent_mich,exp_inflation,pu_bbd,punews_bbd
Mdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2023-09,2023-09-01,5.32,4.38,5.44,5.13,6.16,5.33,307.481,307.789,103.6115,3.8,0.0,0.14,0.02,67.9,3.2,110.09229,113.80537
2023-10,2023-10-01,5.34,4.8,5.42,5.61,6.63,5.33,,,,,,,,,,,


In [None]:
#| export 
def batch_download(finsets_functions: List[str], #List of finsets functions that will generate the datasets we want
                   storage: BaseStorage, # Storage object every dataset is saved to
                   **kwargs # Additional arguments passed on to each `fetch` call
                   ):
    "Rebuild each dataset in `finsets_functions` with `fetch` (ignoring any stored copy) and save it to `storage`."

    # Every dataset is fetched with the same options: always rebuild, always save
    for func_path in finsets_functions:
        fetch(func_path, storage=storage, force_fetch=True, save=True, **kwargs)

In [None]:
#| eval: false
# Rebuilds and saves all four datasets in one call (one "Building ..." message per dataset in the output)
batch_download(['wrds.compa.clean', 'wrds.crspm.clean', 'fred.clean', 'papers.hassan_etal_2019.clean'], 
               storage=local_data_repo)

Building wrds.compa.clean.pkl using finsets.wrds.compa.clean()
Building wrds.crspm.clean.pkl using finsets.wrds.crspm.clean()
Building fred.clean.pkl using finsets.fred.clean()
Building papers.hassan_etal_2019.clean.pkl using finsets.papers.hassan_etal_2019.clean()


In [None]:
#| eval: false
# After the batch download, the dataset is served from storage
compa = fetch('wrds.compa.clean',storage=local_data_repo)
compa.head(1)

Dataset wrds.compa.clean.pkl loaded from storage.


Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,dtdate,permco,iid,gvkey,cusip,cik,tic,fyear,fyr,...,intan,pi,txfo,pifo,xpp,drc,drlt,ap,xacc,itcb
permno,Adate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10000,1986,1986-10-31,1986-10-31,7952.0,1,13007,683916100,,OMFGA,1986.0,10.0,...,0.252,-0.73,0.0,,0.07,,,0.41,,0.0


In [None]:
#| eval: false
# Same for the CRSP monthly dataset saved by the batch download
crspm = fetch('wrds.crspm.clean',storage=local_data_repo)
crspm.head(1)

Dataset wrds.crspm.clean.pkl loaded from storage.


Unnamed: 0_level_0,Unnamed: 1_level_0,date,dtdate,permco,cusip,retx,shrout,ret,prc,ticker,ncusip,shrcd,exchcd,siccd
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10000,1985-12,1985-12-31,1985-12-31,7952.0,68391610,,,,,,,,,


In [None]:
#| eval: false
# Same for the `papers.hassan_etal_2019.clean` dataset saved by the batch download
hassan_etal = fetch('papers.hassan_etal_2019.clean', storage=local_data_repo)
hassan_etal.head(1)

Dataset papers.hassan_etal_2019.clean.pkl loaded from storage.


Unnamed: 0_level_0,Unnamed: 1_level_0,date,dtdate,gvkey,PRisk,NPRisk,Risk,PSentiment,NPSentiment,Sentiment,PRiskT_economic,PRiskT_environment,PRiskT_trade,PRiskT_institutions,PRiskT_health,PRiskT_security,PRiskT_tax,PRiskT_technology,date_earningscall
permno,Qdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10001,2014Q2,2014-06-30,2014-06-30,12994,94.384437,418.03548,85.83691,3737.9378,10709.557,1688.1259,1472.468,1427.333,153.8314,834.1546,1036.248,1329.046,619.5879,953.8204,16-May-2014


In [None]:
#| export
def get_text_file_from_url (url, #Data at this url must be readable with pandas.read_csv
             nrows: int=None, #Get only the first `nrows` lines of the file; the header line counts as one of them. If None, gets the entire file
             delimiter: str=',',
             **pd_read_csv_kwargs, #Additional arguments passed to pandas.read_csv
    ) -> pd.DataFrame:
    "Gets the first `nrows` lines (header included) from the file found at `url`, or the whole file if `nrows` is None. Data at `url` must be separated by `delimiter` and be readable by pandas.read_csv"

    if nrows is None:
        return pd.read_csv(url, delimiter=delimiter, **pd_read_csv_kwargs)

    # Stream the body so only the requested lines are downloaded; the `with` block
    # closes the connection even though we stop reading before the end of the file
    with requests.get(url, stream=True) as response:
        response.raise_for_status()

        # Without a declared encoding, `iter_lines(decode_unicode=True)` yields bytes,
        # which cannot be joined with a str separator below
        if response.encoding is None:
            response.encoding = 'utf-8'

        lines = []
        for i, line in enumerate(response.iter_lines(decode_unicode=True)):
            if i >= nrows: break
            lines.append(line)

    # NOTE: the file is cut on line boundaries, so a quoted field containing a newline may be truncated
    partial_csv = '\n'.join(lines)

    return pd.read_csv(StringIO(partial_csv), delimiter=delimiter, **pd_read_csv_kwargs)


In [None]:
# `nrows=5` reads the first 5 lines of the file: the header plus the 4 data rows shown below
df = get_text_file_from_url(TEST_URL, nrows=5, delimiter='\t')
df

Unnamed: 0,gvkey,date,PRisk,NPRisk,Risk,PSentiment,NPSentiment,Sentiment,PRiskT_economic,PRiskT_environment,...,Covid_Risk,SARS_Exposure,H1N1_Exposure,Zika_Exposure,Ebola_Exposure,Brexit_Exposure,Brexit_Neg_Sentiment,Brexit_Pos_Sentiment,Brexit_Net_Sentiment,Brexit_Risk
0,1004,2002q1,359.55072,2928.6014,168.98235,997.86415,5550.5807,469.39542,9001.563,6331.43,...,0,0,0,0,0,,,,,
1,1004,2002q2,0.0,0.0,0.0,1594.7321,-5656.6074,544.82417,0.0,0.0,...,0,0,0,0,0,,,,,
2,1004,2002q3,0.0,0.0,0.0,49.334494,-17818.418,318.47134,0.0,0.0,...,0,0,0,0,0,,,,,
3,1004,2003q3,0.0,0.0,0.0,2581.9441,81710.483,1314.8283,0.0,0.0,...,0,0,0,0,0,,,,,


In [None]:
# Extra keyword arguments (here `usecols`) are passed through to `pd.read_csv`
df = get_text_file_from_url(TEST_URL, nrows=5, delimiter='\t', usecols=['gvkey','date', 'PRisk'])
df

Unnamed: 0,gvkey,date,PRisk
0,1004,2002q1,359.55072
1,1004,2002q2,0.0
2,1004,2002q3,0.0
3,1004,2003q3,0.0


In [None]:
#| hide 
#| eval: false
# Clean-up: remove everything matched by ../data/* and leave an empty .gitkeep file behind.
# NOTE(review): presumably ../data is the folder behind `local_data_repo` (PACKAGE_DIR/'data') - confirm.
import os, glob
for f in glob.glob('../data/*'): os.remove(f)
# Opening in 'w' mode creates the file (or truncates it if it already exists)
with open('../data/.gitkeep', 'w') as f: pass 

In [None]:
#| hide
# Run nbdev's export step for this notebook
import nbdev; nbdev.nbdev_export()