# Retrieval

## Requirements


We do not want to provoke the dynamic type checker.


Must be downloaded by hand

In [None]:
import os
import subprocess
import requests

from local_nbutils.cfg import CFG

In [None]:
# Width of the longest key, used to align the "key : value" lines below.
# `default=0` keeps this cell runnable when CFG is empty; a plain `max()` on an
# empty iterable raises ValueError.
padding_length = max((len(k) for k in CFG), default=0)

print("CFG Dictionary:")
for k, v in CFG.items():
    # Pad every key to the common width so that the colons line up.
    print(f"{k:{padding_length}} : {v}")

# Configs

Next, let us specify all required configurations to have them in one place.

In [None]:
# Remote location of the airports CSV file that is downloaded further below.
URL = "https://davidmegginson.github.io/ourairports-data/airports.csv"
# Local path the downloaded file is written to; read from the shared CFG mapping.
AIRPORTS_DATA_PATH = CFG["AIRPORTS_DATA_PATH"]

Download of airport data

In [None]:
# Upper bound (in seconds) for connecting to / reading from the server.
# Without a timeout, `requests` waits indefinitely and a stalled connection
# would block the notebook forever.
REQUEST_TIMEOUT_S = 30

response = requests.get(URL, timeout=REQUEST_TIMEOUT_S)
# Fail loudly on HTTP errors instead of writing an error page to disk.
response.raise_for_status()

# Make sure the target directory exists (e.g. on a fresh checkout); a bare
# file name has an empty dirname, which stands for the current directory.
os.makedirs(os.path.dirname(AIRPORTS_DATA_PATH) or ".", exist_ok=True)

# Write the raw bytes so the file is stored exactly as served.
with open(AIRPORTS_DATA_PATH, "wb") as f:
    f.write(response.content)

# Data Retrieval (GitHub Version)

Let us also describe the approach to use when you need your SSH key to access the data!

For completeness, let us briefly sketch how the data would be downloaded (but as a CSV file) from [kaggle](https://www.kaggle.com). To that end, kaggle API credentials are required (such credentials can be created on <https://www.kaggle.com/settings>).

We need the following functionality:

In [None]:
import os
import shutil
import subprocess
import tempfile

Let us specify which dataset we concretely want to download and where we want to put it:

In [None]:
# Path to root directory of the repo
root_dir_ = subprocess.check_output(
    ["git", "rev-parse", "--show-toplevel"],
    text=True,
)
ROOT_DIR = root_dir_.strip()
# Path to data directory
DATA_DIR = os.path.join(ROOT_DIR, "data")
# Paths to which dataframe will be saved
DF_CSV_PATH = os.path.join(DATA_DIR, "df.csv") 

REPO_URL = "git@github.com:neuefische/ds-diabetes-challenge.git"
CSV_PATH_REL = "data/titanic.csv"

Note that the referenced repository is not public, so you need to authenticate via SSH!

In [None]:
# Fetch a single file from the remote repository: clone into a temporary
# directory with as little data as possible, restrict the checkout to the
# wanted path, then copy the file to its final destination. The temporary
# directory is removed again when the `with` block exits.
with tempfile.TemporaryDirectory() as tmpdir:
    subprocess.run([
        "git", "clone",
        # Clones shallowly.
        "--depth", "1",
        # Skips blobs initially.
        "--filter=blob:none",
        # Enables sparse checkout mode.
        "--sparse",
        REPO_URL, tmpdir,
    ], check=True)

    subprocess.run([
        # Runs git commands in the cloned repo directory.
        "git", "-C", tmpdir,
        # Initialises sparse checkout.
        "sparse-checkout", "init",
    ], check=True)

    subprocess.run([
        # Runs git commands in the cloned repo directory.
        "git", "-C", tmpdir,
        # Specifies which files to include in the sparse checkout.
        "sparse-checkout", "set", CSV_PATH_REL,
        # Relaxes the checks as sparse-checkout expects directories.
        "--skip-checks",
    ], check=True)

    src_file = os.path.join(tmpdir, CSV_PATH_REL)
    # The destination directory may not exist yet (e.g. on a fresh checkout);
    # without it `shutil.copy2` fails with FileNotFoundError.
    os.makedirs(os.path.dirname(DF_CSV_PATH) or ".", exist_ok=True)
    # copy2 also preserves file metadata such as timestamps where possible.
    shutil.copy2(src_file, DF_CSV_PATH)