### Imports and configuration

In [1]:
# setup variables

import os
import json
import boto3
import tqdm
from s2orc.config import S3_BUCKET_NAME, CURRENT_VERSION
from s2orc.api.s3_utils import download_from_s3

# Root directory for everything this notebook downloads; later cells join
# manifest-relative paths onto it.
LOCAL_S2ORC_DIR = 's2orc-data'
# Manifest location passed to download_from_s3 (presumably the object key inside
# the bucket -- confirm against s3_utils) and its local destination. Both are
# scoped to CURRENT_VERSION.
s3_manifest_file = f'{CURRENT_VERSION}/manifest.json'
local_manifest_file = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'manifest.json')


# Bucket handle handed to every download_from_s3 call in the cells below.
s3 = boto3.resource('s3')
bucket = s3.Bucket(S3_BUCKET_NAME)


### Corpus downloads

In [2]:
### download manifest file
# Fetch the manifest for CURRENT_VERSION to its local path under
# LOCAL_S2ORC_DIR; the following cell loads it to decide which files to fetch.
download_from_s3(bucket, s3_manifest_file, local_manifest_file)


In [3]:
### download corpus files
# Window into the manifest, taken in seq_num order: `start` is the index of the
# first entry to fetch and `span` is how many entries to fetch from there.
start = 0
span = 1

# read manifest file
# Decode explicitly as UTF-8 (the interchange encoding for JSON) so parsing does
# not depend on the platform's locale default encoding.
with open(local_manifest_file, 'r', encoding='utf-8') as f:
    manifest = json.load(f)

# Sort by seq_num before slicing so [start, start + span) selects the same
# entries regardless of the order in which the manifest lists them.
entries_to_download = sorted(manifest['files'], key=lambda x: x['seq_num'])[start:(start + span)]
print(f"downloading {len(entries_to_download)} files...")
for file_entry in tqdm.tqdm(entries_to_download):
    # The manifest 'filename' is used both as the source passed to
    # download_from_s3 and as the path relative to LOCAL_S2ORC_DIR.
    local_s2orc_file = os.path.join(LOCAL_S2ORC_DIR, file_entry['filename'])
    download_from_s3(bucket, file_entry['filename'], local_s2orc_file)

  0%|          | 0/1 [00:00<?, ?it/s]

downloading 1 files...


100%|██████████| 1/1 [02:53<00:00, 173.91s/it]


### Additional downloads (metadata)

In [4]:

# download metadata
# Selects `span` manifest entries in seq_num order, beginning at index `start`,
# and fetches the file named by each entry's 'metadata' field.
# Relies on `manifest` having been loaded by the corpus-download cell.
start = 0
span = 10

entries_by_seq_num = sorted(manifest['files'], key=lambda entry: entry['seq_num'])
entries_to_download = entries_by_seq_num[start:(start + span)]
print(f"downloading {len(entries_to_download)} metadata...")
for file_entry in tqdm.tqdm(entries_to_download):
    # The same manifest value serves as the download source and as the path
    # relative to LOCAL_S2ORC_DIR.
    metadata_key = file_entry['metadata']
    local_metadata_file = os.path.join(LOCAL_S2ORC_DIR, metadata_key)
    download_from_s3(bucket, metadata_key, local_metadata_file)

 10%|█         | 1/10 [04:46<42:55, 286.14s/it]


KeyboardInterrupt: 