In [90]:
import os

import pandas as pd
import requests

In [None]:
# Fetch the rows of Baserow table 639 with human-readable field names.
# The API token is read from the BASEROW_API_TOKEN environment variable so the
# credential never has to be pasted into (or scrubbed out of) the notebook.
# NOTE(review): the list-rows endpoint is presumably paginated and only the
# first page is requested here -- confirm the table fits in a single page.
backups_table = requests.get(
    "https://baserow.datarescueproject.org/api/database/rows/table/639/?user_field_names=true",
    headers={
        "Authorization": f"Token {os.environ['BASEROW_API_TOKEN']}"
    },
    # Without a timeout requests waits indefinitely on an unresponsive server.
    timeout=30,
)
# Fail here on an HTTP error status instead of with a confusing KeyError on
# 'results' in a later cell.
backups_table.raise_for_status()

In [138]:
# Display the first record of the response to see which fields each row carries.
backups_table.json()['results'][0]

{'id': 1,
 'order': '1.00000000000000000000',
 'Name': 'Billion-Dollar Weather and Climate Disasters',
 'Notes': '',
 'URL': 'https://www.ncei.noaa.gov/access/billions/mapping',
 'Websites': [{'id': 48618,
   'value': 'ncei.noaa.gov',
   'order': '496.00000000000000000000'}],
 'Organization': [{'ids': {'database_table_638': 10461,
    'database_table_644': 48618},
   'value': 'National Oceanic and Atmospheric Administration'}],
 'Agency': [{'ids': {'database_table_638': 10461, 'database_table_645': 22},
   'value': 'Department of Commerce'}],
 'Downloads': [{'id': 1,
   'value': 'f9927cab-a7b2-4484-bddb-b247ecb8e7a6',
   'order': '1.00000000000000000000'}],
 'Last modified': '2025-02-10',
 'Last modified by': {'id': 2, 'name': 'Cataloger'}}

In [139]:
def process_dataset_row(d):
    """Flatten one dataset row from the API response into a flat dict.

    Parameters
    ----------
    d : dict
        One element of the response's ``results`` list. It must contain the
        keys ``Name``, ``Notes``, ``id``, ``URL``, ``Websites``,
        ``Organization``, ``Agency``, ``Last modified`` and
        ``Last modified by``.

    Returns
    -------
    dict
        Scalar columns ``dataset``, ``notes``, ``dataset_id``, ``url``,
        ``websites``, ``organization``, ``agency``, ``last_modified`` and
        ``last_modified_by``. List-valued fields are collapsed into a single
        comma-separated string.

    Raises
    ------
    KeyError
        If one of the expected keys is absent from ``d``.
    """
    def get_arr_vals(arr):
        # List fields arrive as a list of {"value": ...} dicts; a null field
        # is treated like an empty list and yields "".
        return ", ".join([x["value"] for x in arr or []])

    # Tolerate a null "Last modified by" instead of raising TypeError when
    # subscripting None.
    modified_by = d["Last modified by"] or {}

    return {
        "dataset": d["Name"],
        "notes": d["Notes"],
        "dataset_id": d["id"],
        "url": d["URL"],
        "websites": get_arr_vals(d["Websites"]),
        "organization": get_arr_vals(d["Organization"]),
        "agency": get_arr_vals(d["Agency"]),
        "last_modified": d["Last modified"],
        "last_modified_by": modified_by.get("name", "")
    }

In [140]:
# Flatten every API record in one pass, then collect the records into a frame.
rows = [process_dataset_row(record) for record in backups_table.json()['results']]

dataset_table = pd.DataFrame(rows)

In [None]:
# Write the flattened datasets frame to datarescue_datasets.csv, leaving out
# the positional index column.
dataset_table.to_csv("datarescue_datasets.csv", index=False)

Unnamed: 0,dataset,notes,dataset_id,url,websites,organization,agency,last_modified,last_modified_by
0,Billion-Dollar Weather and Climate Disasters,,1,https://www.ncei.noaa.gov/access/billions/mapping,ncei.noaa.gov,National Oceanic and Atmospheric Administration,Department of Commerce,2025-02-10,Cataloger
1,American Communities Survey (ACS),,3,https://www.census.gov/programs-surveys/acs,census.gov,Census Bureau,Department of Commerce,2025-03-03,Cataloger
2,BLS Downloads,,6,https://download.bls.gov,download.bls.gov,Bureau of Labor Statistics,Department of Labor,2025-02-10,Admin
3,CDC FTP,,7,https://ftp.cdc.gov/,ftp.cdc.gov,Centers for Disease Control and Prevention,Department of Health and Human Services,2025-02-10,Admin
4,US Census Bureau FTP,,8,ftp://ftp.census.gov,census.gov,Census Bureau,Department of Commerce,2025-02-11,Admin
...,...,...,...,...,...,...,...,...,...
95,Social Wellbeing Report,,102,https://www.imls.gov/research-tools/data-colle...,imls.gov,Institute of Museum and Library Services,Institute of Museum and Library Services,2025-02-21,Daphna
96,IMLS Indicators Workbook: Economic Status and ...,,103,https://www.imls.gov/data/data-catalog/imls-in...,imls.gov,Institute of Museum and Library Services,Institute of Museum and Library Services,2025-02-21,Daphna
97,Library Search & Compare,,104,https://www.imls.gov/search-compare,imls.gov,Institute of Museum and Library Services,Institute of Museum and Library Services,2025-02-21,Daphna
98,PLS Benchmarking Tables,,105,https://www.imls.gov/pls-benchmarking-tables,imls.gov,Institute of Museum and Library Services,Institute of Museum and Library Services,2025-02-21,Daphna


In [None]:
def process_backup_row(d):
    """Flatten one backup row from the API response into a flat dict.

    Parameters
    ----------
    d : dict
        One element of the response's ``results`` list. It must contain the
        keys ``Dataset``, ``Status``, ``Dataset URL``, ``Website``,
        ``Organization``, ``Agency``, ``Backup date``, ``Backup size``,
        ``Maintainer``, ``Backup location``, ``File type``, ``Notes``,
        ``Metadata Available`` and ``Metadata URL``.

    Returns
    -------
    dict
        Scalar columns ``dataset``, ``dataset_id``, ``status``, ``url``,
        ``source_website``, ``organization``, ``agency``, ``download_date``,
        ``size``, ``maintainer``, ``download_location``, ``file_type``,
        ``notes``, ``metadata_available`` and ``metadata_url``. A field that
        is null or an empty list becomes ``""`` (``None`` for ``dataset_id``)
        instead of raising.

    Raises
    ------
    KeyError
        If one of the expected keys is absent from ``d``.
    """
    def get_arr_vals(arr):
        # Join the "value" of every list entry; null/empty list -> "".
        return ", ".join([x["value"] for x in arr or []])

    def first(arr, key="value", default=""):
        # `key` of the first list entry, or `default` when the list is
        # empty or null (avoids IndexError / TypeError on arr[0]).
        return arr[0][key] if arr else default

    def option_value(option):
        # {"value": ...} dict or null -> its value or "".
        return option["value"] if option else ""

    return {
        "dataset": first(d["Dataset"]),
        "dataset_id": first(d["Dataset"], key="id", default=None),
        "status": option_value(d["Status"]),
        "url": first(d["Dataset URL"]),
        "source_website": first(d["Website"]),
        "organization": first(d["Organization"]),
        "agency": first(d["Agency"]),
        "download_date": d["Backup date"],
        "size": d["Backup size"],
        "maintainer": get_arr_vals(d["Maintainer"]),
        "download_location": d["Backup location"],
        "file_type": get_arr_vals(d["File type"]),
        "notes": d["Notes"],
        "metadata_available": option_value(d["Metadata Available"]),
        "metadata_url": d["Metadata URL"]
    }

In [131]:
# Flatten every backup record with process_backup_row. The previous call
# target, process_row, is not defined anywhere in this notebook and raised
# NameError on a fresh kernel.
# NOTE(review): assumes backups_table holds a response whose rows carry the
# backup fields ("Dataset", "Status", ...); the request cell at the top
# fetches table 639, whose sample row has dataset fields -- confirm which
# table must be requested before running this cell.
rows = []
for row in backups_table.json()['results']:
    rows.append(process_backup_row(row))

In [132]:
# Collect the flattened records into a DataFrame, one row per record.
baserow_data = pd.DataFrame(rows)

In [136]:
# Count how many rows reference each dataset_id (most frequent first).
baserow_data["dataset_id"].value_counts()

dataset_id
58    4
65    2
59    2
60    2
68    2
     ..
34    1
33    1
32    1
31    1
99    1
Name: count, Length: 93, dtype: int64

In [133]:
# Display the flattened frame for a visual check of the parsed columns.
baserow_data

Unnamed: 0,dataset,dataset_id,status,url,source_website,organization,agency,download_date,size,maintainer,download_location,file_type,notes,metadata_available,metadata_url
0,Billion-Dollar Weather and Climate Disasters,1,Finished,https://www.ncei.noaa.gov/access/billions/mapping,ncei.noaa.gov,National Oceanic and Atmospheric Administration,Department of Commerce,,,HD,https://dataverse.harvard.edu/dataset.xhtml?pe...,,,needs review,
1,BLS Downloads,6,Finished,https://download.bls.gov,download.bls.gov,Bureau of Labor Statistics,Department of Labor,2025-02-01,47.00,DRP,,,,,
2,CDC FTP,7,Finished,https://ftp.cdc.gov/,ftp.cdc.gov,Centers for Disease Control and Prevention,Department of Health and Human Services,2025-02-01,213.00,DRP,,,,,
3,US Census Bureau FTP,8,Finished,ftp://ftp.census.gov,census.gov,Census Bureau,Department of Commerce,2025-02-01,180.00,DRP,,,"Partial download, server is back online but co...",,
4,National Hurricane Center (NHC),9,Finished,https://www.nhc.noaa.gov/archive,nhc.noaa.gov,NOAA/National Hurricane Center,Department of Commerce,2025-02-06,61.00,DRP,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Mental Health Client-Level Data (MH-CLD),95,Finished,https://www.samhsa.gov/data/data-we-collect/mh...,samsha.gov,Substance Abuse and Mental Health Services Adm...,Department of Health and Human Services,2025-02-09,,"DRP, DL",https://www.datalumos.org/datalumos/project/21...,"ZIP, PDF",,,
96,Drug Abuse Warning Network (DAWN),96,Finished,https://www.samhsa.gov/data/data-we-collect/da...,samsha.gov,Substance Abuse and Mental Health Services Adm...,Department of Health and Human Services,2025-02-09,,"DRP, DL",https://www.datalumos.org/datalumos/project/21...,"ZIP, PDF",,,
97,State Library Administrative Agency Survey (SLAA),97,Finished,https://www.imls.gov/research-evaluation/data-...,imls.gov,Institute of Museum and Library Services,Institute of Museum and Library Services,2025-02-06,,"DRP, DL",https://www.datalumos.org/datalumos/project/21...,"ZIP, PDF","Old repo stops at 2016, new repo stops 2022 an...",,
98,Public Libraries Survey (PLS),98,Finished,https://www.imls.gov/research-evaluation/data-...,imls.gov,Institute of Museum and Library Services,Institute of Museum and Library Services,2025-02-06,,"DRP, DL",https://www.datalumos.org/datalumos/project/21...,"ZIP, PDF","Old repo has 2013-2014 and incorrect metadata,...",,


In [11]:
# Load the dataset listing from DataRescueProject_datasets.csv.
# NOTE(review): no cell above writes a file with this name (the export cell
# writes datarescue_datasets.csv) -- presumably produced separately; confirm.
drp_datasets = pd.read_csv('DataRescueProject_datasets.csv')

In [66]:
# Show the dtype pandas inferred for each column of the loaded CSV.
drp_datasets.dtypes

dataset               object
status                object
url                   object
source_website        object
organization          object
agency                object
download_date         object
size                  object
maintainer            object
download_location     object
file_type             object
notes                 object
metadata_available      bool
metadata_url          object
dtype: object

In [32]:
# Lower-case every column label and replace missing values with empty strings,
# so later string formatting never sees NaN; then preview the first rows.
drp_datasets = drp_datasets.rename(columns=str.lower).fillna('')
drp_datasets.head()

Unnamed: 0,dataset,status,url,source_website,organization,agency,download_date,size,maintainer,download_location,file_type,notes,metadata_available,metadata_url
0,Billion-Dollar Weather and Climate Disasters,Finished,https://www.ncei.noaa.gov/access/billions/mapping,ncei.noaa.gov,National Oceanic and Atmospheric Administration,Department of Commerce,,,,https://dataverse.harvard.edu/dataset.xhtml?pe...,,,False,
1,BLS Downloads,Finished,https://download.bls.gov,download.bls.gov,Bureau of Labor Statistics,Department of Labor,2025-02-01,47.00GB,DRP,,,,False,
2,CDC FTP,Finished,https://ftp.cdc.gov/,ftp.cdc.gov,Centers for Disease Control and Prevention,Department of Health and Human Services,2025-02-01,213.00GB,DRP,,,,False,
3,US Census Bureau FTP,Finished,ftp://ftp.census.gov,census.gov,Census Bureau,Department of Commerce,2025-02-01,180.00GB,DRP,,,"Partial download, server is back online but co...",False,
4,National Hurricane Center,Finished,https://www.nhc.noaa.gov/archive,nhc.noaa.gov,NOAA/National Hurricane Center,Department of Commerce,2025-02-06,61.00GB,DRP,,,,False,


In [None]:
import re
def slugify(string):
    """Turn free text into a lowercase, hyphen-separated slug.

    The text is first passed through ``clean_text``. Every character that is
    not a word character, whitespace or a hyphen is then dropped, each run of
    whitespace becomes a single hyphen, and the result is lowercased.
    """
    cleaned = clean_text(string)
    # Drop anything that is not a word character, whitespace or '-'.
    kept = re.sub(r'[^\w\s-]', '', cleaned)
    # Collapse each whitespace run into one hyphen.
    hyphenated = re.sub(r'\s+', '-', kept)
    return hyphenated.lower()

def clean_text(string):
    """Strip one leading hyphen and turn every colon into a hyphen.

    Presumably used so the value can be embedded in YAML front matter
    without being read as a list item or a nested mapping.
    """
    # Only a hyphen at the very start of the string is removed.
    without_leading_dash = re.sub(r'^-', '', string)
    return without_leading_dash.replace(':', '-')

In [84]:
def create_dataset_md(row):
    """Write the dataset and organization markdown stubs for one row.

    Two files holding only a ``---`` delimited front-matter block are written:

    * ``../_datasets/<slug of row['dataset']>.md`` with the dataset metadata
      and its two resources (data source and archive link);
    * ``../_organizations/<slug of organization>.md`` with the organization
      title and an empty description.

    Both files are opened in ``'w'`` mode, so rows that produce the same slug
    overwrite each other's file.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide string values for ``dataset``, ``organization``,
        ``agency``, ``notes``, ``status``, ``size``, ``maintainer`` and
        ``download_date``, plus ``metadata_available``, ``metadata_url``,
        ``url``, ``download_location`` and ``file_type``. The row is not
        modified.

    Returns
    -------
    None
    """
    # An empty organization falls back to a placeholder; a local is used so
    # the caller's row is left untouched.
    organization = row['organization']
    if organization == '':
        organization = 'Unknown'

    ## Defining the schema, filename and path
    schema = 'data_rescue_project'
    dataset_filename = slugify(row['dataset'])
    dataset_path = "../_datasets"
    org_filename = slugify(organization)
    org_path = "../_organizations"

    ## Creating the dataset markdown file
    # Trailing spaces inside some literals are kept so the emitted text stays
    # identical to what earlier runs produced.
    dataset_lines = [
        "---",
        f"schema: {schema} ",
        f"title: {clean_text(row['dataset'])}",
        f"organization: {clean_text(organization)}",
        f"agency: {clean_text(row['agency'])}",
        f"notes: {clean_text(row['notes'])}",
        f"status: {clean_text(row['status'])}",
        f"size: {clean_text(row['size'])}",
        f"maintainer: {clean_text(row['maintainer'])}",
        f"download_date: {clean_text(row['download_date'])}",
        f"metadata_available: {str(row['metadata_available'])}",
        f"metadata_url: {row['metadata_url']}",
        "resources:",
        "  - name: Data Source",
        f"    url: {row['url']}",
        "    format: html",
        "  - name: Link to archive",
        f"    url: {row['download_location']}",
        f"    format: {row['file_type']}",
        "---",
    ]
    dataset_md = "\n".join(dataset_lines) + "\n"

    ## Writing the dataset markdown file
    # Explicit UTF-8: the default encoding is locale-dependent and can fail
    # on (or mangle) non-ASCII titles.
    with open(f'{dataset_path}/{dataset_filename}.md', 'w', encoding='utf-8') as output:
        output.write(dataset_md)

    ## Creating the organization markdown file
    org_lines = [
        "---",
        f"title: {clean_text(organization)} ",
        "description: ",
        "---",
    ]
    org_md = "\n".join(org_lines) + "\n"

    ## Writing the organization markdown file
    with open(f'{org_path}/{org_filename}.md', 'w', encoding='utf-8') as output:
        output.write(org_md)

In [85]:
# Call create_dataset_md once per row (axis=1 passes each row to the function).
# The call is made for its file-writing side effect; create_dataset_md returns
# nothing, so the resulting Series contains only None.
drp_datasets.apply(create_dataset_md, axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
355    None
356    None
357    None
358    None
359    None
Length: 360, dtype: object