In [1]:
import pandas as pd
import requests

In [2]:
# Load the backups and datasets tables from the CSV exports published in the
# datarescueproject/portal GitHub repository (main branch, baserow_exports/).
backups = pd.read_csv("https://raw.githubusercontent.com/datarescueproject/portal/refs/heads/main/baserow_exports/datarescue_backups.csv")
datasets = pd.read_csv("https://raw.githubusercontent.com/datarescueproject/portal/refs/heads/main/baserow_exports/datarescue_datasets.csv")

In [3]:
# Lower-case the column names and replace missing values with empty strings,
# then preview the first rows.
datasets.columns = datasets.columns.str.lower()
datasets = datasets.fillna('')
datasets.head()

Unnamed: 0,dataset,notes,dataset_id,url,websites,organization,agency,last_modified
0,Billion-Dollar Weather and Climate Disasters,,1,https://www.ncei.noaa.gov/access/billions/mapping,ncei.noaa.gov,National Oceanic and Atmospheric Administration,Department of Commerce,2025-02-10
1,American Communities Survey (ACS),,3,https://www.census.gov/programs-surveys/acs,census.gov,Census Bureau,Department of Commerce,2025-03-03
2,BLS Downloads,,6,https://download.bls.gov,download.bls.gov,Bureau of Labor Statistics,Department of Labor,2025-02-10
3,CDC FTP,,7,https://ftp.cdc.gov/,ftp.cdc.gov,Centers for Disease Control and Prevention,Department of Health and Human Services,2025-02-10
4,US Census Bureau FTP,,8,ftp://ftp.census.gov,census.gov,Census Bureau,Department of Commerce,2025-02-11


In [4]:
# Distinct agency names present in the datasets table (the index of the value counts).
agencies = datasets.agency.value_counts().keys()
# Category slugs (lowercase, hyphen-separated).
categories = [
    'arts-culture-history', 'health-human-services',
    'budget-finance', 'parks-recreation', 'economy', 'planning-zoning',
    'education', 'public-safety', 'elections-politics', 'real-estate-land-records',
    'environment', 'transportation', 'food', 'uncategorized',
]


In [5]:
# Lookup table from agency name to a single display category.
# Keys include the empty string (no agency recorded) and a few comma-joined
# multi-agency values; presumably they mirror the values of datasets.agency
# at the time this cell was written -- verify against the current export.
agency_to_category = {
    'Department of Health and Human Services': 'Health / Human Services',
    'Department of Commerce': 'Economy',
    'Department of Housing and Urban Development': 'Real Estate / Land Records',
    'Department of Veterans Affairs': 'Health / Human Services',
    'National Endowment for the Humanities': 'Arts / Culture / History',
    'AmeriCorps': 'Public Safety',
    'Department of Education': 'Education',
    'Federal Mediation and Conciliation Service': 'Economy',
    'Department of Homeland Security': 'Public Safety',
    'Department of Energy': 'Environment',
    'National Labor Relations Board': 'Economy',
    'Environmental Protection Agency': 'Environment',
    'Consumer Financial Protection Bureau': 'Budget / Finance',
    'Federal Housing Finance Agency': 'Real Estate / Land Records',
    'Department of the Treasury': 'Budget / Finance',
    'Institute of Museum and Library Services': 'Arts / Culture / History',
    'Department of the Interior': 'Parks / Recreation',
    'General Services Administration': 'Economy',
    'Department of Labor': 'Economy',
    'U.S. Agency for International Development': 'Health / Human Services',
    'Department of Transportation': 'Transportation',
    'National Aeronautics and Space Administration': 'Environment',
    '': 'Uncategorized',
    'Department of Justice': 'Public Safety',
    'Department of the Interior, National Parks Service': 'Parks / Recreation',
    'Department of State': 'Elections / Politics',
    'National Science Foundation': 'Education',
    'Department of Health and Human Services, Department of Commerce': 'Health / Human Services',
    'Consumer Financial Protection Bureau, Federal Housing Finance Agency': 'Budget / Finance',
    'U.S. Department of Agriculture': 'Food',
    'Office of Management and Budget': 'Budget / Finance'
}

In [6]:
# Lower-case the column names and replace missing values with empty strings,
# then preview the first rows.
backups.columns = backups.columns.str.lower()
backups = backups.fillna('')
backups.head()

Unnamed: 0,dataset,dataset_id,status,url,source_website,organization,agency,download_date,size,maintainer,download_location,file_type,notes,metadata_available,metadata_url
0,Billion-Dollar Weather and Climate Disasters,1,Finished,https://www.ncei.noaa.gov/access/billions/mapping,ncei.noaa.gov,National Oceanic and Atmospheric Administration,Department of Commerce,2025-02-10,0.15,HD,https://dataverse.harvard.edu/dataset.xhtml?pe...,ZIP,,yes,https://dataverse.harvard.edu/dataset.xhtml?pe...
1,BLS Downloads,6,Finished,https://download.bls.gov,download.bls.gov,Bureau of Labor Statistics,Department of Labor,2025-02-01,47.0,DRP,,,,,
2,CDC FTP,7,Finished,https://ftp.cdc.gov/,ftp.cdc.gov,Centers for Disease Control and Prevention,Department of Health and Human Services,2025-02-01,213.0,DRP,,,,,
3,US Census Bureau FTP,8,Finished,ftp://ftp.census.gov,census.gov,Census Bureau,Department of Commerce,2025-02-01,180.0,DRP,,,"Partial download, server is back online but co...",,
4,National Hurricane Center (NHC),9,Finished,https://www.nhc.noaa.gov/archive,nhc.noaa.gov,NOAA/National Hurricane Center,Department of Commerce,2025-02-06,61.0,DRP,,,,,


In [11]:
import re
def slugify(string):
    """Turn ``string`` into a lowercase, hyphen-separated slug.

    The text is first passed through ``clean_text``; every character that is
    not a word character, whitespace or a hyphen is then dropped, each run of
    whitespace becomes a single hyphen, and the result is lower-cased.
    """
    cleaned = clean_text(string)
    # Keep only word characters, whitespace and hyphens.
    without_specials = re.sub(r'[^\w\s-]', '', cleaned)
    # One hyphen per run of whitespace.
    hyphenated = re.sub(r'\s+', '-', without_specials)
    return hyphenated.lower()

def clean_text(string):
    """Return ``string`` stripped of characters that are unwanted in the generated markdown.

    In order: newline, carriage-return and tab characters are deleted, a single
    leading hyphen is removed, and finally every colon is deleted (colons are
    removed, not replaced).
    """
    # Drop newline, carriage-return and tab characters.
    for control_char in ('\n', '\r', '\t'):
        string = string.replace(control_char, '')
    # Strip one leading hyphen, if present.
    if string.startswith('-'):
        string = string[1:]
    # Delete every colon.
    return string.replace(':', '')

def get_dataset_categories(agency):
    """Return the display category for ``agency``.

    The agency name is looked up in ``agency_to_category``. A name that is not
    in the mapping yields 'Uncategorized' (the same label the mapping assigns
    to an empty agency) instead of raising ``KeyError``, so one unmapped
    agency does not abort processing of the whole table.
    """
    return agency_to_category.get(agency, 'Uncategorized')

In [8]:
def get_metadata_availability(dataset_id, data_backups=None):
    """Report whether any backup of ``dataset_id`` has metadata available.

    Parameters:
        dataset_id: value compared against the ``dataset_id`` column.
        data_backups: optional DataFrame of backups to search; it must have the
            columns ``dataset_id``, ``metadata_available`` and ``metadata_url``.
            When omitted, the module-level ``backups`` frame is used.

    Returns:
        A ``(status, url)`` tuple:
        * ``("Yes", <metadata_url>)`` if at least one matching backup has
          ``metadata_available == "yes"``; the URL is taken from the first
          backup that is flagged "yes" (not merely the first matching backup);
        * ``("Under Review", "")`` if none is "yes" but one is "needs review";
        * ``("No", "")`` otherwise, including when no backup matches.
    """
    source = backups if data_backups is None else data_backups
    # Filter once and reuse the result for both the flag and the URL lookup.
    matching = source[source.dataset_id == dataset_id]
    flags = matching.metadata_available.values
    if "yes" in flags:
        with_metadata = matching[matching.metadata_available == "yes"]
        return "Yes", with_metadata.metadata_url.values[0]
    elif "needs review" in flags:
        return "Under Review", ""
    else:
        return "No", ""

def create_dataset_md(row):
    """Write the markdown files for one dataset row and for its organization.

    ``row`` is indexed by the keys 'dataset', 'dataset_id', 'organization',
    'agency', 'websites', 'url', 'notes' and 'last_modified'. An empty
    organization is replaced by 'Unknown' (the value is written back to ``row``).

    Two files are written, each consisting of a front-matter block delimited
    by ``---`` lines:
    * ``_datasets/<slug of dataset>.md`` with the dataset fields, the metadata
      availability, the category and one ``resources`` entry per row of the
      module-level ``backups`` frame whose ``dataset`` equals the dataset name;
    * ``_organizations/<slug of organization>.md`` with the organization title.

    Both directories are created if missing, and files are written as UTF-8.
    Returns ``None``.
    """
    # Function-scope import: this notebook only imports ``os`` in a later cell.
    import os

    if row['organization'] == '':
        row['organization'] = 'Unknown'
    ## Defining the schema, filename and path
    schema = 'data_rescue_project'
    dataset_filename = slugify(row['dataset'])
    dataset_path = "_datasets"
    org_filename = slugify(row['organization'])
    org_path = "_organizations"

    ## Get backups for each dataset (matched by dataset name)
    data_backups = backups[backups.dataset == row['dataset']]
    metadata_available, metadata_url = get_metadata_availability(row['dataset_id'])

    ## Creating the dataset markdown file
    ## Dataset-level information
    dataset_md = "---\n"
    dataset_md += f"schema: {schema} \n"
    dataset_md += f"title: {clean_text(row['dataset'])}\n"
    dataset_md += f"organization: {clean_text(row['organization'])}\n"
    dataset_md += f"agency: {clean_text(row['agency'])}\n"
    dataset_md += f"websites: {row['websites']}\n"
    dataset_md += f"data_source: {row['url']}\n"
    dataset_md += f"description: {clean_text(row['notes'])}\n"
    dataset_md += f"last_modified: {row['last_modified']}\n"
    ## Check if any backups have metadata available and populate
    dataset_md += f"metadata_available: {metadata_available}\n"
    dataset_md += f"metadata_url: {metadata_url}\n"
    dataset_md += "category:\n"
    # Look the category up with the raw agency value: the keys of
    # agency_to_category are raw agency names, so cleaning first could miss.
    dataset_md += f"  - {get_dataset_categories(row['agency'])}\n"

    dataset_md += "resources:\n"
    ## Resource-level information: one entry per backup, keyed by its index label
    for index, backup_row in data_backups.iterrows():
        dataset_md += f"  - id: {index}\n"
        dataset_md += f"    url: {backup_row['download_location']}\n"
        dataset_md += f"    format: {clean_text(backup_row['file_type'])}\n"
        dataset_md += f"    status: {clean_text(backup_row['status'])}\n"
        dataset_md += f"    size: {backup_row['size']}\n"
        dataset_md += f"    download_date: {backup_row['download_date']}\n"
        dataset_md += f"    maintainer: {clean_text(backup_row['maintainer'])}\n"
        dataset_md += f"    notes: {clean_text(backup_row['notes'])}\n"
    dataset_md += "---\n"

    ## Writing the dataset markdown file
    os.makedirs(dataset_path, exist_ok=True)
    with open(f'{dataset_path}/{dataset_filename}.md', 'w', encoding='utf-8') as output:
        output.write(dataset_md)

    ## Creating the organization markdown file
    org_md = "---\n"
    org_md += f"title: {clean_text(row['organization'])} \n"
    org_md += "description: \n"
    org_md += "---\n"

    ## Writing the organization markdown file
    os.makedirs(org_path, exist_ok=True)
    with open(f'{org_path}/{org_filename}.md', 'w', encoding='utf-8') as output:
        output.write(org_md)

In [9]:
# Show the (rows, columns) shape of the datasets table.
datasets.shape

(731, 8)

In [13]:
# Call create_dataset_md once per dataset row; the function returns None,
# so the displayed Series of None values is only a by-product of apply.
datasets.apply(create_dataset_md, axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
726    None
727    None
728    None
729    None
730    None
Length: 731, dtype: object

In [10]:
import os
def remove_files_os(dir_path):
    """Delete every regular file directly inside ``dir_path``.

    Sub-directories and their contents are left untouched. The directory
    listing is taken once, before any file is removed.
    """
    candidates = (os.path.join(dir_path, name) for name in os.listdir(dir_path))
    for candidate in candidates:
        if not os.path.isfile(candidate):
            continue
        os.remove(candidate)


In [5]:
# Scratch check: an empty list is falsy, so the branch runs and prints "test".
a = []
if not a:
    print("test")

test


In [None]:
def replace_multiple_spaces(string):
    """Return ``string`` with each run of whitespace collapsed to one space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', string)

# Example usage: runs of spaces in the sample sentence are collapsed to single
# spaces and the result is printed.
example_string = "This   is  a   string    with multiple   spaces."
cleaned_string = replace_multiple_spaces(example_string)
print(cleaned_string)

In [11]:
# Delete every file directly inside the _datasets directory.
remove_files_os('_datasets')

In [8]:
def clean_text(string):
    """Return ``string`` cleaned up for use as a one-line front-matter value.

    Steps, in order:
    1. newline, carriage-return and tab characters are deleted;
    2. every remaining run of whitespace is collapsed to a single space;
    3. leading special characters (anything that is not a letter or a digit)
       are stripped, so leading digits such as a year ("2020 Census") and
       non-ASCII letters are kept;
    4. trailing colons are stripped;
    5. every other colon is deleted unless it belongs to a URL scheme, i.e. it
       is directly preceded by "http"/"https" or directly followed by "//"
       (so "ftp://..." keeps its colon too).
    """
    # Remove escape characters (newline, carriage return, tab)
    string = string.replace('\n', '').replace('\r', '').replace('\t', '')
    # Collapse multiple spaces
    string = re.sub(r'\s+', ' ', string)
    # Remove leading special characters; letters and digits are preserved.
    string = re.sub(r'^[\W_]+', '', string)
    # Remove trailing ':'
    string = string.rstrip(':')
    # Remove remaining ':' except those that are part of a URL scheme.
    string = re.sub(r'(?<!http)(?<!https):(?!//)', '', string)

    return string

In [9]:
# Spot-check clean_text on a URL that starts with a stray comma.
test_string = ",https://www.ncei.noaa.gov/metadata/geoportal/rest/metadata/item/gov.noaa.ncdcC01557/html#"
clean_text(test_string)

'https://www.ncei.noaa.gov/metadata/geoportal/rest/metadata/item/gov.noaa.ncdcC01557/html#'

In [None]:
# Strip leading commas from test_string with str.lstrip and print the result.
test_string = test_string.lstrip(',')
print(test_string)

In [3]:
%load_ext autoreload
%autoreload 2

from create_markdowns import *
import os

In [4]:
def remove_files_os(dir_path):
    """Remove all regular files located directly in ``dir_path``.

    Entries that are not files (for example sub-directories) are skipped.
    """
    for entry_name in os.listdir(dir_path):
        target = os.path.join(dir_path, entry_name)
        if not os.path.isfile(target):
            continue
        os.remove(target)
            
# Remove the files in the _datasets, _organizations and _dataset_categories
# directories (one level above the notebook), then regenerate them.
remove_files_os('../_datasets')
remove_files_os('../_organizations')
remove_files_os('../_dataset_categories')

# create_markdowns is not defined in this notebook; presumably it comes from
# the star import of the create_markdowns module -- TODO confirm.
create_markdowns()

FileNotFoundError: [Errno 2] No such file or directory: './_datasets'

In [5]:
# Load the organizations export and print the ';'-separated Categories value,
# split into a list, for the first row matching each of two organizations.
organizations = pd.read_csv("https://raw.githubusercontent.com/datarescueproject/portal/refs/heads/main/baserow_exports/datarescue_organizations.csv")
print(organizations[organizations['Organizations'] == 'National Oceanic and Atmospheric Administration']['Categories'].str.split(';').values[0])
print(organizations[organizations['Organizations'] == 'Department of the Interior']['Categories'].str.split(';').values[0])

['Climate & Environment']
['Climate & Environment', 'Humanitarian & Disaster Relief']


In [6]:
# Display the organizations table.
organizations

Unnamed: 0,Organizations,Categories
0,American Battle Monuments Commission,Arts & Culture
1,Barry Goldwater Scholarship and Excellence in ...,Education
2,Consumer Financial Protection Bureau,Business & Economy
3,Delta Regional Authority,Business & Economy
4,Denali Commission,Business & Economy;Infrastructure
...,...,...
416,Federal Mediation and Conciliation Service,Labor & Employment
417,Health Resources and Services Administration,Health & Healthcare;Social Services
418,Institute of International Education,Education
419,U.S. Patent and Trademark Office,Business & Economy


In [None]:
# Flatten every cell of the organizations table into one 1-D array and print it.
combined_array = organizations.values.flatten()
print(combined_array)

In [69]:
# Reload the three CSV exports from the portal repository.
backups = pd.read_csv("https://raw.githubusercontent.com/datarescueproject/portal/refs/heads/main/baserow_exports/datarescue_backups.csv")
datasets = pd.read_csv("https://raw.githubusercontent.com/datarescueproject/portal/refs/heads/main/baserow_exports/datarescue_datasets.csv")
organizations = pd.read_csv("https://raw.githubusercontent.com/datarescueproject/portal/refs/heads/main/baserow_exports/datarescue_organizations.csv")

# Normalize backups: lower-case column names, empty string for missing values.
backups.columns = backups.columns.str.lower()
backups = backups.fillna('')
backups.head()  # not the last expression of the cell, so nothing is displayed

# Same normalization for datasets.
datasets.columns = datasets.columns.str.lower()
datasets = datasets.fillna('')
datasets.head()  # not the last expression of the cell, so nothing is displayed

# organizations keeps its original column names; only missing values are filled.
organizations = organizations.fillna('')

In [72]:
# Keep only datasets whose name contains "Environmental Justice".
# This overwrites `datasets`; reload the export to get the full table back.
datasets = datasets[datasets['dataset'].str.contains("Environmental Justice")]

In [76]:
# Generate the markdown for the row labelled 54. The call passes three
# arguments, so it presumably targets the create_dataset_md imported from
# create_markdowns rather than the one-argument version defined in this notebook.
create_dataset_md(datasets.loc[54],backups, organizations)

In [82]:
# Select the dataset row with index label 54 for step-by-step debugging.
row = datasets.loc[54]

In [83]:
# Inline, step-by-step run of the dataset-markdown generation for the single
# `row` selected earlier. It writes one dataset file and one organization file
# under ../_datasets and ../_organizations.
# NOTE(review): get_metadata_availability is called with two arguments here,
# so this presumably relies on the helper versions imported from
# create_markdowns -- confirm.
if row['organization'] == '':
    row['organization'] = 'Unknown'
# Defining the schema, filename and path
schema = 'data_rescue_project'
dataset_filename = slugify(row['dataset'])
dataset_path = "../_datasets"
org_filename = slugify(row['organization'])
org_path = "../_organizations"

# Get backups for each dataset
data_backups = backups[backups.dataset == row['dataset']]
metadata_available, metadata_url = get_metadata_availability(row['dataset_id'], data_backups)
# Creating the dataset markdown file
# Dataset-level information
dataset_md = "---\n"
dataset_md += f"schema: {schema} \n"
dataset_md += f"title: {clean_text(row['dataset'])}\n"
dataset_md += f"organization: {clean_text(row['organization'])}\n"
dataset_md += f"agency: {clean_text(row['agency'])}\n"
dataset_md += f"websites: {clean_text(row['websites'])}\n"
dataset_md += f"data_source: {clean_text(row['url'])}\n"
dataset_md += f"description: {clean_text(row['notes'])}\n"
dataset_md += f"last_modified: {row['last_modified']}\n"
# Check if any backups have metadata available and populate
dataset_md += f"metadata_available: {metadata_available}\n"
dataset_md += f"metadata_url: {clean_text(metadata_url)}\n"
dataset_md += "category:\n"
cats = get_dataset_category(row, organizations)

# One list item per category returned for this dataset
for cat in cats:
    dataset_md += f"  - {cat} \n"
    
dataset_md += "resources:\n"
# Resource-level information
for index, backup_row in data_backups.iterrows():
    dataset_md += f"  - id: {index}\n"
    dataset_md += f"    url: {clean_text(backup_row['download_location'])}\n"
    dataset_md += f"    format: {clean_text(backup_row['file_type'])}\n"
    dataset_md += f"    status: {clean_text(backup_row['status'])}\n"
    dataset_md += f"    size: {backup_row['size']}\n"
    dataset_md += f"    download_date: {backup_row['download_date']}\n"
    dataset_md += f"    maintainer: {clean_text(backup_row['maintainer'])}\n"
    dataset_md += f"    notes: {clean_text(backup_row['notes'])}\n"
dataset_md += "---\n"
    
# Writing the dataset markdown file
with open(f'{dataset_path}/{dataset_filename}.md', 'w') as output:
    output.write(dataset_md)

# Creating the organization markdown file
org_md = "---\n"
org_md += f"title: {clean_text(row['organization'])} \n" 
org_md += "description: \n" 
org_md += "---\n"

# Writing the organization markdown file
with open(f'{org_path}/{org_filename}.md', 'w') as output:
    output.write(org_md)

In [79]:
def get_dataset_category(row, organizations):
    """Return the list of category names for one dataset row.

    Resolution order:
    1. If ``row['categories']`` holds a non-empty category override, return the
       'value' entry of each override item. The override is a Python-literal
       string (a list of dicts) or an already-parsed list.
    2. If ``row['organization']`` is 'Unknown', return ``['Uncategorized']``.
    3. Otherwise collect the ';'-separated ``Categories`` of every row of
       ``organizations`` whose ``Organizations`` equals the row's organization,
       dropping empty entries and duplicates while keeping first-seen order.
       If nothing is left (organization not listed, or no categories recorded),
       return ``['Uncategorized']``.
    """
    # Function-scope import: ``ast`` is not imported elsewhere in this notebook.
    import ast

    override = row['categories']
    if isinstance(override, str):
        override = override.strip()
        # NOTE(review): the override text comes from an external CSV export, so
        # it is parsed with ast.literal_eval (literals only) rather than eval,
        # which would execute arbitrary expressions. An empty string means
        # "no override".
        override = ast.literal_eval(override) if override else []
    if override:
        return [entry['value'] for entry in override]

    # Check if we don't have organization info
    if row['organization'] == 'Unknown':
        return ['Uncategorized']

    # Get categories from organization
    org_rows = organizations[organizations['Organizations'] == row['organization']]
    cats = []
    for joined in org_rows['Categories'].values:
        for cat in joined.split(';'):
            # Skip blanks and duplicates; insertion order keeps output stable.
            if cat != '' and cat not in cats:
                cats.append(cat)

    return cats or ['Uncategorized']

In [80]:
# Spot-check the categories resolved for the row labelled 54.
get_dataset_category(datasets.loc[54], organizations)

['Climate & Environment', 'Health & Healthcare']

In [59]:
# Keep only datasets whose name starts with "20".
# This overwrites `datasets`; reload the export to get the full table back.
datasets = datasets[datasets['dataset'].str.startswith('20')]

In [60]:
# Call create_dataset_md once per remaining row, passing backups and
# organizations as extra positional arguments after the row.
datasets.apply(create_dataset_md, axis=1, args=(backups, organizations))

393    None
395    None
396    None
397    None
398    None
399    None
400    None
401    None
403    None
404    None
405    None
407    None
408    None
409    None
410    None
416    None
640    None
662    None
663    None
665    None
666    None
667    None
696    None
697    None
698    None
798    None
799    None
dtype: object

In [None]:
def get_dataset_category(row, organizations_df=None):
    """Return the list of category names for one dataset row.

    Parameters:
        row: mapping/Series indexed by 'categories' and 'organization'.
        organizations_df: optional DataFrame with ``Organizations`` and
            ``Categories`` columns; when omitted, the module-level
            ``organizations`` frame is used.

    Resolution order:
    1. If ``row['categories']`` holds a non-empty category override, return the
       'value' entry of each override item. The override is a Python-literal
       string (a list of dicts) or an already-parsed list.
    2. If ``row['organization']`` is 'Unknown', return ``['Uncategorized']``.
    3. Otherwise collect the ';'-separated ``Categories`` of every matching
       organization row, dropping empty entries and duplicates while keeping
       first-seen order. If nothing is left, return ``['Uncategorized']``.
    """
    # Function-scope import: ``ast`` is not imported elsewhere in this notebook.
    import ast

    org_table = organizations if organizations_df is None else organizations_df

    override = row['categories']
    if isinstance(override, str):
        override = override.strip()
        # NOTE(review): the override text comes from an external CSV export, so
        # it is parsed with ast.literal_eval (literals only) rather than eval,
        # which would execute arbitrary expressions. An empty string means
        # "no override".
        override = ast.literal_eval(override) if override else []
    if override:
        return [entry['value'] for entry in override]

    # Check if we don't have organization info
    if row['organization'] == 'Unknown':
        return ['Uncategorized']

    # Get categories from organization
    org_rows = org_table[org_table['Organizations'] == row['organization']]
    cats = []
    for joined in org_rows['Categories'].values:
        for cat in joined.split(';'):
            # Skip blanks and duplicates; insertion order keeps output stable.
            if cat != '' and cat not in cats:
                cats.append(cat)

    return cats or ['Uncategorized']
        

[None, None]

In [51]:
# Scratch check: splitting an empty string still yields one (empty) element,
# so the length is 1 rather than 0.
a = ''
len(a.split(';'))

1

In [9]:
import numpy as np
# Rows whose Categories value is missing. NaN never compares equal to anything
# (including itself), so an `== np.nan` filter always comes back empty; isna()
# is the correct test.
organizations[organizations['Categories'].isna()]

Unnamed: 0,Organizations,Categories


In [4]:
import os

# Read the Baserow API token from the environment instead of hard-coding it in
# the notebook. An unset variable yields an empty token, which the API will reject.
# NOTE(review): any token that was ever committed with this notebook must be
# treated as leaked and rotated.
BASEROW_ACCESS_TOKEN = os.environ.get("BASEROW_ACCESS_TOKEN", "")

def stringify_arr_vals(arr):
    """Join the 'value' entry of every item in ``arr`` into one ';'-separated string."""
    values = []
    for item in arr:
        values.append(item['value'])
    return ';'.join(values)

def get_results_json(url):
    """Fetch all rows from a paginated JSON endpoint, starting at ``url``.

    Each request is sent with a ``Token`` authorization header built from
    ``BASEROW_ACCESS_TOKEN``. Every response is expected to be a JSON object
    with a 'results' list and a 'next' entry holding the URL of the following
    page, or null on the last page. Pages are followed iteratively and their
    'results' concatenated in order.

    Returns:
        list: the combined 'results' of every page.

    Raises:
        requests.HTTPError: if a page responds with an error status.
    """
    results = []
    next_url = url
    while next_url is not None:
        response = requests.get(
            next_url,
            headers={
                "Authorization": f"Token {BASEROW_ACCESS_TOKEN}"
            },
            # Fail instead of hanging forever if the server stops responding.
            timeout=30,
        )
        # Surface auth/server errors here rather than as a KeyError below.
        response.raise_for_status()
        # Decode the body once per page.
        payload = response.json()
        results.extend(payload['results'])
        next_url = payload['next']

    return results

# categories = pd.DataFrame(get_results_json("https://baserow.datarescueproject.org/api/database/rows/table/732/?user_field_names=true"))[['Name', 'Active']]
# organizations = pd.DataFrame(get_results_json("https://baserow.datarescueproject.org/api/database/rows/table/638/?user_field_names=true"))[['Organizations', 'Categories']]
# organizations['Categories'] = organizations['Categories'].apply(lambda x: stringify_arr_vals(x))
# categories.to_csv("baserow_exports/datarescue_categories.csv", index=False)
# organizations.to_csv("baserow_exports/datarescue_organizations.csv", index=False)

In [8]:
# agencies = pd.DataFrame(get_results_json("https://baserow.datarescueproject.org/api/database/rows/table/645/?user_field_names=true"))[['Name']]
# Write the agencies table to the baserow_exports directory without the index.
# NOTE(review): with the line above commented out, `agencies` must already be
# defined in the kernel; presumably it is the DataFrame with a 'Name' column
# that the commented-out line builds -- confirm before re-running.
agencies.to_csv("../baserow_exports/datarescue_agencies.csv", index=False)

In [13]:
def create_agency_md(row):
    """Write the markdown file for one agency.

    ``row`` is indexed by 'Name'. The file ``../_agencies/<slug of Name>.md``
    is written as UTF-8 and contains a front-matter block delimited by ``---``
    lines, with the cleaned agency name as ``title`` and an empty
    ``description``. The output directory is created if it does not exist.
    Returns ``None``.
    """
    # Function-scope import so the function does not depend on cell order.
    import os

    agency_path = "../_agencies"
    agency_filename = slugify(row['Name'])

    # Creating the agency markdown file
    agency_md = "---\n"
    agency_md += f"title: {clean_text(row['Name'])} \n"
    agency_md += "description: \n"
    agency_md += "---\n"

    # Writing the agency markdown file
    os.makedirs(agency_path, exist_ok=True)
    with open(f'{agency_path}/{agency_filename}.md', 'w', encoding='utf-8') as output:
        output.write(agency_md)
      

In [14]:
# Call create_agency_md once per agency row; the function returns None, so the
# displayed Series of None values is only a by-product of apply.
agencies.apply(create_agency_md, axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
72    None
73    None
74    None
75    None
76    None
Length: 77, dtype: object