<a href="https://colab.research.google.com/github/heidingaway/datapeople/blob/main/opengov_gender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install ckanapi



In [16]:
from ckanapi import RemoteCKAN
import pandas as pd
import os
import zipfile
import yaml
import re
import unicodedata

In [17]:
# Client for the CKAN instance at open.canada.ca and the datastore resource to read
rc = RemoteCKAN('https://open.canada.ca/data/en/')
APIID = "ab457ba4-4a80-476b-8608-c57c52d59bd1"

# Accumulator for the records of every page
all_records = []

# Paging state: where the next request starts and how many rows it asks for
offset = 0
limit = 100  # Or any desired number of records per request

# Pull one page per iteration; a page shorter than `limit` is the last one
while True:
    result = rc.action.datastore_search(resource_id=APIID, limit=limit, offset=offset)
    page = result['records']
    all_records += page

    if len(page) >= limit:
        # A full page came back, so there may be more: advance and ask again
        offset += limit
        continue
    break

# Build the DataFrame once from the collected records
df = pd.DataFrame(all_records)

# Non-null count per column, as a quick sanity check of what was fetched
print(df.count())

_id       3
code      3
tit_en    3
tit_fr    3
en_def    3
fr_def    3
dtype: int64


In [21]:
# Column whose values are copied into the new 'title' column
col1 = 'tit_en'
# Column whose value is later placed in each exported file name
idfinal = 'code'

# Rename '_id' to 'id', then append 'title' as a copy of the `col1` column
df = df.rename(columns={'_id': 'id'}).assign(title=lambda frame: frame[col1])

In [22]:
# Print the full DataFrame to confirm that '_id' is now 'id' and that the
# 'title' column has been added
print(df)

   id code    tit_en         tit_fr  \
0   1    1      Male  Sexe masculin   
1   2    2    Female   Sexe féminin   
2   3    3  Intersex      Intersexe   

                                              en_def  \
0  This category includes persons whose sex assig...   
1  This category includes persons whose sex assig...   
2  This category includes persons whose sex assig...   

                                              fr_def     title  
0  Cette catégorie comprend les personnes dont le...      Male  
1  Cette catégorie comprend les personnes dont le...    Female  
2  Cette catégorie comprend les personnes dont le...  Intersex  


In [23]:
# Save the DataFrame as CSV (without the index column).
# The path is bound once so the file written and the message printed cannot drift apart.
csv_path = 'output.csv'
output = df  # `output` is the name the later export loop iterates over
output.to_csv(csv_path, index=False)
print(f"DataFrame saved to {csv_path}")

DataFrame saved to output.csv


In [24]:
# Create a directory to store the Markdown files
output_dir = 'outputmd'
decno = "408.01."  # prefix placed in front of every file name
os.makedirs(output_dir, exist_ok=True)

# Write one Markdown file per DataFrame row, containing the row as YAML front matter
for index, row in output.iterrows():
    # File name is "<decno><code> <title>.md". Characters that are reserved in
    # file names on common platforms (path separators included) and control
    # characters are replaced with "_" so a title cannot break the path or
    # point outside `output_dir`.
    file_stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', f"{decno}{row[idfinal]} {row['title']}")
    file_name = os.path.join(output_dir, file_stem + ".md")

    # Convert the row to a dictionary
    row_dict = row.to_dict()

    # Convert the dictionary to block-style YAML; allow_unicode keeps accented
    # text readable instead of emitting "\xNN" escape sequences
    yaml_content = yaml.dump(row_dict, default_flow_style=False, allow_unicode=True)

    # Write the YAML between "---" fences; UTF-8 is set explicitly so the
    # result does not depend on the platform's default encoding
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write("---\n" + yaml_content + "\n---")

print(f"Files with YAML content (saved as .md) are in the '{output_dir}' directory.")

# Create a zip file containing the Markdown files.
# NOTE: everything found under `output_dir` is archived, including files left
# there by earlier runs (the directory is reused, not emptied).
zip_file_name = 'outputmd.zip'
with zipfile.ZipFile(zip_file_name, 'w') as zipf:
    for root, _, files in os.walk(output_dir):
        for file in files:
            file_path = os.path.join(root, file)
            # Store paths relative to `output_dir` so the archive has no leading folder
            zipf.write(file_path, os.path.relpath(file_path, output_dir))

print(f"Markdown files zipped to '{zip_file_name}'.")

Files with YAML content (saved as .md) are in the 'outputmd' directory.
Markdown files zipped to 'outputmd.zip'.
