<a href="https://colab.research.google.com/github/heidingaway/datapeople/blob/main/opengov.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [86]:
from ckanapi import RemoteCKAN
import pandas as pd
import os
import zipfile
import yaml
import re
import unicodedata

In [106]:
# Fetch every record of one CKAN datastore resource, page by page, and load
# the result into a DataFrame with an extra ASCII-only 'title' column.
rc = RemoteCKAN('https://open.canada.ca/data/en/')
APIID = "bdb33e8c-53ef-4bae-9493-35f343191c02"

# Accumulates the records returned by every page request
all_records = []

# Paging state: `offset` is where the next page starts, `limit` is the page size
offset = 0
limit = 100  # Or any desired number of records per request

while True:
    result = rc.action.datastore_search(
        resource_id=APIID,
        limit=limit,
        offset=offset,
    )

    page = result['records']
    all_records.extend(page)

    # A page shorter than `limit` (including an empty one) is the last page
    if len(page) < limit:
        break

    # Move the window forward for the next request
    offset += limit

# Create the DataFrame from all the retrieved records
df = pd.DataFrame(all_records)

# Build 'title' from 'GC_NM_AB_EN' with accents removed.
# NFKD splits an accented letter into base letter + combining mark; encoding
# to ASCII with errors='ignore' then drops the marks. Note that any other
# non-ASCII character (e.g. a typographic apostrophe) is dropped as well.
# The .str accessor is used instead of .apply(lambda ...) so that a missing
# name stays missing instead of raising TypeError inside unicodedata.normalize.
df['title'] = (
    df['GC_NM_AB_EN']
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('ascii')
)

# Clean up 'id' column
df = df.rename(columns={'_id': 'id'})

print(df.count())

id                 380
GC_HIST_ID         380
GC_ID              380
GC_NM_OFF_EN       380
GC_NM_OFF_FR       380
GC_NM_AB_EN        380
GC_NM_AB_FR        380
STAT_CD            380
STAT_DESC_EN       380
STAT_DESC_FR       380
TYPE_CD            380
TYPE_DESC_EN       380
TYPE_DESC_FR       380
RECENT_IND         380
EFF_DT             380
EFF_FN_DT          380
OBSERVATIONS_EN    380
OBSERVATIONS_FR    380
GC_PAR_ID            6
GC_JUR_ID           53
ISO_ALPHA_2_CD     367
ISO_ALPHA_3_CD     368
UN_ONU_CD          363
MODIF_DT           380
title              380
dtype: int64


In [107]:
# Drop inactive countries: keep only the rows whose status code is "1"
is_active = df['STAT_CD'].eq("1")
output = df.loc[is_active]

print(output.head())

   id GC_HIST_ID  GC_ID                                  GC_NM_OFF_EN  \
0   1    1000110  10001                          the Republic of Fiji   
1   2    1000210  10002  South Georgia and the South Sandwich Islands   
2   3    1000310  10003                the People’s Republic of China   
3   4    1000410  10004                      the Republic of Slovenia   
6   7    1000510  10005                      the Republic of Kiribati   

                                 GC_NM_OFF_FR  \
0                     la République des Fidji   
1  Géorgie du Sud-et-les Îles Sandwich du Sud   
2            la République populaire de Chine   
3                   la République de Slovénie   
6                   la République de Kiribati   

                                    GC_NM_AB_EN  \
0                                          Fiji   
1  South Georgia and the South Sandwich Islands   
2                                         China   
3                                      Slovenia   
6          

In [108]:
# Export the filtered table as CSV, leaving the DataFrame index out of the file
output.to_csv(path_or_buf='output.csv', index=False)
print("DataFrame saved to output.csv")

DataFrame saved to output.csv


In [111]:
# Create a directory to store the Markdown files
output_dir = 'outputmd'
os.makedirs(output_dir, exist_ok=True)

# Write one Markdown file per row; the file body is the row as YAML front matter
for index, row in output.iterrows():
    # File name pattern: "408.01.<GC_ID> <title>.md".
    # Path separators and characters reserved in file names on common
    # platforms are replaced with "_" so that a value such as "A/B" cannot
    # make open() fail or place the file outside output_dir.
    base_name = re.sub(r'[<>:"/\\|?*]', '_', f"408.01.{row['GC_ID']} {row['title']}")
    file_name = os.path.join(output_dir, f"{base_name}.md")

    # Convert the row to a dictionary
    row_dict = row.to_dict()

    # Convert the dictionary to block-style YAML. allow_unicode=True keeps
    # accented characters (e.g. in the French name columns) readable instead
    # of emitting them as escape sequences.
    yaml_content = yaml.dump(row_dict, default_flow_style=False, allow_unicode=True)

    # Write the YAML between '---' fences. The encoding is explicit because
    # the platform default is not guaranteed to be able to encode the text.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write("---\n" + yaml_content + "\n---")

print(f"Files with YAML content (saved as .md) are in the '{output_dir}' directory.")

# Create a zip file containing the Markdown files
zip_file_name = 'outputmd.zip'
with zipfile.ZipFile(zip_file_name, 'w') as zipf:
    for root, _, files in os.walk(output_dir):
        for file in files:
            file_path = os.path.join(root, file)
            # Store entries relative to output_dir so the archive has no
            # leading 'outputmd/' folder
            zipf.write(file_path, os.path.relpath(file_path, output_dir))

print(f"Markdown files zipped to '{zip_file_name}'.")

Files with YAML content (saved as .md) are in the 'outputmd' directory.
Markdown files zipped to 'outputmd.zip'.
