# Import a dataset to REALLOCATE CKAN

In [1]:
from ckanapi import RemoteCKAN
from dotenv import load_dotenv
import os
import pandas as pd
from pathlib import Path
import requests
import io

  import pkg_resources


In [2]:
# ⚙️ Step 2: Set up connection and metadata
DATASET_NAME = "streets_accessibility"
LOCATION_FILES = "../data/"
CSV_FILE = f"{LOCATION_FILES}{DATASET_NAME}.csv"
PARQUET_FILE = f"{LOCATION_FILES}{DATASET_NAME}.parquet"

DATASET_TITLE = "Walking trips"
DATASET_DESCRIPTION = "Walking trips by year from the EMEF data"

CKAN_URL = "https://reallocate-ckan.iti.gr"
API_KEY = os.getenv("REALLOCATE_KEY")
ckan = RemoteCKAN(CKAN_URL, apikey=API_KEY)
ORG_INFO = ckan.action.organization_show(id="bsc")

# Get dataset metadata
dataset = ckan.action.package_show(id=DATASET_NAME)

## Load data

In [3]:
# 🔍 3. Locate CSV and Parquet file resource URLs

csv_url = None
parquet_url = None

for res in dataset["resources"]:
    fmt = res["format"].lower()
    if fmt == "csv" and not csv_url:
        csv_url = res["url"]
    elif fmt == "parquet" and not parquet_url:
        parquet_url = res["url"]

print("✅ Found CSV:", bool(csv_url))
print("✅ Found Parquet:", bool(parquet_url))


✅ Found CSV: True
✅ Found Parquet: True


In [5]:
if csv_url:
    response = requests.get(csv_url, headers={"Authorization": API_KEY})
    if response.status_code == 200:
        df_csv = pd.read_csv(io.BytesIO(response.content))
        print("✅ CSV loaded from private resource")
        display(df_csv.head())
    else:
        print(f"❌ Failed to download CSV: {response.status_code}")
else:
    print("❌ CSV URL not found")


✅ CSV loaded from private resource


Unnamed: 0,Gis_ID,N_Gis,Gis_X,Gis_Y,Data d'Alta,Data de Baixa,Situació,Barri,Districte,Tipus,Quantitat,Mesura
0,575,401,431791.4225,4583412.0,18/4/2018 0:00:00,,Cruïlla,05. el Fort Pienc,02. Eixample,,,
1,60281,56603,431328.907,4586397.0,24/10/2018 0:00:00,,Cruïlla,62. el Congres i els Indians,09. Sant Andreu,,,
2,68500,64461,431300.697,4585026.0,23/11/2018 0:00:00,,Cruïlla,64. el Camp de l'Arpa del Clot,10. Sant Marti,,,
3,6507,5687,431419.1947,4583669.0,05/04/2018 0:00,,Cruïlla,06. la Sagrada Familia,02. Eixample,Falta element de protecció en gual,,
4,140993,131118,428651.1575,4580252.0,04/03/2021 0:00,,Tram,14. la Font de la Guatlla,03. Sants-Montjuic,,,250.0


In [4]:
if parquet_url:
    response = requests.get(parquet_url, headers={"Authorization": API_KEY})
    if response.status_code == 200:
        df_parquet = pd.read_parquet(io.BytesIO(response.content))
        print("✅ Parquet loaded from private resource")
        display(df_parquet.head())
    else:
        print(f"❌ Failed to download Parquet: {response.status_code}")
else:
    print("❌ Parquet URL not found")


✅ Parquet loaded from private resource


Unnamed: 0,Gis_ID,N_Gis,Gis_X,Gis_Y,Data d'Alta,Data de Baixa,Situació,Barri,Districte,Tipus,Quantitat,Mesura
0,575,401,431791.4225,4583412.0,18/4/2018 0:00:00,,Cruïlla,05. el Fort Pienc,02. Eixample,,,
1,60281,56603,431328.907,4586397.0,24/10/2018 0:00:00,,Cruïlla,62. el Congres i els Indians,09. Sant Andreu,,,
2,68500,64461,431300.697,4585026.0,23/11/2018 0:00:00,,Cruïlla,64. el Camp de l'Arpa del Clot,10. Sant Marti,,,
3,6507,5687,431419.1947,4583669.0,05/04/2018 0:00,,Cruïlla,06. la Sagrada Familia,02. Eixample,Falta element de protecció en gual,,
4,140993,131118,428651.1575,4580252.0,04/03/2021 0:00,,Tram,14. la Font de la Guatlla,03. Sants-Montjuic,,,250.0


## Clean dataset

In [6]:
# 🧹 Step 4 (Optional): Clean your dataset

# Example: clean datetime
if 'Dim-00:TEMPS' in df.columns:
    df['Dim-00:TEMPS'] = pd.to_datetime(df['Dim-00:TEMPS'], errors='coerce') \
                            .dt.strftime('%Y-%m-%dT%H:%M:%S')

# Ensure numeric columns are properly parsed
for col in df.columns:
    if col.upper() == "VALUE":
        df[col] = pd.to_numeric(df[col], errors='coerce')


## Create/update dataset

In [49]:
dataset = get_or_create_dataset(
    ckan,
    dataset_name=DATASET_NAME,
    title=DATASET_TITLE,
    notes=DATASET_DESCRIPTION,
    org_id=ORG_INFO["id"]  # optional
)


✅ Dataset 'walking_trips' already exists.


## Upload dataset

In [50]:
# ✅ Add data to CKAN: upload file and push to DataStore

# 📁 1. Upload file as resource (preserves original format)
resource = ckan.action.resource_create(
    package_id=dataset["id"],
    name=DATASET_TITLE,
    format=Path(CSV_FILE).suffix[1:].upper(),  # auto-detect format
    upload=open(CSV_FILE, "rb")
)
print(f"✅ File uploaded as resource: {resource['id']}")

# 🧠 2. Upload structured data to DataStore
# Load into DataFrame (already done in your notebook as `df`)

# Create CKAN field definitions from df types
fields = []
for col in df.columns:
    dtype = df[col].dtype
    if pd.api.types.is_numeric_dtype(dtype):
        ftype = "numeric"
    elif pd.api.types.is_datetime64_any_dtype(dtype):
        ftype = "timestamp"
    else:
        ftype = "text"
    fields.append({"id": col, "type": ftype})

# Convert to CKAN records format
records = df.to_dict(orient='records')

# Upload to DataStore (overwrites if already exists)
ckan.action.datastore_create(
    resource_id=resource["id"],
    fields=fields,
    records=records,
    force=True
)
print(f"✅ Data uploaded to DataStore: {len(records)} records")

✅ File uploaded as resource: b47929c3-a21a-4153-811a-2f572649e9a4
✅ Data uploaded to DataStore: 18 records


In [51]:
# ✅ Upload Parquet version of the data to CKAN (as file, not parsed)

parquet_resource = ckan.action.resource_create(
    package_id=dataset["id"],
    name=f"{DATASET_TITLE} (parquet)",
    format="Parquet",
    upload=open(PARQUET_FILE, "rb")
)

print(f"✅ Parquet file uploaded as resource: {parquet_resource['id']}")


✅ Parquet file uploaded as resource: 6ced2de3-899b-4787-b739-b60f7f552c2d
