# Import a dataset to REALLOCATE CKAN

In [1]:
import os
from pathlib import Path

import pandas as pd
from ckanapi import NotFound, RemoteCKAN
from dotenv import load_dotenv


  import pkg_resources


In [None]:
# Step 2: connection settings and dataset metadata.

# Read a local .env file (if one exists) into the process environment so that
# REALLOCATE_KEY does not have to be exported in the shell beforehand.
load_dotenv()

# DATASET_NAME is the CKAN dataset name (URL slug). It is kept separate from the
# data directory because CKAN only accepts lowercase alphanumerics, "-" and "_"
# in dataset names, so a path-like value such as "../data/..." is rejected.
DATASET_NAME = "walking_trips"
DATA_DIR = "../data"
CSV_FILE = f"{DATA_DIR}/{DATASET_NAME}.csv"
PARQUET_FILE = f"{DATA_DIR}/{DATASET_NAME}.parquet"

DATASET_TITLE = "Walking trips"
DATASET_DESCRIPTION = "Walking trips by year from the EMEF data"

CKAN_URL = "https://reallocate-ckan.iti.gr"
# os.getenv returns None when the variable is unset; the client is then created
# without an API key.
API_KEY = os.getenv("REALLOCATE_KEY")
ckan = RemoteCKAN(CKAN_URL, apikey=API_KEY)
# Organization record for "bsc"; its "id" is used later as the dataset owner.
ORG_INFO = ckan.action.organization_show(id="bsc")


In [4]:
def get_or_create_dataset(ckan, dataset_name, title=None, notes="", org_id=None):
    """
    Return the CKAN dataset called ``dataset_name``, creating it when it is missing.

    Args:
        ckan: Client exposing ``action.package_show`` and ``action.package_create``.
        dataset_name: Name passed as ``id`` to the lookup and as ``name`` on creation.
        title: Title for a newly created dataset; falls back to ``dataset_name``.
        notes: Description stored in the ``notes`` field of a newly created dataset.
        org_id: Optional owner organization; sent as ``owner_org`` only when truthy.

    Returns:
        Whatever ``package_show`` (existing dataset) or ``package_create``
        (new dataset) returned.

    Raises:
        Any lookup error other than ``NotFound`` (authorization, network, ...) is
        propagated instead of being mistaken for a missing dataset. Errors raised
        by ``package_create`` are propagated as well.
    """
    try:
        dataset = ckan.action.package_show(id=dataset_name)
    except NotFound:
        # Only a genuine "not found" answer should trigger creation.
        print(f"‚ÑπÔ∏è Dataset '{dataset_name}' not found. Creating it...")
        create_kwargs = {
            "name": dataset_name,
            "title": title or dataset_name,
            "notes": notes,
            "private": True,  # new datasets are created private
        }
        if org_id:
            create_kwargs["owner_org"] = org_id
        dataset = ckan.action.package_create(**create_kwargs)
        print(f"‚úÖ Created dataset '{dataset_name}'.")
    else:
        print(f"‚úÖ Dataset '{dataset_name}' already exists.")
    return dataset


## Load data

In [5]:
# Read the source CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=CSV_FILE)

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,Dim-00:TEMPS,Dim-01:TERRITORI,Dim-01:TERRITORI (order),Dim-01:TERRITORI (type),Dim-02:TIPUS DE ETAPA,VALUE
0,2015-01-01T00:00:00,Barcelona,-1,Municipi,Connexió,80657.0
1,2015-01-01T00:00:00,Barcelona,-1,Municipi,Interna,3280032.0
2,2016-01-01T00:00:00,Barcelona,-1,Municipi,Connexió,58655.0
3,2016-01-01T00:00:00,Barcelona,-1,Municipi,Interna,2385291.0
4,2017-01-01T00:00:00,Barcelona,-1,Municipi,Interna,2287812.0


## Clean dataset

In [6]:
# Step 4 (optional): tidy the loaded frame before publishing it.

# Rewrite the time dimension as '%Y-%m-%dT%H:%M:%S' text. Values that cannot be
# parsed are coerced to NaT by errors='coerce' before formatting.
time_col = 'Dim-00:TEMPS'
if time_col in df.columns:
    parsed_times = pd.to_datetime(df[time_col], errors='coerce')
    df[time_col] = parsed_times.dt.strftime('%Y-%m-%dT%H:%M:%S')

# Re-parse every column whose name is "VALUE" (case-insensitive) as numeric;
# cells that cannot be converted become NaN.
value_cols = [name for name in df.columns if name.upper() == "VALUE"]
for name in value_cols:
    df[name] = pd.to_numeric(df[name], errors='coerce')


## Create/update dataset

In [49]:
# Metadata for the lookup-or-create call, collected in one place.
dataset_request = {
    "dataset_name": DATASET_NAME,
    "title": DATASET_TITLE,
    "notes": DATASET_DESCRIPTION,
    "org_id": ORG_INFO["id"],  # optional owner organization
}

# Fetch the dataset, creating it first when CKAN does not have it yet.
dataset = get_or_create_dataset(ckan, **dataset_request)


‚úÖ Dataset 'walking_trips' already exists.


## Upload dataset

In [50]:
# Add data to CKAN: upload the file as a resource, then push its rows to the DataStore.

# 1. Upload the file itself as a resource (preserves the original format).
#    The context manager closes the file handle once the request has finished.
with open(CSV_FILE, "rb") as csv_handle:
    resource = ckan.action.resource_create(
        package_id=dataset["id"],
        name=DATASET_TITLE,
        format=Path(CSV_FILE).suffix[1:].upper(),  # file extension without the dot, upper-cased
        upload=csv_handle,
    )
print(f"‚úÖ File uploaded as resource: {resource['id']}")

# 2. Push the structured rows held in `df` to the DataStore.

# Derive one DataStore field definition per column from its pandas dtype:
# numeric dtypes -> "numeric", datetime dtypes -> "timestamp", anything else -> "text".
fields = []
for col in df.columns:
    dtype = df[col].dtype
    if pd.api.types.is_numeric_dtype(dtype):
        ftype = "numeric"
    elif pd.api.types.is_datetime64_any_dtype(dtype):
        ftype = "timestamp"
    else:
        ftype = "text"
    fields.append({"id": col, "type": ftype})

# Convert to a list of row dicts. Missing values (NaN/NaT) are replaced by None
# so they are serialised as JSON null rather than the non-standard NaN token.
records = df.astype(object).where(df.notna(), None).to_dict(orient='records')

# force=True asks CKAN to accept DataStore writes for a resource it would
# otherwise treat as read-only (this one was created from a file upload).
ckan.action.datastore_create(
    resource_id=resource["id"],
    fields=fields,
    records=records,
    force=True
)
print(f"‚úÖ Data uploaded to DataStore: {len(records)} records")

‚úÖ File uploaded as resource: b47929c3-a21a-4153-811a-2f572649e9a4
‚úÖ Data uploaded to DataStore: 18 records


In [51]:
# Upload the Parquet version of the data to CKAN (stored as a file, not parsed).
# NOTE(review): PARQUET_FILE must already exist on disk; no visible cell writes it — confirm.

# The context manager closes the file handle once the upload request has finished.
with open(PARQUET_FILE, "rb") as parquet_handle:
    parquet_resource = ckan.action.resource_create(
        package_id=dataset["id"],
        name=f"{DATASET_TITLE} (parquet)",
        format="Parquet",
        upload=parquet_handle,
    )

print(f"‚úÖ Parquet file uploaded as resource: {parquet_resource['id']}")


‚úÖ Parquet file uploaded as resource: 6ced2de3-899b-4787-b739-b60f7f552c2d
