# Import a dataset to REALLOCATE CKAN

In [14]:
from ckanapi import RemoteCKAN
from dotenv import load_dotenv
import os
import pandas as pd
from pathlib import Path
import requests
from io import StringIO

In [16]:
# ⚙️ Step 2: Set up connection and metadata

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups below can find them. Variables that are already set in
# the environment are left untouched.
load_dotenv()

# Local file locations, all derived from the dataset slug
DATASET_NAME = "walking_trips_1"
LOCATION = "../data/"
CSV_FILE = f"{LOCATION}{DATASET_NAME}.csv"
PARQUET_FILE = f"{LOCATION}{DATASET_NAME}.parquet"

# Human-readable metadata passed to CKAN when the dataset is created
DATASET_TITLE = "Walking trips"
DATASET_DESCRIPTION = "Walking trips by year from the EMEF data"

# Credentials come from the environment, never from the notebook itself.
# os.getenv() returns None when a variable is missing.
CKAN_URL = "https://reallocate-ckan.iti.gr"
REALLOCATE_KEY = os.getenv("REALLOCATE_KEY")
API_KEY = os.getenv("API_KEY")

# CKAN client plus the record of the "bsc" organization (its id is used as
# the dataset owner further down)
ckan = RemoteCKAN(CKAN_URL, apikey=REALLOCATE_KEY)
ORG_INFO = ckan.action.organization_show(id="bsc")

# Source data: export endpoint, request headers, and a mapping from local
# dataset name to the id sent in the export query string
API_URL = "https://portaldades.ajuntament.barcelona.cat/services/backend/rest/statistic/export"
headers = {'X-IBM-Client-Id': API_KEY}
DATASETS = {
    DATASET_NAME: 'wc9hkmubl7',
}


In [3]:
def get_or_create_dataset(ckan, dataset_name, title=None, notes="", org_id=None):
    """
    Get a CKAN dataset by name, creating it as a private dataset if it is missing.

    Args:
        ckan: Client object exposing ``action.package_show`` and
            ``action.package_create``.
        dataset_name: Name (slug) used both to look the dataset up and to
            create it.
        title: Title for a newly created dataset; falls back to
            ``dataset_name`` when empty.
        notes: Description text for a newly created dataset.
        org_id: Optional organization id; when given it is sent as
            ``owner_org`` on creation.

    Returns:
        Whatever ``package_show`` (existing dataset) or ``package_create``
        (new dataset) returned.

    Raises:
        Any error from ``package_show`` other than ``ckanapi.NotFound`` is
        propagated unchanged (authorization failures, connection errors, ...),
        so a broken connection is never mistaken for a missing dataset.
    """
    # Imported here so the function does not depend on which names the
    # notebook's import cell happens to expose.
    from ckanapi import NotFound

    try:
        dataset = ckan.action.package_show(id=dataset_name)
    except NotFound:
        print(f"ℹ️ Dataset '{dataset_name}' not found. Creating it...")
        create_kwargs = {
            "name": dataset_name,
            "title": title or dataset_name,
            "notes": notes,
            "private": True
        }
        if org_id:
            create_kwargs["owner_org"] = org_id
        dataset = ckan.action.package_create(**create_kwargs)
        print(f"✅ Created dataset '{dataset_name}'.")
    else:
        print(f"✅ Dataset '{dataset_name}' already exists.")
    return dataset


In [4]:
import os

def upload_or_update_resource(ckan, dataset_id, file_path, name, fmt):
    """
    Upload a local file as a resource of a dataset.

    The dataset's resources are searched for one whose name equals ``name``.
    The first match is updated with the file; if there is no match, a new
    resource is created in the dataset. The value returned by the CKAN
    action that was called is passed back to the caller.
    """
    package = ckan.action.package_show(id=dataset_id)
    # First resource carrying the requested name, or None when there is none
    match = next((r for r in package["resources"] if r["name"] == name), None)

    with open(file_path, "rb") as handle:
        payload = {"name": name, "format": fmt, "upload": handle}

        if not match:
            # Nothing to overwrite: attach a new resource to the dataset
            payload["package_id"] = dataset_id
            res = ckan.action.resource_create(**payload)
            print(f"✅ Created new resource: {res['id']}")
        else:
            # Reuse the id of the matching resource
            payload["id"] = match["id"]
            res = ckan.action.resource_update(**payload)
            print(f"🔄 Updated existing resource: {res['id']}")

    return res


## Load data

In [17]:
# Query string selecting this dataset's id and asking for a CSV export
DATASET_ID = f"?id={DATASETS[DATASET_NAME]}&fileformat=CSV"

# Fetch CSV content. The timeout stops the cell from hanging indefinitely
# if the portal does not answer.
data_response = requests.get((API_URL + DATASET_ID), headers=headers, timeout=60)

# Stop here on a 4xx/5xx answer instead of parsing an error body as CSV
data_response.raise_for_status()

# Manually decode the raw bytes. 'utf-8-sig' behaves like 'utf-8' but also
# strips a leading byte-order mark, which would otherwise end up inside the
# first column name. Switch to 'cp1252' if the portal changes its encoding.
decoded_text = data_response.content.decode('utf-8-sig')

# Load into StringIO for pandas
csv_file = StringIO(decoded_text)
df = pd.read_csv(csv_file)

# Preview the data
print(df.head())


           Dim-00:TEMPS Dim-01:TERRITORI  Dim-01:TERRITORI (order)  \
0  2015-01-01T00:00:00Z        Barcelona                        -1   
1  2015-01-01T00:00:00Z        Barcelona                        -1   
2  2016-01-01T00:00:00Z        Barcelona                        -1   
3  2016-01-01T00:00:00Z        Barcelona                        -1   
4  2017-01-01T00:00:00Z        Barcelona                        -1   

  Dim-01:TERRITORI (type) Dim-02:TIPUS DE ETAPA      VALUE  
0                Municipi              Connexió    80657.0  
1                Municipi               Interna  3280032.0  
2                Municipi              Connexió    58655.0  
3                Municipi               Interna  2385291.0  
4                Municipi               Interna  2287812.0  


## Clean dataset

In [18]:
# 🧹 Step 4 (Optional): Clean your dataset

# Rewrite the time column as 'YYYY-MM-DDTHH:MM:SS' strings; values that
# cannot be parsed as datetimes are coerced to missing.
if 'Dim-00:TEMPS' in df.columns:
    df['Dim-00:TEMPS'] = (
        pd.to_datetime(df['Dim-00:TEMPS'], errors='coerce')
        .dt.strftime('%Y-%m-%dT%H:%M:%S')
    )

# Force any column named "value" (case-insensitive) to numeric; entries that
# are not numbers become NaN.
for col in df.columns:
    if col.upper() != "VALUE":
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')


## Create/update dataset

In [7]:
# Fetch the dataset from CKAN, or create it under the organization loaded in
# the configuration cell. org_id is optional.
dataset = get_or_create_dataset(
    ckan,
    DATASET_NAME,
    title=DATASET_TITLE,
    notes=DATASET_DESCRIPTION,
    org_id=ORG_INFO["id"],
)


ℹ️ Dataset 'walking_trips_1' not found. Creating it...
✅ Created dataset 'walking_trips_1'.


## Upload dataset

In [11]:
# Write the DataFrame to the two local files uploaded below. Without this the
# cell only works if the files were left behind by an earlier session.
# NOTE(review): assumes the cleaned `df` is the content meant for upload — confirm.
Path(LOCATION).mkdir(parents=True, exist_ok=True)
df.to_csv(CSV_FILE, index=False)
df.to_parquet(PARQUET_FILE, index=False)

# Upload the CSV version (an existing resource with the same name is updated)
upload_or_update_resource(
    ckan,
    dataset_id=dataset["id"],
    file_path=CSV_FILE,
    name=f"{DATASET_TITLE} (CSV)",
    fmt="CSV"
)

# Upload the Parquet version; as the last expression of the cell, its return
# value is what the notebook displays.
upload_or_update_resource(
    ckan,
    dataset_id=dataset["id"],
    file_path=PARQUET_FILE,
    name=f"{DATASET_TITLE} (Parquet)",
    fmt="Parquet"
)


🔄 Updated existing resource: 9b629af1-a214-4f8a-b66f-9a38c2803735
✅ Created new resource: 0d41b7d1-4b88-4576-967f-70802862ad40


{'cache_last_updated': None,
 'cache_url': None,
 'created': '2025-06-25T10:52:14.994161',
 'datastore_active': False,
 'description': None,
 'format': 'Parquet',
 'hash': '',
 'id': '0d41b7d1-4b88-4576-967f-70802862ad40',
 'last_modified': '2025-06-25T10:52:14.978580',
 'metadata_modified': '2025-06-25T10:52:14.991998',
 'mimetype': None,
 'mimetype_inner': None,
 'name': 'Walking trips (Parquet)',
 'package_id': '4b4b0fe0-6f2a-49a5-bc2a-d7c931ed1eee',
 'position': 2,
 'resource_type': None,
 'size': 4985,
 'state': 'active',
 'url': 'https://reallocate-ckan.iti.gr:443/dataset/4b4b0fe0-6f2a-49a5-bc2a-d7c931ed1eee/resource/0d41b7d1-4b88-4576-967f-70802862ad40/download/walking_trips_1.parquet',
 'url_type': 'upload'}