# Import a dataset to REALLOCATE CKAN

In [2]:
from ckanapi import RemoteCKAN
from dotenv import load_dotenv
import os
import pandas as pd
from pathlib import Path
import requests
from io import StringIO

  import pkg_resources


In [3]:
# ⚙️ Step 2: Set up connection and metadata

#Specific
DATASET_NAME = "trips_walking"
DATASET_TITLE = "Trips walking"
DATASET_DESCRIPTION = "Walking trips by year from the EMEF data"
DATASET_ID = "wc9hkmubl7"
DATASET_FORMAT = "CSV"
DATASET_URL = f"?id={DATASET_ID}&fileformat={DATASET_FORMAT}"

#General
LOCATION = "../data/"
CSV_FILE = f"{LOCATION}{DATASET_NAME}.csv"
PARQUET_FILE = f"{LOCATION}{DATASET_NAME}.parquet"

#Reallocate params
REALLOCATE_URL = "https://reallocate-ckan.iti.gr"
REALLOCATE_KEY = os.getenv("REALLOCATE_KEY")
ckan = RemoteCKAN(REALLOCATE_URL, apikey=REALLOCATE_KEY)
ORG_INFO = ckan.action.organization_show(id="bsc")

#Open data params
API_KEY = os.getenv("API_KEY")
API_URL = "https://portaldades.ajuntament.barcelona.cat/services/backend/rest/statistic/export"
# API_URL = "https://portaldades.ajuntament.barcelona.cat/services/backend/rest/microdata/export"
headers = {'X-IBM-Client-Id': API_KEY}






ConnectTimeout: HTTPSConnectionPool(host='reallocate-ckan.iti.gr', port=443): Max retries exceeded with url: /api/action/organization_show (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x106b85f10>, 'Connection to reallocate-ckan.iti.gr timed out. (connect timeout=None)'))

Specific Parameters History:
- Trips walking:
    ```Python
    DATASET_NAME = "trips_walking"
    DATASET_TITLE = "Trips walking"
    DATASET_DESCRIPTION = "Walking trips by year from the EMEF data"
    DATASET_ID = "wc9hkmubl7"
    DATASET_FORMAT = "CSV"
    DATASET_URL = f"?id={DATASET_ID}&fileformat={DATASET_FORMAT}"
    ```
- Bike & PMR:
    ```Python
    DATASET_NAME = "bike_and_pmv_by_sex"
    DATASET_TITLE = "Bike & PMV by sex"
    DATASET_DESCRIPTION = "Bike & PMV by sex from OD Barcelona"
    DATASET_ID = "5cid3dkbbx"
    DATASET_FORMAT = "CSV"
    DATASET_URL = f"?id={DATASET_ID}&fileformat={DATASET_FORMAT}"
    ```
- Public Accesibility
    ```Python
    DATASET_NAME = "streets_accessibility"
    DATASET_TITLE = "Streets' Accessibility"
    DATASET_DESCRIPTION = "Streets' accessibility in Barcelona city"
    DATASET_ID = "157b8ef7-e437-4233-a684-edd440b9d3"
    DATASET_FORMAT = "CSV"
    DATASET_URL = f"?id={DATASET_ID}&fileformat={DATASET_FORMAT}"
    ```


In [46]:
def get_or_create_dataset(ckan, dataset_name, title=None, notes="", org_id=None):
    """
    Get or create a CKAN dataset by name.
    """
    try:
        dataset = ckan.action.package_show(id=dataset_name)
        print(f"✅ Dataset '{dataset_name}' already exists.")
    except:
        print(f"ℹ️ Dataset '{dataset_name}' not found. Creating it...")
        create_kwargs = {
            "name": dataset_name,
            "title": title or dataset_name,
            "notes": notes,
            "private": True
        }
        if org_id:
            create_kwargs["owner_org"] = org_id
        dataset = ckan.action.package_create(**create_kwargs)
        print(f"✅ Created dataset '{dataset_name}'.")
    return dataset


In [47]:
import os

def upload_or_update_resource(ckan, dataset_id, file_path, name, fmt):
    """
    If a resource with this name exists in the dataset, update it.
    Otherwise, create a new one.
    """
    # Try to find an existing resource
    existing_resource = None
    dataset = ckan.action.package_show(id=dataset_id)
    for res in dataset["resources"]:
        if res["name"] == name:
            existing_resource = res
            break

    with open(file_path, "rb") as f:
        upload_data = {
            "name": name,
            "format": fmt,
            "upload": f
        }

        if existing_resource:
            # Update existing resource
            upload_data["id"] = existing_resource["id"]
            res = ckan.action.resource_update(**upload_data)
            print(f"🔄 Updated existing resource: {res['id']}")
        else:
            # Create new resource
            upload_data["package_id"] = dataset_id
            res = ckan.action.resource_create(**upload_data)
            print(f"✅ Created new resource: {res['id']}")

    return res


## Load data

In [48]:
# Constants

# Fetch CSV content
data_response = requests.get((API_URL + DATASET_URL), headers=headers)

# Manually decode with correct encoding
decoded_text = data_response.content.decode('utf-8')  # or 'cp1252' if needed

# Load into StringIO for pandas
csv_file = StringIO(decoded_text)
# df = pd.read_csv(f"{LOCATION}inca_od_2022.csv")

# # Preview the data

# print(df.columns)
# df = df[['Gis_ID', 'N_Gis', 'Gis_X', 'Gis_Y', "Data d'Alta", 'Data de Baixa',
#        'Situació', 
#        'Barri', 'Districte', 'Tipus', 'Quantitat', 'Mesura', ]]
df.head()


Unnamed: 0,Gis_ID,N_Gis,Gis_X,Gis_Y,Data d'Alta,Data de Baixa,Situació,Barri,Districte,Tipus,Quantitat,Mesura
0,53,1,428568.7713,4578089.0,13/4/2018 0:00:00,,Tram,12. la Marina del Prat Vermell,03. Sants-Montjuic,Graons/Resalts,,
1,54,2,428561.4783,4578085.0,13/4/2018 0:00:00,,Cruïlla,12. la Marina del Prat Vermell,03. Sants-Montjuic,,,
2,55,3,428547.7503,4578095.0,13/4/2018 0:00:00,,Cruïlla,12. la Marina del Prat Vermell,03. Sants-Montjuic,,,400.0
3,75,4,431202.6671,4582812.0,16/4/2018 0:00:00,,Cruïlla,05. el Fort Pienc,02. Eixample,Accessible,,
4,76,5,431179.358,4582835.0,16/4/2018 0:00:00,,Cruïlla,05. el Fort Pienc,02. Eixample,Accessible,,


## Clean dataset

In [49]:
# 🧹 Step 4 (Optional): Clean your dataset

# Example: clean datetime
if 'Dim-00:TEMPS' in df.columns:
    df['Dim-00:TEMPS'] = pd.to_datetime(df['Dim-00:TEMPS'], errors='coerce') \
                            .dt.strftime('%Y-%m-%dT%H:%M:%S')

# Ensure numeric columns are properly parsed
for col in df.columns:
    if col.upper() == "VALUE":
        df[col] = pd.to_numeric(df[col], errors='coerce')


## Save 

In [50]:
df.to_csv(CSV_FILE, index=False)
df.to_parquet(PARQUET_FILE, index=False)

## Create/update dataset

In [51]:
dataset = get_or_create_dataset(
    ckan,
    dataset_name=DATASET_NAME,
    title=DATASET_TITLE,
    notes=DATASET_DESCRIPTION,
    org_id=ORG_INFO["id"]  # optional
)


✅ Dataset 'trips_walking' already exists.


## Upload dataset

In [52]:
upload_or_update_resource(
    ckan,
    dataset_id=dataset["id"],
    file_path=CSV_FILE,
    name=f"{DATASET_TITLE} (CSV)",
    fmt="CSV"
)

🔄 Updated existing resource: 619673e7-3b51-4bf5-b0ca-bc035eb4136e


{'cache_last_updated': None,
 'cache_url': None,
 'created': '2025-06-25T12:18:24.561934',
 'datastore_active': True,
 'description': None,
 'format': 'CSV',
 'hash': '',
 'id': '619673e7-3b51-4bf5-b0ca-bc035eb4136e',
 'last_modified': '2025-06-25T15:34:49.293048',
 'metadata_modified': '2025-06-25T15:34:49.301497',
 'mimetype': 'text/csv',
 'mimetype_inner': None,
 'name': 'Trips walking (CSV)',
 'package_id': '6f32fa1e-459a-48d3-891a-234c98984b7f',
 'position': 0,
 'resource_type': None,
 'size': 18898340,
 'state': 'active',
 'url': 'https://reallocate-ckan.iti.gr:443/dataset/6f32fa1e-459a-48d3-891a-234c98984b7f/resource/619673e7-3b51-4bf5-b0ca-bc035eb4136e/download/trips_walking.csv',
 'url_type': 'upload'}

In [53]:
upload_or_update_resource(
    ckan,
    dataset_id=dataset["id"],
    file_path=PARQUET_FILE,
    name=f"{DATASET_TITLE} (Parquet)",
    fmt="Parquet"
)

🔄 Updated existing resource: da447b40-2523-41ef-a95b-dbad84ad2320


{'cache_last_updated': None,
 'cache_url': None,
 'created': '2025-06-25T12:18:28.220333',
 'datastore_active': False,
 'description': None,
 'format': 'Parquet',
 'hash': '',
 'id': 'da447b40-2523-41ef-a95b-dbad84ad2320',
 'last_modified': '2025-06-25T15:34:54.831819',
 'metadata_modified': '2025-06-25T15:34:54.839855',
 'mimetype': None,
 'mimetype_inner': None,
 'name': 'Trips walking (Parquet)',
 'package_id': '6f32fa1e-459a-48d3-891a-234c98984b7f',
 'position': 1,
 'resource_type': None,
 'size': 5086933,
 'state': 'active',
 'url': 'https://reallocate-ckan.iti.gr:443/dataset/6f32fa1e-459a-48d3-891a-234c98984b7f/resource/da447b40-2523-41ef-a95b-dbad84ad2320/download/trips_walking.parquet',
 'url_type': 'upload'}