# Import a dataset to REALLOCATE CKAN

In [1]:
from io import StringIO, BytesIO # ADD BytesIO here
import os
from pathlib import Path

from ckanapi import RemoteCKAN, NotFound
from dotenv import load_dotenv
import pandas as pd
import pkg_resources
import requests


In [2]:
# ⚙️ Step 2: Set up connection and metadata

# Read a local .env file (if there is one) into the process environment so the
# os.getenv() lookups below can find the keys on a fresh kernel. Variables that
# are already set in the environment are left untouched.
load_dotenv()

#Specific
# Parameters of the dataset being imported; swap this group to import another one.
DATASET_NAME = "trips_walking"
DATASET_TITLE = "Trips walking"
DATASET_DESCRIPTION = "Walking trips by year from the EMEF data"
DATASET_ID = "wc9hkmubl7"
DATASET_FORMAT = "CSV"
# Query string appended to API_URL when downloading the export.
DATASET_URL = f"?id={DATASET_ID}&fileformat={DATASET_FORMAT}"

#General
LOCATION = "../data/"
#CSV_FILE = f"{LOCATION}{DATASET_NAME}.csv"
#PARQUET_FILE = f"{LOCATION}{DATASET_NAME}.parquet"

#Reallocate params
REALLOCATE_URL = "https://reallocate-ckan.iti.gr"
REALLOCATE_KEY = os.getenv("REALLOCATE_KEY")
ckan = RemoteCKAN(REALLOCATE_URL, apikey=REALLOCATE_KEY)
# Organization record; its "id" is used later as the owner of the dataset.
ORG_INFO = ckan.action.organization_show(id="bsc")

#Open data params
API_KEY = os.getenv("API_KEY")
API_URL = "https://portaldades.ajuntament.barcelona.cat/services/backend/rest/statistic/export"
# API_URL = "https://portaldades.ajuntament.barcelona.cat/services/backend/rest/microdata/export"
# The key is sent as a request header on the download call.
headers = {'X-IBM-Client-Id': API_KEY}






Specific Parameters History:
- Trips walking:
    ```Python
    DATASET_NAME = "trips_walking"
    DATASET_TITLE = "Trips walking"
    DATASET_DESCRIPTION = "Walking trips by year from the EMEF data"
    DATASET_ID = "wc9hkmubl7"
    DATASET_FORMAT = "CSV"
    DATASET_URL = f"?id={DATASET_ID}&fileformat={DATASET_FORMAT}"
    ```
- Bike & PMV:
    ```Python
    DATASET_NAME = "bike_and_pmv_by_sex"
    DATASET_TITLE = "Bike & PMV by sex"
    DATASET_DESCRIPTION = "Bike & PMV by sex from OD Barcelona"
    DATASET_ID = "5cid3dkbbx"
    DATASET_FORMAT = "CSV"
    DATASET_URL = f"?id={DATASET_ID}&fileformat={DATASET_FORMAT}"
    ```
- Public Accessibility:
    ```Python
    DATASET_NAME = "streets_accessibility"
    DATASET_TITLE = "Streets' Accessibility"
    DATASET_DESCRIPTION = "Streets' accessibility in Barcelona city"
    DATASET_ID = "157b8ef7-e437-4233-a684-edd440b9d3"
    DATASET_FORMAT = "CSV"
    DATASET_URL = f"?id={DATASET_ID}&fileformat={DATASET_FORMAT}"
    ```


In [3]:
def get_or_create_dataset(ckan, dataset_name, title=None, notes="", org_id=None):
    """
    Return the CKAN dataset called ``dataset_name``, creating it if it is missing.

    Parameters
    ----------
    ckan : client exposing ``action.package_show`` and ``action.package_create``
        (the notebook passes a ``RemoteCKAN`` instance).
    dataset_name : str
        Name used both to look the dataset up and to create it.
    title : str, optional
        Title for a newly created dataset; falls back to ``dataset_name``.
    notes : str, optional
        Description stored in the ``notes`` field of a newly created dataset.
    org_id : str, optional
        When given, sent as ``owner_org`` on creation.

    Returns
    -------
    The dataset as returned by ``package_show`` (existing dataset) or by
    ``package_create`` (new dataset, always created with ``private=True``).

    Raises
    ------
    Any error from ``package_show`` other than ``NotFound`` is propagated, as is
    any error raised by ``package_create``.
    """
    try:
        dataset = ckan.action.package_show(id=dataset_name)
    except NotFound:
        # Only a genuine "does not exist" answer may trigger creation. Other
        # failures (authorization, network, interrupt) must surface as they are
        # instead of being mistaken for a missing dataset.
        print(f"ℹ️ Dataset '{dataset_name}' not found. Creating it...")
        create_kwargs = {
            "name": dataset_name,
            "title": title or dataset_name,
            "notes": notes,
            "private": True
        }
        if org_id:
            create_kwargs["owner_org"] = org_id
        dataset = ckan.action.package_create(**create_kwargs)
        print(f"✅ Created dataset '{dataset_name}'.")
    else:
        print(f"✅ Dataset '{dataset_name}' already exists.")
    return dataset


In [4]:
import os
from io import StringIO, BytesIO # Ensure these are imported at the top of your notebook

def upload_or_update_resource(ckan, dataset_id, file_content, name, fmt):
    """
    Upload ``file_content`` as the resource called ``name`` of a CKAN dataset.

    If the dataset already has a resource with that name it is updated through
    ``resource_update``; otherwise a new one is created through
    ``resource_create``.

    Parameters
    ----------
    ckan : client exposing ``action.package_show``, ``action.resource_update``
        and ``action.resource_create``.
    dataset_id : str
        Dataset identifier passed to ``package_show`` and used as ``package_id``
        when creating a resource.
    file_content : str, os.PathLike or readable file-like object
        Either a path to an existing file (opened in binary mode and closed by
        this function) or an object with a ``read`` method such as
        ``StringIO``/``BytesIO`` (left open for the caller).
    name : str
        Resource name; also the key used to find an existing resource.
    fmt : str
        Value sent as the resource ``format``.

    Returns
    -------
    The resource as returned by ``resource_update`` or ``resource_create``.

    Raises
    ------
    ValueError
        If ``file_content`` is neither a readable object nor an existing path.
    """
    # Look for a resource with the same name in the dataset.
    dataset = ckan.action.package_show(id=dataset_id)
    existing_resource = next(
        (res for res in dataset["resources"] if res["name"] == name),
        None,
    )

    # Resolve the upload source; remember whether this function owns the handle.
    opened_here = False
    if hasattr(file_content, "read"):
        upload_file_object = file_content
        # A buffer that was just written, or already uploaded once, sits at EOF
        # and would be sent as an empty file: rewind it when possible.
        seekable = getattr(upload_file_object, "seekable", None)
        if callable(seekable) and seekable():
            upload_file_object.seek(0)
    elif isinstance(file_content, (str, os.PathLike)) and os.path.exists(file_content):
        upload_file_object = open(file_content, "rb")
        opened_here = True
    else:
        raise ValueError("file_content must be a valid file path or a file-like object (StringIO/BytesIO).")

    try:
        upload_data = {
            "name": name,
            "format": fmt,
            "upload": upload_file_object
        }

        if existing_resource:
            # Update existing resource
            upload_data["id"] = existing_resource["id"]
            res = ckan.action.resource_update(**upload_data)
            print(f"🔄 Updated existing resource: {res['id']}")
        else:
            # Create new resource
            upload_data["package_id"] = dataset_id
            res = ckan.action.resource_create(**upload_data)
            print(f"✅ Created new resource: {res['id']}")
    finally:
        # Close only handles opened here. The flag keeps this independent of
        # whether the path still exists once the upload has finished.
        if opened_here:
            upload_file_object.close()

    return res

## Load data

In [5]:
# Download the export from the open-data API.
# The timeout keeps a stalled connection from hanging the notebook forever.
data_response = requests.get((API_URL + DATASET_URL), headers=headers, timeout=60)

# Stop here on an HTTP error (e.g. a missing or rejected API key) instead of
# parsing the error body as if it were the dataset.
data_response.raise_for_status()

# Manually decode with correct encoding
decoded_text = data_response.content.decode('utf-8')  # or 'cp1252' if needed

# Load into StringIO for pandas
csv_file = StringIO(decoded_text)
df = pd.read_csv(csv_file)

# # Preview the data

# print(df.columns)
# df = df[['Gis_ID', 'N_Gis', 'Gis_X', 'Gis_Y', "Data d'Alta", 'Data de Baixa',
#        'Situació', 
#        'Barri', 'Districte', 'Tipus', 'Quantitat', 'Mesura', ]]
df.head()


Unnamed: 0,Dim-00:TEMPS,Dim-01:TERRITORI,Dim-01:TERRITORI (order),Dim-01:TERRITORI (type),Dim-02:TIPUS DE ETAPA,VALUE
0,2015-01-01T00:00:00Z,Barcelona,-1,Municipi,Connexió,80657.0
1,2015-01-01T00:00:00Z,Barcelona,-1,Municipi,Interna,3280032.0
2,2016-01-01T00:00:00Z,Barcelona,-1,Municipi,Connexió,58655.0
3,2016-01-01T00:00:00Z,Barcelona,-1,Municipi,Interna,2385291.0
4,2017-01-01T00:00:00Z,Barcelona,-1,Municipi,Interna,2287812.0


## Clean dataset

In [6]:
# 🧹 Step 4 (Optional): Clean your dataset

# Normalise the time dimension, when present, to 'YYYY-MM-DDTHH:MM:SS' text.
# Values that cannot be parsed are coerced rather than raising.
time_col = 'Dim-00:TEMPS'
if time_col in df.columns:
    parsed_times = pd.to_datetime(df[time_col], errors='coerce')
    df[time_col] = parsed_times.dt.strftime('%Y-%m-%dT%H:%M:%S')

# Force every column named "value" (any letter case) to a numeric dtype;
# unparseable entries are coerced rather than raising.
value_cols = [col for col in df.columns if col.upper() == "VALUE"]
for col in value_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')


## Save 

In [7]:
# ## Save
# Serialise the frame twice, entirely in memory: text CSV and binary Parquet.
csv_buffer = StringIO()
parquet_buffer = BytesIO()

df.to_csv(csv_buffer, index=False)
df.to_parquet(parquet_buffer, index=False)

# Writing leaves each buffer positioned at its end; rewind both so the next
# reader starts from the first byte.
for buffer in (csv_buffer, parquet_buffer):
    buffer.seek(0)

print("✅ Data converted to in-memory CSV and Parquet buffers.")

✅ Data converted to in-memory CSV and Parquet buffers.


## Create/update dataset

In [8]:
# Look the target dataset up on CKAN, creating it under the organization
# when it does not exist yet.
dataset_options = {
    "title": DATASET_TITLE,
    "notes": DATASET_DESCRIPTION,
    "org_id": ORG_INFO["id"],  # optional
}
dataset = get_or_create_dataset(ckan, DATASET_NAME, **dataset_options)


✅ Dataset 'trips_walking' already exists.


## Upload dataset

### CSV

In [11]:
# ## Upload dataset

# Upload both in-memory buffers; each label doubles as the resource-name
# suffix and the CKAN format.
for upload_buffer, fmt_label in ((csv_buffer, "CSV"), (parquet_buffer, "Parquet")):
    # An upload reads the buffer to its end. Rewind first so that re-running
    # this cell sends the full content again instead of an empty file.
    upload_buffer.seek(0)
    upload_or_update_resource(
        ckan,
        dataset_id=dataset["id"],
        file_content=upload_buffer,
        name=f"{DATASET_TITLE} ({fmt_label})",
        fmt=fmt_label
    )

print("✅ Both CSV and Parquet resources uploaded directly from memory.")

🔄 Updated existing resource: 619673e7-3b51-4bf5-b0ca-bc035eb4136e
🔄 Updated existing resource: da447b40-2523-41ef-a95b-dbad84ad2320
✅ Both CSV and Parquet resources uploaded directly from memory.
