# Import a dataset into REALLOCATE CKAN

In [6]:
import os
from pathlib import Path

import pandas as pd
from ckanapi import NotFound, RemoteCKAN
from dotenv import load_dotenv

In [25]:
# ⚙️ Step 2: Set up connection and metadata

# Load variables from a local .env file (if one exists) into the environment
# so that REALLOCATE_KEY can be read below without hardcoding the key.
load_dotenv()

FILE_NAME = "cleaned_file.csv"

CKAN_URL = "https://reallocate-ckan.iti.gr"
API_KEY = os.getenv("REALLOCATE_KEY")  # None when the variable is not set
DATASET_NAME = "Test-parquet"  # must exist, or you can create it

# The client has to be created before any `ckan.action.*` call is made,
# otherwise this cell fails with a NameError on a fresh kernel.
ckan = RemoteCKAN(CKAN_URL, apikey=API_KEY)

# Organization record; its "id" is used as the owner when creating a dataset.
ORG_INFO = ckan.action.organization_show(id="bsc")


In [21]:
def get_or_create_dataset(ckan, dataset_name, title=None, notes="", org_id=None):
    """
    Get a CKAN dataset by name, creating it (as private) when it is missing.

    Args:
        ckan: CKAN client exposing ``action.package_show`` and
            ``action.package_create`` (e.g. a ``ckanapi.RemoteCKAN``).
        dataset_name: Name of the dataset to look up or create.
        title: Title for a newly created dataset; falls back to
            ``dataset_name`` when empty or ``None``.
        notes: Description for a newly created dataset.
        org_id: Optional owning organization; only sent when truthy.

    Returns:
        The dataset dict returned by ``package_show`` or ``package_create``.

    Raises:
        Any error from ``package_show`` other than ``NotFound`` (for example
        authorization or connection failures) and any error from
        ``package_create`` is propagated to the caller.
    """
    try:
        dataset = ckan.action.package_show(id=dataset_name)
    except NotFound:
        # Only a genuine "not found" triggers creation. Other failures must
        # surface instead of being mistaken for a missing dataset.
        print(f"ℹ️ Dataset '{dataset_name}' not found. Creating it...")
        create_kwargs = {
            "name": dataset_name,
            "title": title or dataset_name,
            "notes": notes,
            "private": True
        }
        if org_id:
            create_kwargs["owner_org"] = org_id
        dataset = ckan.action.package_create(**create_kwargs)
        print(f"✅ Created dataset '{dataset_name}'.")
    else:
        print(f"✅ Dataset '{dataset_name}' already exists.")
    return dataset


## Load data

In [11]:
# Read the CSV named in the configuration cell into a DataFrame.
df = pd.read_csv(filepath_or_buffer=FILE_NAME)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Dim-00:TEMPS,Dim-01:TERRITORI,Dim-01:TERRITORI (order),Dim-01:TERRITORI (type),Dim-02:TIPUS DE ETAPA,VALUE
0,2015-01-01T00:00:00,Barcelona,-1,Municipi,Connexió,80657.0
1,2015-01-01T00:00:00,Barcelona,-1,Municipi,Interna,3280032.0
2,2016-01-01T00:00:00,Barcelona,-1,Municipi,Connexió,58655.0
3,2016-01-01T00:00:00,Barcelona,-1,Municipi,Interna,2385291.0
4,2017-01-01T00:00:00,Barcelona,-1,Municipi,Interna,2287812.0


## Clean dataset

In [9]:
# 🧹 Step 4 (Optional): Clean your dataset

# Normalise the time column to ISO-like strings (YYYY-MM-DDTHH:MM:SS).
# `errors='coerce'` turns values that cannot be parsed into missing values
# instead of raising.
if 'Dim-00:TEMPS' in df.columns:
    parsed_times = pd.to_datetime(df['Dim-00:TEMPS'], errors='coerce')
    df['Dim-00:TEMPS'] = parsed_times.dt.strftime('%Y-%m-%dT%H:%M:%S')

# Force any column called "VALUE" (case-insensitive) to a numeric dtype;
# entries that are not numbers become NaN.
for col in df.columns:
    if col.upper() != "VALUE":
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')


## Create/update dataset

In [26]:
# Metadata for the target dataset; title, notes and org_id are only used
# by get_or_create_dataset when the dataset has to be created.
dataset_metadata = {
    "dataset_name": "cycling-counts-bcn",
    "title": "Cycling Counts in Barcelona",
    "notes": "Bike counter data exported from municipal records",
    "org_id": ORG_INFO["id"],  # optional
}
dataset = get_or_create_dataset(ckan, **dataset_metadata)


ℹ️ Dataset 'cycling-counts-bcn' not found. Creating it...
✅ Created dataset 'cycling-counts-bcn'.


## Upload dataset

In [28]:
# ✅ Add data to CKAN: upload file and push to DataStore

# 🔧 Set local file and metadata
# Reuse the file name from the configuration cell so the uploaded file is the
# same one that was loaded into `df`.
FILE_PATH = FILE_NAME
RESOURCE_NAME = "Cycling Data 2023"

# 📁 1. Upload file as resource (preserves original format)
# NOTE: resource_create is called unconditionally, so every run of this cell
# adds another resource to the dataset.
# The `with` block guarantees the file handle is closed even if the upload fails.
with open(FILE_PATH, "rb") as upload_file:
    resource = ckan.action.resource_create(
        package_id=dataset["id"],
        name=RESOURCE_NAME,
        format=Path(FILE_PATH).suffix.lstrip(".").upper(),  # derived from the file extension
        upload=upload_file,
    )
print(f"✅ File uploaded as resource: {resource['id']}")

# 🧠 2. Upload structured data to DataStore
# Uses the DataFrame `df` loaded earlier in the notebook.

# Create CKAN field definitions from df types
fields = []
for col in df.columns:
    dtype = df[col].dtype
    if pd.api.types.is_numeric_dtype(dtype):
        ftype = "numeric"
    elif pd.api.types.is_datetime64_any_dtype(dtype):
        ftype = "timestamp"
    else:
        ftype = "text"
    fields.append({"id": col, "type": ftype})

# Convert to CKAN records format. Missing values (NaN/NaT, e.g. produced by
# the `errors='coerce'` cleaning step) are replaced with None so they are sent
# as JSON null; NaN is not valid JSON.
records = [
    {key: (None if pd.isna(value) else value) for key, value in row.items()}
    for row in df.to_dict(orient="records")
]

# Push the rows to the DataStore table of the resource created above.
# NOTE(review): confirm against the CKAN DataStore docs that force=True is
# required for resources created from a file upload.
ckan.action.datastore_create(
    resource_id=resource["id"],
    fields=fields,
    records=records,
    force=True
)
print(f"✅ Data uploaded to DataStore: {len(records)} records")

✅ File uploaded as resource: 65b382e5-351a-4751-8cb2-9537f9fb8017
✅ Data uploaded to DataStore: 18 records
