# 01 Data Preparation

Load and clean the OSM-derived grid data, then package it for remote runs.


In [1]:
from pathlib import Path
import sys
import os
import pypsa
import numpy as np

def find_repo_root(max_up=6):
    """Locate the repository root by walking up from the working directory.

    A directory is accepted as the root when it contains a ``README.md`` or a
    ``.git`` entry. The working directory itself is inspected first, then its
    ancestors, for at most ``max_up`` directories in total.

    Parameters
    ----------
    max_up : int, default 6
        Maximum number of directories to inspect, counting the start.

    Returns
    -------
    pathlib.Path
        The first directory holding a marker, or the resolved working
        directory when none of the inspected directories has one.
    """
    start = Path.cwd().resolve()
    # The start directory followed by every ancestor up to the filesystem root.
    lineage = [start, *start.parents]
    for candidate in lineage[:max(max_up, 0)]:
        if any((candidate / marker).exists() for marker in ('README.md', '.git')):
            return candidate
    # No marker found within the search budget: fall back to the start directory.
    return start

# Resolve the project layout once; the following cells build their paths from repo_root.
repo_root = find_repo_root()
src_path = repo_root.joinpath('src/')

# Register the local source directory on the import path. The membership test
# keeps re-runs of this cell from adding duplicates; index 1 leaves sys.path[0]
# ahead of it.
if str(src_path) not in sys.path:
    sys.path.insert(1, str(src_path))
print(f"Using src path: {src_path}", f"Repository root: {repo_root}", sep="\n")

# Must come after the sys.path tweak above, otherwise the package under src/
# would not be importable from here.
import pypsa_simplified as ps

Using src path: /Users/jedrek/Documents/Studium Volkswirschaftslehre/3. Semester/European Energy Policy/HA/PyPSA---Simplified-European-Model/PyPSA---Simplified-European-Model/src
Repository root: /Users/jedrek/Documents/Studium Volkswirschaftslehre/3. Semester/European Energy Policy/HA/PyPSA---Simplified-European-Model/PyPSA---Simplified-European-Model


## Data sources and parsing
- **OSM prebuilt electricity network** (`data/raw/OSM Prebuilt Electricity Network/`): buses, lines, links, converters, transformers.
- **Custom CSV parsing**: `prepare_osm_source` uses a geometry-safe loader (handles commas inside WKT) to keep column counts correct.
- **Endpoint extraction**: First/last coordinates are pulled from WKT to map line/link endpoints to buses (tolerance 1e-5 degrees).
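- **Country filter**: The documented default is DE/FR/PL/AT/IT; adjust via `countries` if needed. Note that the bus table displayed further down still lists other countries (e.g. AL), so confirm where this filter is actually applied.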


In [2]:
# HTML is imported from its public location; importing display helpers from
# IPython.core.display is deprecated.
from IPython.display import HTML
import pandas as pd
from pypsa_simplified import prepare_osm_source
from pypsa_simplified import prepare_generator_data

# Parse the OSM prebuilt network files into a mapping of component name -> table.
osm_dir = repo_root / "data" / "raw" / "OSM Prebuilt Electricity Network"
sources = prepare_osm_source(osm_dir)
# Quick sanity check: show each table's shape; entries without a .shape
# attribute (e.g. None for components not loaded yet) are printed as-is.
print({k: v.shape if hasattr(v, 'shape') else v for k, v in sources.items()})

# Wrap the parsed tables in the project's RawData container.
RawData = ps.data_prep.RawData(sources)

# Load the power plant table and attach it under the 'generators' key,
# which prepare_osm_source leaves empty.
source = prepare_generator_data(repo_root / "data" / "raw" / "powerplants.csv")
RawData.data['generators'] = source

{'buses': (6737, 10), 'lines': (8994, 16), 'converters': (67, 7), 'links': (38, 10), 'transformers': (875, 8), 'generators': None, 'loads': None, 'storage': None}


In [3]:
# Inspect the parsed bus table; as the cell's last expression it is rendered
# as a rich table (long frames are shown truncated, first and last rows only).
RawData.data['buses']

Unnamed: 0,bus_id,voltage,dc,symbol,under_construction,tags,x,y,country,geometry
0,AL1-220,220.0,f,Substation,f,AL1,20.111702,42.096798,AL,POINT (20.111702399999995 42.09679778825765)
1,AL10-220,220.0,f,Substation,f,AL10,19.522149,40.478444,AL,POINT (19.522149214338864 40.47844442711058)
2,AL2-220,220.0,f,Substation,f,AL2,20.031168,42.069050,AL,POINT (20.031168199999993 42.06905038823308)
3,AL3-400,400.0,f,Substation,f,AL3,19.652495,42.010831,AL,POINT (19.652494799999996 42.01083068818158)
4,AL4-220,220.0,f,Substation,f,AL4,19.972850,41.584771,AL,POINT (19.9728498 41.58477058780021)
...,...,...,...,...,...,...,...,...,...,...
6732,way/99694910-380,380.0,f,Substation,f,way/99694910,7.789278,44.462732,IT,POINT (7.7892782005302115 44.46273179679329)
6733,way/99722046-225,225.0,f,Substation,f,way/99722046,-0.114171,47.206984,FR,POINT (-0.1141712110057602 47.206983706128305)
6734,way/99722046-400,400.0,f,Substation,f,way/99722046,-0.113930,47.207199,FR,POINT (-0.113930394824857 47.20719863601106)
6735,way/99826025-220,220.0,f,Substation,f,way/99826025,7.785444,44.875461,IT,POINT (7.785443587366926 44.87546070934813)


In [4]:
# Scratch check: render 1e-6 without scientific notation, then count the digits
# after the decimal point.
# NOTE(review): presumably used to derive a rounding precision from a tolerance — confirm.
print(np.format_float_positional(1e-6, trim='-'))
len(np.format_float_positional(1e-6, trim='-').partition(".")[2])

0.000001


6

## Serialize for remote processing
The serialized artifact is compact (gzip + pickle) and ready to `scp` to the server.


In [5]:
# Persist the prepared inputs (OSM tables plus generators) as one artifact
# under data/processed; save() reports the location it wrote to.
written = RawData.save(
    output_path=repo_root.joinpath("data", "processed", "osm_source_data.pkl.gz")
)
print(f"RawData.save wrote: {written}")

RawData.save wrote: /Users/jedrek/Documents/Studium Volkswirschaftslehre/3. Semester/European Energy Policy/HA/PyPSA---Simplified-European-Model/PyPSA---Simplified-European-Model/data/processed/osm_source_data.pkl.gz


In [6]:
# Round-trip check: read the saved artifact back into a fresh, empty container.
RawData = ps.data_prep.RawData(None)
# Build the path explicitly so this cell does not depend on the `written`
# variable left behind by the save cell.
load_path = repo_root / "data" / "processed" / "osm_source_data.pkl.gz"
# The artifact is a gzip-compressed pickle: only load files you produced
# yourself, since unpickling untrusted data can execute arbitrary code.
# NOTE(review): the return value of load() is ignored, so this assumes it
# fills the instance in place — confirm against RawData.load.
RawData.load(input_path=load_path)
print(f"Loaded RawData from: {load_path}")

Loaded RawData from: /Users/jedrek/Documents/Studium Volkswirschaftslehre/3. Semester/European Energy Policy/HA/PyPSA---Simplified-European-Model/PyPSA---Simplified-European-Model/data/processed/osm_source_data.pkl.gz


In [7]:
# Read the preprocessed JRC population table into a pandas DataFrame.
parquet_path = repo_root.joinpath("data", "processed", "jrc_population_nonzero.parquet")
jrc_pop = pd.read_parquet(parquet_path)
# Report the source and size so a stale or truncated file is easy to spot.
print(f"Loaded JRC population data from: {parquet_path}, shape: {jrc_pop.shape}")

Loaded JRC population data from: /Users/jedrek/Documents/Studium Volkswirschaftslehre/3. Semester/European Energy Policy/HA/PyPSA---Simplified-European-Model/PyPSA---Simplified-European-Model/data/processed/jrc_population_nonzero.parquet, shape: (34363796, 3)


In [8]:
# Order rows by latitude, then longitude, and renumber the index 0..n-1.
# Re-running the cell is harmless: sorting an already sorted frame is a no-op.
jrc_pop = jrc_pop.sort_values(by=['lat', 'lon'], ignore_index=True)
jrc_pop.head()

Unnamed: 0,lon,lat,population
0,-17.979866,27.639805,7
1,-17.982065,27.640021,5
2,-17.978908,27.640127,16
3,-17.984263,27.640237,30
4,-17.983305,27.640559,13


## Create a network
Use the loaded `RawData` to build the first PyPSA network.

In [None]:
# Hourly time index covering all of 2024 (a leap year, so 366 * 24 steps).
snapshots = pd.date_range(start="2024-01-01", end="2024-12-31 23:00", freq="h")

# Options handed to the network builder.
# NOTE(review): None for the three filters presumably means "no restriction" —
# confirm against ps.build_network.
NETWORK_METADATA = dict(
    name='Simplified European Electricity Network',
    snapshots=snapshots,
    countries=None,
    generation_carriers=None,
    transmission_carriers=None,
)

In [None]:
# Create an empty PyPSA network and pass it, together with the prepared RawData
# and the NETWORK_METADATA options, to the project's builder.
# `n` is rebound to whatever build_network returns.
# NOTE(review): whether build_network also mutates the passed-in network in
# place is not visible from this notebook — confirm in pypsa_simplified.
n = pypsa.Network()
n = ps.build_network(n, RawData, options=NETWORK_METADATA)



### Next
- Use `notebooks/main.ipynb` to transfer the artifact and trigger the remote optimization.
- For custom country lists or tolerance, pass `countries`/`tol` to `prepare_osm_source`.
