In [1]:
path_metadata = "data/metadata.json"

In [2]:
import json
import csv
from pathlib import Path

path_metadata = Path("data/metadata.json")  # adapte si besoin
out_path = Path("data/metadata.csv")

data = json.loads(path_metadata.read_text(encoding="utf-8"))

with out_path.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # en-tête
    writer.writerow(["File name"] + [k for k in data.keys() if k != "File name"])
    # lignes
    file_names = data.get("File name", [])
    n = len(file_names)
    for i in range(n):
        row = [data.get("File name", [])[i]]
        for k in data.keys():
            if k == "File name":
                continue
            row.append(data.get(k, [None]*n)[i] if isinstance(data.get(k), list) else data.get(k))
        writer.writerow(row)

print("ok ->", out_path)


ok -> data/metadata.csv


In [3]:
import json
import csv
import unicodedata
import re
from pathlib import Path

path_metadata = Path("./data/metadata.json")
dir_fantoir = Path("./data/fantoir_communes")
out_path = Path("./data/metadata_insee.json")

def normalize(s):
    s = s or ""
    s = s.lower()
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = re.sub(r"\s+", " ", s).strip()
    return s

def slugify(s):
    s = normalize(s)
    s = s.replace("'", " ")
    s = re.sub(r"[^a-z0-9]+", "_", s).strip("_")
    return s

def find_fantoir_file(city):
    # ex: "L'Isle-sur-la-Sorgue" -> isle_sur_sorgue_fantoir.csv
    base = slugify(city)
    candidates = list(dir_fantoir.glob(f"{base}*.csv"))
    if candidates:
        return candidates[0]
    # fallback: drop leading article (le/la/l)
    for prefix in ("le_", "la_", "l_"):
        if base.startswith(prefix):
            base2 = base[len(prefix):]
            candidates = list(dir_fantoir.glob(f"{base2}*.csv"))
            if candidates:
                return candidates[0]
    return None

def code_insee_from_fantoir(city, edifice):
    path = find_fantoir_file(city)
    if not path:
        return None
    target = normalize(edifice)
    if not target:
        return None
    with path.open("r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            mot = normalize(row.get("mot_directeur"))
            if target in mot or mot in target:
                ident = row.get("id", "")
                if ident and "-" in ident:
                    return ident.split("-", 1)[0]  # code INSEE
    return None

data = json.loads(path_metadata.read_text(encoding="utf-8"))

# on gère "Édifice" ou "édifice"
key_edifice = "Édifice" if "Édifice" in data else "édifice"

file_names = data.get("File name", [])
n = len(file_names)

codes = []
for i in range(n):
    edifice = data.get(key_edifice, [""] * n)[i]
    if not edifice:
        codes.append("")
        continue
    city = data.get("City", [""] * n)[i]
    code = code_insee_from_fantoir(city, edifice)
    codes.append(code or "")

data["code_insee"] = codes

out_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
print("ok ->", out_path)


ok -> data/metadata_insee.json


In [4]:
import json
import csv
from pathlib import Path

in_path = Path("./data/metadata_insee.json")
out_path = Path("./data/metadata_insee.csv")

data = json.loads(in_path.read_text(encoding="utf-8"))

# Les colonnes sont les clés du dict
fields = list(data.keys())

# Nombre de lignes basé sur "File name"
n = len(data.get("File name", []))

with out_path.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    for i in range(n):
        row = []
        for k in fields:
            v = data.get(k, "")
            if isinstance(v, list):
                row.append(v[i] if i < len(v) else "")
            else:
                row.append(v)
        writer.writerow(row)

print("ok ->", out_path)


ok -> data/metadata_insee.csv


In [5]:
import json
import csv
import unicodedata
import re
from pathlib import Path

path_metadata = Path("data/metadata.json")
dir_fantoir = Path("data/fantoir_communes")
out_path = Path("data/metadata_insee.csv")

def normalize(s):
    s = (s or "").lower()
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = re.sub(r"\s+", " ", s).strip()
    return s

def slugify(s):
    s = normalize(s).replace("'", " ")
    s = re.sub(r"[^a-z0-9]+", "_", s).strip("_")
    return s

def find_fantoir_file(city):
    base = slugify(city)
    candidates = list(dir_fantoir.glob(f"{base}*.csv"))
    if candidates:
        return candidates[0]
    for prefix in ("le_", "la_", "l_"):
        if base.startswith(prefix):
            base2 = base[len(prefix):]
            candidates = list(dir_fantoir.glob(f"{base2}*.csv"))
            if candidates:
                return candidates[0]
    return None

def code_insee_from_fantoir(city, edifice):
    path = find_fantoir_file(city)
    if not path:
        return ""
    target = normalize(edifice)
    if not target:
        return ""
    with path.open("r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            mot = normalize(row.get("mot_directeur"))
            if target in mot or mot in target:
                ident = row.get("id", "")
                if ident and "-" in ident:
                    return ident.split("-", 1)[0]
    return ""

data = json.loads(path_metadata.read_text(encoding="utf-8"))

key_edifice = "Édifice" if "Édifice" in data else "édifice"
file_names = data.get("File name", [])
n = len(file_names)

fields = list(data.keys()) + ["code_insee"]

with out_path.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    for i in range(n):
        row = []
        for k in data.keys():
            v = data.get(k, "")
            if isinstance(v, list):
                row.append(v[i] if i < len(v) else "")
            else:
                row.append(v)
        edifice = data.get(key_edifice, [""] * n)[i]
        city = data.get("City", [""] * n)[i]
        code = code_insee_from_fantoir(city, edifice) if edifice else ""
        row.append(code)
        writer.writerow(row)

print("ok ->", out_path)


ok -> data/metadata_insee.csv


### gros échec je v tenter pour une seule ville

In [8]:
path_metadata = "./data/metadata.json"
path_fantoir_apt = "./data/fantoir_communes/apt_fantoir.csv"

In [9]:
import json
import csv
import unicodedata
import re
from pathlib import Path

path_metadata = Path("data/metadata.json")
path_fantoir = Path("data/fantoir_communes/apt_fantoir.csv")

def normalize(s):
    s = (s or "").lower()
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = re.sub(r"\s+", " ", s).strip()
    return s

def find_code_insee_from_matiere(matiere):
    target = normalize(matiere)
    if not target:
        return ""
    with path_fantoir.open("r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            mot = normalize(row.get("mot_directeur"))
            if mot == target:
                ident = row.get("id", "")
                if ident and "-" in ident:
                    return ident.split("-", 1)[0]
    return ""

data = json.loads(path_metadata.read_text(encoding="utf-8"))

key_matiere = "Matière" if "Matière" in data else "matière"
file_names = data.get("File name", [])
n = len(file_names)

codes = []
for i in range(n):
    city = data.get("City", [""] * n)[i]
    matiere = data.get(key_matiere, [""] * n)[i]
    if city == "Apt" and matiere:
        code = find_code_insee_from_matiere(matiere)
        codes.append(code)
    else:
        codes.append("")

data["code_insee"] = codes

path_metadata.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
print("ok ->", path_metadata)


ok -> data/metadata.json


In [24]:
import json
import csv
from pathlib import Path

path_metadata = Path("data/metadata.json")  # adapte si besoin
out_path = Path("data/metadata.csv")

data = json.loads(path_metadata.read_text(encoding="utf-8"))

with out_path.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # en-tête
    writer.writerow(["File name"] + [k for k in data.keys() if k != "File name"])
    # lignes
    file_names = data.get("File name", [])
    n = len(file_names)
    for i in range(n):
        row = [data.get("File name", [])[i]]
        for k in data.keys():
            if k == "File name":
                continue
            row.append(data.get(k, [None]*n)[i] if isinstance(data.get(k), list) else data.get(k))
        writer.writerow(row)

print("ok ->", out_path)


ok -> data/metadata.csv


In [11]:
import csv
from pathlib import Path

in_path = Path("data/v_commune_2025.csv")
out_path = Path("data/v_commune_2025_reduit.csv")

with in_path.open("r", encoding="utf-8") as f_in, out_path.open("w", newline="", encoding="utf-8") as f_out:
    reader = csv.DictReader(f_in)
    writer = csv.DictWriter(f_out, fieldnames=["COM", "NCC"])
    writer.writeheader()
    for row in reader:
        writer.writerow({"COM": row.get("COM", ""), "NCC": row.get("NCC", "")})

print("ok ->", out_path)


ok -> data/v_commune_2025_reduit.csv


In [12]:
import csv
from pathlib import Path

in_path = Path("data/v_commune_2025_reduit.csv")
out_path = Path("data/v_commune_2025_reduit_84.csv")

with in_path.open("r", encoding="utf-8") as f_in, out_path.open("w", newline="", encoding="utf-8") as f_out:
    reader = csv.DictReader(f_in)
    writer = csv.DictWriter(f_out, fieldnames=reader.fieldnames)
    writer.writeheader()
    for row in reader:
        com = row.get("COM", "")
        if com.startswith("84"):
            writer.writerow(row)

print("ok ->", out_path)


ok -> data/v_commune_2025_reduit_84.csv


In [13]:
import json
import csv
import unicodedata
import re
from pathlib import Path

path_metadata = Path("data/metadata.json")
path_communes = Path("data/v_commune_2025_reduit_84.csv")

def normalize(s):
    s = (s or "").lower()
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = re.sub(r"\s+", " ", s).strip()
    return s

def strip_parens(s):
    return re.sub(r"\s*\(.*?\)\s*", " ", s or "").strip()

# Map commune name -> code INSEE
name_to_code = {}
with path_communes.open("r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        name = normalize(row.get("NCC"))
        code = row.get("COM", "")
        if name and code:
            name_to_code[name] = code

data = json.loads(path_metadata.read_text(encoding="utf-8"))

geo_names = data.get("Nom géographique", [])
n = len(geo_names)

codes = []
for i in range(n):
    raw = geo_names[i]
    cleaned = strip_parens(raw)
    code = name_to_code.get(normalize(cleaned), "")
    codes.append(code)

data["code_insee"] = codes

path_metadata.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
print("ok ->", path_metadata)


ok -> data/metadata.json


In [18]:
import json
from pathlib import Path
import re

path_metadata = Path("data/metadata.json")

data = json.loads(path_metadata.read_text(encoding="utf-8"))

geo = data.get("Nom géographique", [])
codes = data.get("code_insee", [])

missing = set()

for i in range(len(geo)):
    code = codes[i] if i < len(codes) else ""
    if not code:
        # nettoie les parenthèses pour afficher un nom propre
        name = re.sub(r"\s*\(.*?\)\s*", " ", geo[i]).strip()
        if name:
            missing.add(name)

for name in sorted(missing):
    print(name)


Beaumes-de-Venise
Beaumont-de-Pertuis
Cabrières-d'Aigues
Camaret-sur-Aigues
Caumont-sur-Durance
Châteauneuf-du-Pape
Crillon-le-Brave
Dentelles de Montmirail
Entraigues-sur-la-Sorgue
Fontaine-de-Vaucluse
L'Isle-sur-la-Sorgue
La Bastide-des-Jourdans
La Bastidonne
La Tour-d'Aigues
Le Barroux
Le Beaucet
Le Pontet
Le Thor
Morières-lès-Avignon
Pernes-les-Fontaines
Peypin-d'Aigues
Saint-Christol
Saint-Didier
Saint-Martin-de-Castillon
Saint-Martin-de-la-Brasque
Saint-Pantaléon
Saint-Saturnin-lès-Apt
Sainte-Cécile-les-Vignes
Saumanes-de-Vaucluse
Savoillans
Sérignan-du-Comtat
Vaison-la-Romaine
Ventoux
Villes-sur-Auzon


In [None]:
liste_code = ["84012", "84014", "84024","84029","84034","84037","84041","rien","84043","84139","84054","84009","84010","84133","84008","84011","84092","84132","84081","84088","84090","84107","84108","84112","84113","84114","84118","84106","84124","84125","84127","84137","rien","84148"]

In [17]:
len(liste_code)

33

In [19]:
import json
import re
from pathlib import Path

path_metadata = Path("data/metadata.json")

villes = [
    "Beaumes-de-Venise",
    "Beaumont-de-Pertuis",
    "Cabrières-d'Aigues",
    "Camaret-sur-Aigues",
    "Caumont-sur-Durance",
    "Châteauneuf-du-Pape",
    "Crillon-le-Brave",
    "Dentelles de Montmirail",
    "Entraigues-sur-la-Sorgue",
    "Fontaine-de-Vaucluse",
    "L'Isle-sur-la-Sorgue",
    "La Bastide-des-Jourdans",
    "La Bastidonne",
    "La Tour-d'Aigues",
    "Le Barroux",
    "Le Beaucet",
    "Le Pontet",
    "Le Thor",
    "Morières-lès-Avignon",
    "Pernes-les-Fontaines",
    "Peypin-d'Aigues",
    "Saint-Christol",
    "Saint-Didier",
    "Saint-Martin-de-Castillon",
    "Saint-Martin-de-la-Brasque",
    "Saint-Pantaléon",
    "Saint-Saturnin-lès-Apt",
    "Sainte-Cécile-les-Vignes",
    "Saumanes-de-Vaucluse",
    "Savoillans",
    "Sérignan-du-Comtat",
    "Vaison-la-Romaine",
    "Ventoux",
    "Villes-sur-Auzon",
]

codes = [
    "84012", "84014", "84024", "84029", "84034", "84037", "84041",
    "", "84043", "84139", "84054", "84009", "84010", "84133", "84008",
    "84011", "84092", "84132", "84081", "84088", "84090", "84107",
    "84108", "84112", "84113", "84114", "84118", "84106", "84124",
    "84125", "84127", "84137", "", "84148"
]

mapping = dict(zip(villes, codes))

data = json.loads(path_metadata.read_text(encoding="utf-8"))

geo = data.get("Nom géographique", [])
codes_insee = data.get("code_insee", [""] * len(geo))

for i, raw in enumerate(geo):
    if i < len(codes_insee) and codes_insee[i]:
        continue  # déjà renseigné
    name = re.sub(r"\s*\(.*?\)\s*", " ", raw).strip()
    code = mapping.get(name, "")
    if code:
        if i < len(codes_insee):
            codes_insee[i] = code
        else:
            codes_insee.append(code)

data["code_insee"] = codes_insee

path_metadata.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
print("ok ->", path_metadata)


ok -> data/metadata.json


In [22]:
import csv
from pathlib import Path

in_path = Path("./data/20230823-communes-departement-region.csv")
out_path = Path("./data/communes_vaucluse_coords.csv")

with in_path.open("r", encoding="utf-8") as f_in, out_path.open("w", newline="", encoding="utf-8") as f_out:
    reader = csv.DictReader(f_in)
    writer = csv.DictWriter(f_out, fieldnames=["code_commune_INSEE", "longitude", "latitude"])
    writer.writeheader()
    for row in reader:
        code = row.get("code_commune_INSEE", "")
        if code.startswith("84"):
            writer.writerow({
                "code_commune_INSEE": code,
                "longitude": row.get("longitude", ""),
                "latitude": row.get("latitude", "")
            })

print("ok ->", out_path)


ok -> data/communes_vaucluse_coords.csv


In [23]:
import json
import csv
from pathlib import Path

path_metadata = Path("data/metadata.json")
path_coords = Path("data/communes_vaucluse_coords.csv")

# Charge la table code_insee -> (lat, lon)
coords = {}
with path_coords.open("r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    # accepte plusieurs variantes de nom de colonne
    for row in reader:
        code = (
            row.get("code_commune_INSEE")
            or row.get("code_commune_insee")
            or row.get("COM")
            or ""
        )
        if code:
            coords[code] = (row.get("latitude", ""), row.get("longitude", ""))

data = json.loads(path_metadata.read_text(encoding="utf-8"))

codes = data.get("code_insee", [])
n = len(codes)

latitudes = []
longitudes = []

for i in range(n):
    code = codes[i]
    lat, lon = coords.get(code, ("", ""))
    latitudes.append(lat)
    longitudes.append(lon)

data["latitude"] = latitudes
data["longitude"] = longitudes

path_metadata.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
print("ok ->", path_metadata)


ok -> data/metadata.json


### on va tenter avec librairie osmnx pour monuments

In [26]:
!pip install osmnx

Collecting osmnx
  Downloading osmnx-2.0.7-py3-none-any.whl.metadata (4.9 kB)
Collecting geopandas>=1.0.1 (from osmnx)
  Downloading geopandas-1.1.2-py3-none-any.whl.metadata (2.3 kB)
Collecting networkx>=2.5 (from osmnx)
  Downloading networkx-3.6.1-py3-none-any.whl.metadata (6.8 kB)
Collecting shapely>=2.0 (from osmnx)
  Using cached shapely-2.1.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting pyogrio>=0.7.2 (from geopandas>=1.0.1->osmnx)
  Downloading pyogrio-0.12.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (5.9 kB)
Collecting pyproj>=3.5.0 (from geopandas>=1.0.1->osmnx)
  Downloading pyproj-3.7.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (31 kB)
Downloading osmnx-2.0.7-py3-none-any.whl (101 kB)
Downloading geopandas-1.1.2-py3-none-any.whl (341 kB)
Downloading networkx-3.6.1-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m11.5 MB/s[0m  [33m0:00:00[0m
[?25hDownloading pyogrio-0.12.1-cp313-cp313-macosx_1

In [29]:
!pip install tqdm

Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1


In [30]:
import json
import csv
import time
from pathlib import Path
import osmnx as ox
from tqdm import tqdm

path_metadata = Path("data/metadata.json")
out_json = Path("data/metadata_monu.json")
out_csv = Path("data/metadata_monu.csv")

data = json.loads(path_metadata.read_text(encoding="utf-8"))

edifices = data.get("Édifice", [])
cities = data.get("City", [])
geo_names = data.get("Nom géographique", [])
lat_communes = data.get("latitude", [])
lon_communes = data.get("longitude", [])

n = len(edifices)

coords = []
count_done = 0
max_edifices = 50

with tqdm(total=max_edifices, desc="Geocode édifices") as pbar:
    for i in range(n):
        edifice = edifices[i] if i < len(edifices) else ""
        if not edifice:
            coords.append("")
            continue

        if count_done >= max_edifices:
            coords.append("")  # ne pas géocoder au-delà de 50
            continue

        city = cities[i] if i < len(cities) else ""
        if not city:
            city = geo_names[i] if i < len(geo_names) else ""

        if city:
            query = f"{edifice}, {city}, Vaucluse, France"
        else:
            query = f"{edifice}, Vaucluse, France"

        try:
            lat, lon = ox.geocode(query)
            coords.append(f"{lat}, {lon}")
        except Exception:
            lat_c = lat_communes[i] if i < len(lat_communes) else ""
            lon_c = lon_communes[i] if i < len(lon_communes) else ""
            coords.append(f"{lat_c}, {lon_c}" if lat_c and lon_c else "")

        count_done += 1
        pbar.update(1)
        time.sleep(1)

data["coordonnées"] = coords

out_json.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

fields = list(data.keys())
with out_csv.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    for i in range(n):
        row = []
        for k in fields:
            v = data.get(k, "")
            if isinstance(v, list):
                row.append(v[i] if i < len(v) else "")
            else:
                row.append(v)
        writer.writerow(row)

print("ok ->", out_json, "and", out_csv)


Geocode édifices: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]

ok -> data/metadata_monu.json and data/metadata_monu.csv





In [31]:
import json
import csv
import time
from pathlib import Path
import osmnx as ox
from tqdm import tqdm

path_metadata = Path("data/test20/metadata_test20_enrichi.json")
out_json = Path("data/test20/metadata_test20_monu.json")
out_csv = Path("data/test20/metadata_test20_monu.csv")

data = json.loads(path_metadata.read_text(encoding="utf-8"))

edifices = data.get("Édifice", [])
cities = data.get("City", [])
geo_names = data.get("Nom géographique", [])
lat_communes = data.get("latitude", [])
lon_communes = data.get("longitude", [])

n = len(edifices)

coords = []
with tqdm(total=n, desc="Geocode édifices") as pbar:
    for i in range(n):
        edifice = edifices[i] if i < len(edifices) else ""
        if not edifice:
            coords.append("")
            pbar.update(1)
            continue

        city = cities[i] if i < len(cities) else ""
        if not city:
            city = geo_names[i] if i < len(geo_names) else ""

        if city:
            query = f"{edifice}, {city}, Vaucluse, France"
        else:
            query = f"{edifice}, Vaucluse, France"

        try:
            lat, lon = ox.geocode(query)
            coords.append(f"{lat}, {lon}")
        except Exception:
            lat_c = lat_communes[i] if i < len(lat_communes) else ""
            lon_c = lon_communes[i] if i < len(lon_communes) else ""
            coords.append(f"{lat_c}, {lon_c}" if lat_c and lon_c else "")

        pbar.update(1)
        time.sleep(1)

data["coordonnées"] = coords

out_json.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

fields = list(data.keys())
with out_csv.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    for i in range(n):
        row = []
        for k in fields:
            v = data.get(k, "")
            if isinstance(v, list):
                row.append(v[i] if i < len(v) else "")
            else:
                row.append(v)
        writer.writerow(row)

print("ok ->", out_json, "and", out_csv)


Geocode édifices: 100%|██████████| 20/20 [00:30<00:00,  1.51s/it]

ok -> data/test20/metadata_test20_monu.json and data/test20/metadata_test20_monu.csv





In [32]:
import json
import csv
import time
from pathlib import Path
import osmnx as ox
from tqdm import tqdm

path_metadata = Path("data/test20/metadata_test20.json")
out_json = Path("data/test20/metadata_test20_coor.json")
out_csv = Path("data/test20/metadata_test20_coor.csv")

data = json.loads(path_metadata.read_text(encoding="utf-8"))

geo_names = data.get("Nom géographique", [])
hamlets = data.get("trans_hamlet_uniformise", [])
edifices = data.get("Édifice", [])
monuments = data.get("trans_monument_uniformise", [])
cities = data.get("City", [])

n = len(geo_names)

coords = []
used_source = []

with tqdm(total=n, desc="Geocode") as pbar:
    for i in range(n):
        city = cities[i] if i < len(cities) else ""
        candidates = [
            ("Nom géographique", geo_names[i] if i < len(geo_names) else ""),
            ("trans_hamlet_uniformise", hamlets[i] if i < len(hamlets) else ""),
            ("Édifice", edifices[i] if i < len(edifices) else ""),
            ("trans_monument_uniformise", monuments[i] if i < len(monuments) else ""),
        ]

        result = ""
        source = ""
        for label, value in candidates:
            if not value:
                continue
            if city:
                query = f"{value}, {city}, Vaucluse, France"
            else:
                query = f"{value}, Vaucluse, France"
            try:
                lat, lon = ox.geocode(query)
                result = f"{lat}, {lon}"
                source = label
                break
            except Exception:
                continue

        coords.append(result)
        used_source.append(source)
        pbar.update(1)
        time.sleep(1)

data["coordonnées"] = coords
data["coord_source"] = used_source  # optionnel: savoir quelle colonne a servi

out_json.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

fields = list(data.keys())
with out_csv.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    for i in range(n):
        row = []
        for k in fields:
            v = data.get(k, "")
            if isinstance(v, list):
                row.append(v[i] if i < len(v) else "")
            else:
                row.append(v)
        writer.writerow(row)

print("ok ->", out_json, "and", out_csv)


Geocode: 100%|██████████| 20/20 [00:23<00:00,  1.20s/it]

ok -> data/test20/metadata_test20_coor.json and data/test20/metadata_test20_coor.csv





In [33]:
import json
import csv
from pathlib import Path

json_path = Path("data/test20/metadata_test20_coor.json")
csv_path = Path("data/test20/metadata_test20_coor.csv")

# JSON
data = json.loads(json_path.read_text(encoding="utf-8"))
data.pop("latitude", None)
data.pop("longitude", None)
json_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

# CSV
with csv_path.open("r", encoding="utf-8") as f_in:
    reader = csv.DictReader(f_in)
    fieldnames = [fn for fn in reader.fieldnames if fn not in ("latitude", "longitude")]
    rows = [{k: v for k, v in row.items() if k in fieldnames} for row in reader]

with csv_path.open("w", newline="", encoding="utf-8") as f_out:
    writer = csv.DictWriter(f_out, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print("ok ->", json_path, "and", csv_path)


ok -> data/test20/metadata_test20_coor.json and data/test20/metadata_test20_coor.csv


In [34]:
import json
import csv
import time
from pathlib import Path
import osmnx as ox
from tqdm import tqdm

path_metadata = Path("data/test20/metadata_test20.json")
out_json = Path("data/test20/metadata_test20_coor.json")
out_csv = Path("data/test20/metadata_test20_coor.csv")

data = json.loads(path_metadata.read_text(encoding="utf-8"))

geo_names = data.get("Nom géographique", [])
hamlets = data.get("trans_hamlet_uniformise", [])
edifices = data.get("Édifice", [])
monuments = data.get("trans_monument_uniformise", [])
cities = data.get("City", [])

n = len(geo_names)

coords = []
used_source = []

with tqdm(total=n, desc="Geocode") as pbar:
    for i in range(n):
        city = cities[i] if i < len(cities) else ""
        # ordre de précision décroissante
        candidates = [
            ("trans_monument_uniformise", monuments[i] if i < len(monuments) else ""),
            ("Édifice", edifices[i] if i < len(edifices) else ""),
            ("trans_hamlet_uniformise", hamlets[i] if i < len(hamlets) else ""),
            ("Nom géographique", geo_names[i] if i < len(geo_names) else ""),
        ]

        result = ""
        source = ""
        for label, value in candidates:
            if not value:
                continue
            if city:
                query = f"{value}, {city}, Vaucluse, France"
            else:
                query = f"{value}, Vaucluse, France"
            try:
                lat, lon = ox.geocode(query)
                result = f"{lat}, {lon}"
                source = label
                break
            except Exception:
                continue

        coords.append(result)
        used_source.append(source)
        pbar.update(1)
        time.sleep(1)

data["coordonnées"] = coords
data["coord_source"] = used_source

# Supprime latitude/longitude si présents
data.pop("latitude", None)
data.pop("longitude", None)

out_json.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

fields = [k for k in data.keys()]
with out_csv.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    for i in range(n):
        row = []
        for k in fields:
            v = data.get(k, "")
            if isinstance(v, list):
                row.append(v[i] if i < len(v) else "")
            else:
                row.append(v)
        writer.writerow(row)

print("ok ->", out_json, "and", out_csv)


Geocode: 100%|██████████| 20/20 [00:20<00:00,  1.01s/it]

ok -> data/test20/metadata_test20_coor.json and data/test20/metadata_test20_coor.csv





In [35]:
import json
import csv
import time
from pathlib import Path
import osmnx as ox
from tqdm import tqdm

path_metadata = Path("data/test20/metadata_test20_enrichi.json")
out_json = Path("data/test20/metadata_test20_coor.json")
out_csv = Path("data/test20/metadata_test20_coor.csv")

data = json.loads(path_metadata.read_text(encoding="utf-8"))

geo_names = data.get("Nom géographique", [])
hamlets = data.get("trans_hamlet_uniformise", [])
edifices = data.get("Édifice", [])
monuments = data.get("trans_monument_uniformise", [])
cities = data.get("City", [])

n = len(geo_names)

coords = []
used_source = []

with tqdm(total=n, desc="Geocode") as pbar:
    for i in range(n):
        city = cities[i] if i < len(cities) else ""
        candidates = [
            ("trans_monument_uniformise", monuments[i] if i < len(monuments) else ""),
            ("Édifice", edifices[i] if i < len(edifices) else ""),
            ("trans_hamlet_uniformise", hamlets[i] if i < len(hamlets) else ""),
            ("Nom géographique", geo_names[i] if i < len(geo_names) else ""),
        ]

        result = ""
        source = ""
        for label, value in candidates:
            if not value:
                continue
            if city:
                query = f"{value}, {city}, Vaucluse, France"
            else:
                query = f"{value}, Vaucluse, France"
            try:
                lat, lon = ox.geocode(query)
                result = f"{lat}, {lon}"
                source = label
                break
            except Exception:
                continue

        coords.append(result)
        used_source.append(source)
        pbar.update(1)
        time.sleep(1)

data["coordonnées"] = coords
data["coord_source"] = used_source

# Supprime latitude/longitude si présents
data.pop("latitude", None)
data.pop("longitude", None)

out_json.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

fields = list(data.keys())
with out_csv.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    for i in range(n):
        row = []
        for k in fields:
            v = data.get(k, "")
            if isinstance(v, list):
                row.append(v[i] if i < len(v) else "")
            else:
                row.append(v)
        writer.writerow(row)

print("ok ->", out_json, "and", out_csv)


Geocode: 100%|██████████| 20/20 [00:49<00:00,  2.46s/it]

ok -> data/test20/metadata_test20_coor.json and data/test20/metadata_test20_coor.csv



