In [2]:
!pip install pandas



In [1]:

import csv
import re

import pandas as pd


# Path to the input CSV file.
archivo = "/Users/severinodonate/Downloads/planet_osm_line_202505042033.csv"


# Read the file as semicolon-separated, double-quoted CSV.
df = pd.read_csv(
    archivo,
    sep=";",
    quotechar='"',
)

# Preview the first rows.
df.head()


Unnamed: 0,osm_id,access,addr:housename,addr:housenumber,addr:interpolation,admin_level,aerialway,aeroway,amenity,area,...,tunnel,water,waterway,wetland,width,wood,z_order,way_area,tags,way
0,73669102,,,,,,,,,,...,,,,,,,0,,"""geological""=>""palaeontological_site""",LINESTRING (-8489984.980579784 -1562464.423260...
1,-6553940,,,,,,,,,,...,,,,,,,0,,"""name:en""=>""Pachacámac Monumental Archaeologic...",LINESTRING (-8560383.19278652 -1372476.1649533...
2,1213183116,,,,,,,,,,...,,,,,,,0,,"""archaeological_site""=>""geoglyph""",LINESTRING (-7717552.9489016095 -7174193.14824...
3,1213183085,,,,,,,,,,...,,,,,,,0,,"""archaeological_site""=>""geoglyph""",LINESTRING (-7732935.899903656 -7170751.108254...
4,1213183086,,,,,,,,,,...,,,,,,,0,,"""archaeological_site""=>""geoglyph""",LINESTRING (-7732147.635457399 -7167582.551801...


In [2]:
import re

# One hstore pair of the form "key"=>"value". Inside the quotes a backslash
# escapes the following character, so an escaped quote (\") does not end the
# key or the value.
_HSTORE_PAIR = re.compile(
    r'"((?:[^"\\]|\\.)+)"\s*=>\s*"((?:[^"\\]|\\.)*)"',
    re.DOTALL,
)
# A backslash followed by any single character (one escape sequence).
_HSTORE_ESCAPE = re.compile(r'\\(.)', re.DOTALL)


def parse_tags(tag_str) -> dict:
    """
    Convert an hstore-style tag string such as '"key"=>"value", "k2"=>"v2"'
    into a Python dictionary.

    Parameters
    ----------
    tag_str : object
        The raw tag text. Anything that is not a string (e.g. NaN/None) or
        that is empty/whitespace-only yields an empty dict.

    Returns
    -------
    dict
        Mapping of tag key to tag value (both str). Backslash escapes inside
        keys and values (\\" and \\\\) are decoded. Pairs whose value is not
        double-quoted (e.g. an unquoted NULL) or whose key is empty are not
        matched and therefore skipped.
    """
    if not isinstance(tag_str, str) or tag_str.strip() == "":
        return {}

    def _unescape(text):
        # Drop the escaping backslash and keep the character it protected.
        return _HSTORE_ESCAPE.sub(r'\1', text)

    return {
        _unescape(key): _unescape(value)
        for key, value in _HSTORE_PAIR.findall(tag_str)
    }

In [3]:
# Try the parser on the first non-null entry of the 'tags' column.
tags_no_nulos = df['tags'].dropna()
ejemplo = tags_no_nulos.iloc[0]
print("Original:", ejemplo)
ejemplo_parseado = parse_tags(ejemplo)
print("Parseado:", ejemplo_parseado)

Original: "geological"=>"palaeontological_site"
Parseado: {'geological': 'palaeontological_site'}


In [4]:
# Parse every raw tag string into a dict and store it in a new column.
columna_tags = df['tags']
df['tags_dict'] = columna_tags.apply(parse_tags)

In [5]:
# Run the parser over the whole 'tags' column.
tags_crudos = df['tags']
df['tags_dict'] = tags_crudos.apply(parse_tags)

# Show the raw and parsed columns side by side for the first rows.
df.head()[['tags', 'tags_dict']]

Unnamed: 0,tags,tags_dict
0,"""geological""=>""palaeontological_site""",{'geological': 'palaeontological_site'}
1,"""name:en""=>""Pachacámac Monumental Archaeologic...",{'name:en': 'Pachacámac Monumental Archaeologi...
2,"""archaeological_site""=>""geoglyph""",{'archaeological_site': 'geoglyph'}
3,"""archaeological_site""=>""geoglyph""",{'archaeological_site': 'geoglyph'}
4,"""archaeological_site""=>""geoglyph""",{'archaeological_site': 'geoglyph'}


In [6]:
# Turn the per-row tag dictionaries into a table of tag columns.
tags_expandidos = pd.json_normalize(df['tags_dict'])

# Original columns (without the helper dict column) followed by the tag columns.
columnas_originales = df.drop('tags_dict', axis=1)
df_expandido = pd.concat([columnas_originales, tags_expandidos], axis=1)

# Preview the first rows of the combined dataframe.
df_expandido.head()

Unnamed: 0,osm_id,access,addr:housename,addr:housenumber,addr:interpolation,admin_level,aerialway,aeroway,amenity,area,...,addr:state,dog,fax,check_date,ref:CR:SINAC,iucn_level,border_type,protect_id,ref:nrhp,operator:wikipedia
0,73669102,,,,,,,,,,...,,,,,,,,,,
1,-6553940,,,,,,,,,,...,,,,,,,,,,
2,1213183116,,,,,,,,,,...,,,,,,,,,,
3,1213183085,,,,,,,,,,...,,,,,,,,,,
4,1213183086,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Report how many columns the expanded dataframe has.
n_columnas = len(df_expandido.columns)
print("Número de columnas:", n_columnas)

Número de columnas: 388


In [8]:
# Show every column name of the expanded dataframe as a plain list.
list(df_expandido.columns)

['osm_id',
 'access',
 'addr:housename',
 'addr:housenumber',
 'addr:interpolation',
 'admin_level',
 'aerialway',
 'aeroway',
 'amenity',
 'area',
 'barrier',
 'bicycle',
 'brand',
 'bridge',
 'boundary',
 'building',
 'construction',
 'covered',
 'culvert',
 'cutting',
 'denomination',
 'disused',
 'embankment',
 'foot',
 'generator:source',
 'harbour',
 'highway',
 'historic',
 'horse',
 'intermittent',
 'junction',
 'landuse',
 'layer',
 'leisure',
 'lock',
 'man_made',
 'military',
 'motorcar',
 'name',
 'natural',
 'office',
 'oneway',
 'operator',
 'place',
 'population',
 'power',
 'power_source',
 'public_transport',
 'railway',
 'ref',
 'religion',
 'route',
 'service',
 'shop',
 'sport',
 'surface',
 'toll',
 'tourism',
 'tower:type',
 'tracktype',
 'tunnel',
 'water',
 'waterway',
 'wetland',
 'width',
 'wood',
 'z_order',
 'way_area',
 'tags',
 'way',
 'geological',
 'name:en',
 'name:fr',
 'name:it',
 'name:pt',
 'website',
 'alt_name',
 'wikidata',
 'wikipedia',
 'wheelc

In [9]:
# Save the expanded dataframe to a new semicolon-separated CSV file without
# the index column. csv.QUOTE_ALL (numeric value 1) quotes every field.
df_expandido.to_csv(
    "/Users/severinodonate/Downloads/planet_osm_lines_expandido.csv",
    index=False,
    sep=';',
    quoting=csv.QUOTE_ALL,
)

In [10]:
# Total number of rows in the expanded dataframe.
total_filas = df_expandido.shape[0]

# Count the non-null values of each column and label the two result columns.
conteo_no_nulos = df_expandido.notna().sum()
resumen = conteo_no_nulos.reset_index()
resumen = resumen.rename(columns={"index": "variable", 0: "valores"})

# Coverage as a percentage of all rows, rounded to two decimals.
fraccion = resumen["valores"] / total_filas
resumen["porcentaje"] = (fraccion * 100).round(2)

# Highest coverage first, with a fresh 0..n-1 index.
resumen = resumen.sort_values(by="porcentaje", ascending=False)
resumen = resumen.reset_index(drop=True)

resumen

Unnamed: 0,variable,valores,porcentaje
0,osm_id,6949,100.00
1,way,6949,100.00
2,z_order,6949,100.00
3,barrier,2937,42.27
4,historic,2325,33.46
...,...,...,...
383,way_area,0,0.00
384,building,0,0.00
385,wetland,0,0.00
386,water,0,0.00


In [11]:
# Write the coverage summary to disk without the index column.
resumen.to_csv(
    "/Users/severinodonate/Downloads/resumen_variables_lines.csv",
    index=False,
)