In [1]:
import geopandas as gpd
import pandas as pd

from prompt2map.providers.openai import OpenAIProvider
from prompt2map.providers.openai import generate_openai_embedding_request
import topojson as tp

import logging

logging.basicConfig(level=logging.INFO)

# Carta Administrativa Oficial de Portugal

Fonte: https://snig.dgterritorio.gov.pt/rndg/srv/por/catalog.search#/metadata/198497815bf647ecaa990c34c42e932e

In [None]:
!wget https://geo2.dgterritorio.gov.pt/caop/CAOP_Continente_2023-gpkg.zip -P raw

In [2]:
# Read the freguesia layer of the CAOP 2023 GeoPackage straight from the archive.
caop2023 = gpd.read_file(
    "raw/CAOP_Continente_2023-gpkg.zip",
    layer="Cont_Freg_CAOP2023",
)

# De-duplicated first four characters of every DICOFRE code (named as municipality
# codes; not referenced again in the visible part of this notebook).
codigos_municipios = caop2023["DICOFRE"].str.slice(stop=4).drop_duplicates()

# Lookup table of official names (freguesia, municipality, district), indexed by
# the DICOFRE code renamed to DTMNFR21 so it can be joined onto the census data.
codigos_df = (
    caop2023[["DICOFRE", "Freguesia", "Municipio", "Distrito"]]
    .rename(columns={"DICOFRE": "DTMNFR21"})
    .drop_duplicates()
    .set_index("DTMNFR21")
)

# Dados do Censo 2021

Fonte: https://mapas.ine.pt/download/filesGPG/2021Seccoes/

In [None]:
!wget https://mapas.ine.pt/download/filesGPG/2021Seccoes/C2021_SECCOES_CONT.zip -P raw

In [3]:
# Load the census sections from the GeoPackage inside the zip archive and drop
# these columns right away; the later dissolve sums every remaining non-geometry
# column per DTMNFR21 code.
seccoes_censo = gpd.read_file(
    "raw/C2021_SECCOES_CONT.zip!C2021_SECCOES_CONT.gpkg"
).drop(
    columns=[
        "DT21",
        "DTMN21",
        "DTMNFRSEC21",
        "NUTS1_15",
        "NUTS2_15",
        "NUTS3_15",
        "N_SS",
        "SECNUM21",
    ]
)

In [4]:
# Tally valid vs. invalid geometries among the census sections before aggregating.
seccoes_censo.is_valid.value_counts()

True    9929
Name: count, dtype: int64

## Agrupar por freguesia

In [6]:
# Merge the section geometries that share a DTMNFR21 code into a single geometry
# per code, summing the remaining columns of the merged rows.
grouped_freguesia = seccoes_censo.dissolve(
    by="DTMNFR21",
    aggfunc="sum",
)

In [7]:
# Tally valid vs. invalid geometries after dissolving the sections.
grouped_freguesia.is_valid.value_counts()

True    2882
Name: count, dtype: int64

## Adicionar nomes oficiais

In [9]:
# Attach the official names from the CAOP lookup table. Both frames are indexed by
# DTMNFR21, and the left join keeps every dissolved row even without a match.
grouped_freguesia_names = grouped_freguesia.join(codigos_df, how="left")

## Simplificar geografia

In [10]:
# Build a topology from the named layer and simplify it through that topology
# rather than geometry by geometry.
topo = tp.Topology(grouped_freguesia_names, prequantize=False)
# 100 is the simplification tolerance — presumably in the layer's CRS units
# (TODO confirm the CRS is projected and the unit is metres).
simplified_data = topo.toposimplify(100).to_gdf()

# Check how many geometries the simplification left invalid.
simplified_data.is_valid.value_counts()

True     2860
False      22
Name: count, dtype: int64

In [11]:
# Repair the geometries that the validity check above reported as invalid.
# This overwrites the geometry column of `simplified_data` in place.
simplified_data.geometry = simplified_data.geometry.make_valid()

In [12]:
# Persist the simplified, repaired freguesia layer as a Parquet file.
# NOTE(review): index=False leaves the frame's index out of the file — confirm the
# freguesia code is available as a regular column if downstream code needs it.
simplified_data.to_parquet(
    "../data/censo2021pt/censo2021_freguesia_toposimplify100.parquet",
    index=False,  # was misspelled `Falsse`, which raised NameError
)

# Descrições de variáveis

Fonte: https://mapas.ine.pt/download/index2021Seccoes.phtml

In [35]:
!wget https://mapas.ine.pt/download/C2021_FSINTESE_VARIAVEIS_SECCOES.csv -P raw

--2024-09-24 01:03:20--  https://mapas.ine.pt/download/C2021_FSINTESE_VARIAVEIS_SECCOES.csv
Resolviendo mapas.ine.pt (mapas.ine.pt)... 193.192.10.123
Conectando con mapas.ine.pt (mapas.ine.pt)[193.192.10.123]:443... conectado.
Petición HTTP enviada, esperando respuesta... 200 OK
Longitud: 16048 (16K) [text/plain]
Grabando a: «raw/C2021_FSINTESE_VARIAVEIS_SECCOES.csv»


2024-09-24 01:03:20 (14.9 GB/s) - «raw/C2021_FSINTESE_VARIAVEIS_SECCOES.csv» guardado [16048/16048]



In [33]:
# Variable descriptions published with the census data: semicolon-separated, first
# line skipped, and the two columns renamed to `column` / `description`.
variables = (
    pd.read_csv("raw/C2021_FSINTESE_VARIAVEIS_SECCOES.csv", sep=";", skiprows=1)
    .rename(columns={"Variável ": "column", "Descritivo": "description"})
)

# Descriptions for two columns that are added by hand here.
missing_variables = pd.DataFrame(
    {
        "column": ["Shape_Area", "Shape_Length"],
        "description": ["Área em m2", "Perímetro em m"],
    }
)
# From here on `missing_variables` holds the full table (published rows followed by
# the two manual ones), which is what gets written to disk.
missing_variables = pd.concat([variables, missing_variables], ignore_index=True)
missing_variables.to_csv("../data/censo2021pt/variables.csv", index=False)

# Embeddings

In [None]:
# Collect every distinct name found in the lookup table (all of its columns
# flattened into one series) as a sorted list. Despite its name, `all_embeddings`
# holds the texts to embed, not the vectors.
all_embeddings = (
    codigos_df
    .stack()
    .reset_index(drop=True)
    .drop_duplicates()
    .dropna()
    .sort_values()
    .to_list()
)
len(all_embeddings)

In [5]:
# Provider object used by the following cells to submit the batch embedding job
# and to fetch its status and results.
gpt = OpenAIProvider()

In [15]:
# One embedding request per text, numbered from 1 in list order, submitted as a
# single batch job. (`requests` here is a plain list, not the HTTP library.)
requests = [
    generate_openai_embedding_request(position, text)
    for position, text in enumerate(all_embeddings, start=1)
]
request_id = gpt.send_batch_embedding(requests, "embedding_input.jsonl")

In [None]:
# Look up the submitted batch job. The next cell reads `batch.output_file_id`;
# presumably that is only set once the job has finished, so re-run this cell until
# it is (TODO confirm against OpenAIProvider).
batch = gpt.get_batch(request_id)

In [55]:
# Fetch the results of the finished batch. The following cell iterates `output` as
# records and reads record["response"]["body"]["data"][0]["embedding"] from each.
# NOTE(review): the second argument looks like a local file name for the raw JSONL
# output — confirm against OpenAIProvider.
output = gpt.get_batch_result(batch.output_file_id, "embeddings_output.jsonl")

In [78]:
# Pair each input text with the embedding vector found at the same position in the
# batch output, then store the table.
# NOTE(review): pairing by position assumes `output` lists the results in the order
# the requests were submitted — confirm, or match on the request id instead.
embedding_df = pd.DataFrame(
    [
        (text, result["response"]["body"]["data"][0]["embedding"])
        for text, result in zip(all_embeddings, output)
    ],
    columns=["text", "values"],
)
embedding_df.to_parquet("../data/censo2021pt/embeddings.parquet", index=False)

In [149]:
# Re-save the embeddings table with gzip compression.
# The table is read with pandas (already imported) from the same relative data
# directory the notebook writes to; the previous version relied on `duckdb`, which
# this notebook never imports, and on a machine-specific absolute path.
a = pd.read_parquet('../data/censo2021pt/embeddings.parquet')
a.to_parquet('../data/censo2021pt/embeddings.parquet.gz', index=False, compression="gzip")


In [150]:
# Read the gzip-compressed copy back to check that it loads correctly.
# Uses pandas and the notebook's relative data path; the previous version relied on
# `duckdb`, which this notebook never imports, and on a machine-specific absolute path.
c = pd.read_parquet('../data/censo2021pt/embeddings.parquet.gz')
c

Unnamed: 0,text,values
0,A dos Francos,"[0.008186937, 0.008542891, 0.010678614, 0.0022..."
1,A dos Negros,"[0.045769397, 0.009363075, -0.02193381, -0.006..."
2,Abade de Neiva,"[0.010747739, 0.0008664023, -0.006095992, 0.03..."
3,Abadim,"[0.046897903, 0.02299857, 0.027052013, 0.06404..."
4,Abambres,"[0.0416421, 0.019071035, 0.029681364, 0.014730..."
...,...,...
2863,Évora de Alcobaça,"[0.005184332, -0.02555724, 0.010730423, 0.0119..."
2864,Ílhavo,"[-0.034412332, 0.02194545, 0.009519422, 0.0100..."
2865,Ílhavo (São Salvador),"[-0.0064445063, 0.033996493, 0.0396308, 0.0033..."
2866,Ínsua,"[-0.022281582, 0.047120284, -0.02518498, 0.026..."


In [144]:
# NOTE(review): leftover exploratory cell — confirm whether it can be removed.
#  * The glob expects a directory of partitioned parquet files (the recorded output
#    has an extra `starts_with` column), but the only visible write to this path
#    creates a single file, so this query cannot succeed after that write.
#  * `duckdb` is not imported anywhere in the visible notebook.
#  * The path is absolute and specific to one machine.
c = duckdb.sql("SELECT * FROM '/Users/jm/Code/mapgpt-mvp/data/censo2021pt/embeddings.parquet.gz/*/*.parquet'").df()
c

Unnamed: 0,text,values,starts_with
0,A dos Francos,"[0.008186937, 0.008542891, 0.010678614, 0.0022...",a
1,A dos Negros,"[0.045769397, 0.009363075, -0.02193381, -0.006...",a
2,Abade de Neiva,"[0.010747739, 0.0008664023, -0.006095992, 0.03...",a
3,Abadim,"[0.046897903, 0.02299857, 0.027052013, 0.06404...",a
4,Abambres,"[0.0416421, 0.019071035, 0.029681364, 0.014730...",a
...,...,...,...
250,Águeda,"[0.007848273, 0.011033175, -0.020034265, -0.00...",a
251,Álvaro,"[0.033496447, 0.021426976, -0.048082426, -0.00...",a
252,Árvore,"[0.011485596, 0.004209825, -0.022842469, 0.004...",a
253,Ázere,"[0.020033559, -0.0017797981, -0.07187126, 0.02...",a


In [126]:
# Load the embeddings table and write a gzip-compressed copy alongside it.
x = pd.read_parquet("../data/censo2021pt/embeddings.parquet")
x.to_parquet(
    "../data/censo2021pt/embeddings_gzip.parquet",
    compression="gzip",
    index=False,
)

In [119]:
# Expand each embedding vector (the list stored in "values") into one column per
# dimension, next to the remaining columns of `x`, and save the wide table.
y = pd.DataFrame(x["values"].tolist(), index=x.index)
# Parquet field names are strings; label the dimension columns "0", "1", ... so the
# frame does not mix string and integer column labels when it is written.
y.columns = y.columns.astype(str)
# Bind the result to a new name instead of overwriting `x`: reassigning `x` without
# its "values" column made this cell fail with a KeyError when run a second time.
x_wide = x.drop(columns=["values"]).join(y)
x_wide.to_parquet("test_v1.parquet", index=False)