In [1]:
import hashlib
import os
from urllib.parse import quote

import polars as pl
from unidecode import unidecode

In [2]:
# Occurrence CSV exports served from the dadosabertos/SINARM_CSV/OCORRENCIAS
# path: one file covering data up to 2023 and one for 2024.
URL_2023 = 'https://servicos.dpf.gov.br/dadosabertos/SINARM_CSV/OCORRENCIAS/OCORRENCIAS_ate_2023.csv'
URL_2024 = 'https://servicos.dpf.gov.br/dadosabertos/SINARM_CSV/OCORRENCIAS/OCORRENCIAS_2024.csv'

# PostgreSQL connection URL assembled from the PGUSER, PGPASSWORD, PGHOST and
# PGDATABASE environment variables (a missing variable raises KeyError); the
# port is fixed at 5432.
# The user and password are percent-encoded so characters such as '@', ':' or
# '/' inside a credential cannot be mistaken for URL delimiters. Double quotes
# are used for the subscripts because reusing the enclosing quote character
# inside an f-string only parses on Python 3.12+.
CONN_URL = (
    'postgresql://'
    f'{quote(os.environ["PGUSER"], safe="")}:{quote(os.environ["PGPASSWORD"], safe="")}'
    f'@{os.environ["PGHOST"]}:5432/{os.environ["PGDATABASE"]}'
)

In [25]:
# Read the 2024 occurrences export into `df`: semicolon-delimited text decoded
# as ISO-8859-1.
# A disabled variant of this cell read URL_2023 instead, with separator=','
# and schema_overrides=schema; `schema` is not defined in any visible cell, so
# that call cannot be re-enabled as written.
df = pl.read_csv(
    URL_2024,
    separator=';',
    encoding='ISO-8859-1',
)

In [26]:
# Clean the freshly loaded frame, then lower-case every column name.
#   - MUNICIPIO and TIPO_OCORRENCIA are transliterated to ASCII with unidecode;
#     TIPO_OCORRENCIA and CALIBRE_ARMA also have surrounding whitespace removed.
#   - ANO_OCORRENCIA, MES_OCORRENCIA and TOTAL are cast to Int32.
#   - MAIS_1000_MIL_HAB becomes a 1/0 flag (1 only where the value is 'S').
# The cell rebinds `df` and renames its columns, so it expects the upper-case
# names produced by the load cell and will not run twice in a row.
# NOTE(review): MUNICIPIO is not stripped here (nor are the other text columns
# used as dimensions later); the truncated values in the dimension outputs
# suggest padded strings - confirm whether that is intended.
df = (
    df.with_columns(
        pl.col('MUNICIPIO').map_elements(unidecode, return_dtype=pl.String),
        pl.col('TIPO_OCORRENCIA')
        .map_elements(unidecode, return_dtype=pl.String)
        .str.strip_chars(),
        pl.col('CALIBRE_ARMA').str.strip_chars(),
        pl.col(['ANO_OCORRENCIA', 'MES_OCORRENCIA', 'TOTAL']).cast(pl.Int32),
        pl.when(pl.col('MAIS_1000_MIL_HAB') == 'S')
        .then(1)
        .otherwise(0)
        .alias('MAIS_1000_MIL_HAB'),
    )
    # with_columns only overwrites existing columns, so the names taken from
    # the input frame are exactly the names present at this point.
    .rename({name: name.lower() for name in df.columns})
)

In [5]:
# Append the cleaned frame to the raw.ocorrencias table using the ADBC engine.
# Being the cell's last expression, the call's return value is what the
# notebook displays underneath.
# NOTE(review): with if_table_exists='append', re-running this cell adds the
# same rows again - confirm the target table is emptied between loads.
df.write_database(
    'raw.ocorrencias',
    CONN_URL,
    if_table_exists='append',
    engine='adbc',
)

5026

In [31]:
# UF dimension: one row per distinct `uf` value, sorted by description.
# id_uf is derived from the value itself: the polars hash of the column is
# rendered as decimal text, cut to its first ten characters and cast to Int64.
# NOTE(review): polars documents Expr.hash as stable only within one polars
# version, and truncating to ten digits raises the collision risk - confirm
# before treating these ids as durable keys.
df_uf = (
    df.select(
        pl.concat_str(['uf'])
        .hash()
        .cast(pl.String)
        .str.slice(0, 10)
        .cast(pl.Int64)
        .alias('id_uf'),
        pl.col('uf').alias('desc_uf'),
    )
    .unique()
    .sort(by='desc_uf')
)

# Append the dimension rows to refined.dim_uf, then preview the frame.
df_uf.write_database(
    'refined.dim_uf',
    CONN_URL,
    if_table_exists='append',
    engine='sqlalchemy',
)
df_uf.head()

id_uf,desc_uf
i64,str
1467947522,""" """
1205062640,"""AC"""
1612019898,"""AL"""
1827415075,"""AM"""
4824591479,"""AP"""


In [33]:
# Municipality dimension: one row per distinct `municipio` value, sorted by
# description. id_municipio is the first ten decimal digits of the column's
# polars hash, cast to Int64.
# NOTE(review): the key depends on the exact text, including any padding the
# cleaning step left in place - confirm that is acceptable.
df_region = (
    df.select(
        pl.concat_str(['municipio'])
        .hash()
        .cast(pl.String)
        .str.slice(0, 10)
        .cast(pl.Int64)
        .alias('id_municipio'),
        pl.col('municipio').alias('desc_municipio'),
    )
    .unique()
    .sort(by='desc_municipio')
)

# Append the dimension rows to refined.dim_municipio, then preview the frame.
df_region.write_database(
    'refined.dim_municipio',
    CONN_URL,
    if_table_exists='append',
    engine='sqlalchemy',
)
df_region.head()

id_municipio,desc_municipio
i64,str
8020685699,""" …"
7033997000,"""ABADIANIA …"
1302237676,"""ABAETETUBA …"
1534278933,"""ACAUA …"
8891096194,"""ACOPIARA …"


In [34]:
# Occurrence-type dimension: one row per distinct `tipo_ocorrencia` value,
# sorted by description. id_ocorrencia is the first ten decimal digits of the
# column's polars hash, cast to Int64.
df_occurrence = (
    df.select(
        pl.concat_str(['tipo_ocorrencia'])
        .hash()
        .cast(pl.String)
        .str.slice(0, 10)
        .cast(pl.Int64)
        .alias('id_ocorrencia'),
        pl.col('tipo_ocorrencia').alias('desc_ocorrencia'),
    )
    .unique()
    .sort(by='desc_ocorrencia')
)

# Append the dimension rows to refined.dim_ocorrencia, then preview the frame.
df_occurrence.write_database(
    'refined.dim_ocorrencia',
    CONN_URL,
    if_table_exists='append',
    engine='sqlalchemy',
)
df_occurrence.head()

id_ocorrencia,desc_ocorrencia
i64,str
6808513444,"""Alteracao da Arma de Porte"""
7004520911,"""Apostilada no Exercito"""
3711152557,"""Apreensao de Arma de Fogo"""
1886445689,"""Arrecadacao"""
5182811986,"""Campanha do Desarmamento"""


In [35]:
# Weapon-brand dimension: one row per distinct `marca_arma` value, sorted by
# description. id_marca is the first ten decimal digits of the column's polars
# hash, cast to Int64.
df_weapons_brand = (
    df.select(
        pl.concat_str(['marca_arma'])
        .hash()
        .cast(pl.String)
        .str.slice(0, 10)
        .cast(pl.Int64)
        .alias('id_marca'),
        pl.col('marca_arma').alias('desc_marca'),
    )
    .unique()
    .sort(by='desc_marca')
)

# Append the dimension rows to refined.dim_marca, then preview the frame.
df_weapons_brand.write_database(
    'refined.dim_marca',
    CONN_URL,
    if_table_exists='append',
    engine='sqlalchemy',
)
df_weapons_brand.head()

id_marca,desc_marca
i64,str
2620821601,"""A ESPINGARDA MINEIRA (EDMUNDO …"
9082611769,"""A.N.C. …"
9678061032,"""A.Y.A. (AGUIRRE Y ARANZABAL) …"
1304885033,"""ACIER FIN …"
1482911640,"""ALFA-PROJ (SPOL S.R.O.) …"


In [36]:
# Weapon-kind dimension: one row per distinct `especie_arma` value, sorted by
# description. id_arma is the first ten decimal digits of the column's polars
# hash, cast to Int64.
df_weapons = (
    df.select(
        pl.concat_str(['especie_arma'])
        .hash()
        .cast(pl.String)
        .str.slice(0, 10)
        .cast(pl.Int64)
        .alias('id_arma'),
        pl.col('especie_arma').alias('desc_arma'),
    )
    .unique()
    .sort(by='desc_arma')
)

# Append the dimension rows to refined.dim_arma, then preview the frame.
df_weapons.write_database(
    'refined.dim_arma',
    CONN_URL,
    if_table_exists='append',
    engine='sqlalchemy',
)
df_weapons.head()

id_arma,desc_arma
i64,str
1108337642,"""Carabina …"
5334210402,"""Carabina/Fuzil …"
6829860256,"""Carabina/cartucheira …"
7879237492,"""Carabina/espingarda …"
1747749307,"""Espingarda …"


In [37]:
# Caliber dimension: one row per distinct `calibre_arma` value, sorted by
# description. id_calibre is the first ten decimal digits of the column's
# polars hash, cast to Int64.
df_weapons_caliber = (
    df.select(
        pl.concat_str(['calibre_arma'])
        .hash()
        .cast(pl.String)
        .str.slice(0, 10)
        .cast(pl.Int64)
        .alias('id_calibre'),
        pl.col('calibre_arma').alias('desc_calibre'),
    )
    .unique()
    .sort(by='desc_calibre')
)

# Append the dimension rows to refined.dim_calibre, then preview the frame.
df_weapons_caliber.write_database(
    'refined.dim_calibre',
    CONN_URL,
    if_table_exists='append',
    engine='sqlalchemy',
)
df_weapons_caliber.head()

id_calibre,desc_calibre
i64,str
3709912703,""".17"""
1355815213,""".22"""
9214006505,""".22 LR"""
9380248335,""".22 MAGNUM"""
5759397247,""".28GA"""


In [38]:
# Fact table: attach every dimension key to the cleaned rows by inner-joining
# on the descriptive text, then keep only the keys and measures.
# Columns of the fact row, in output order; the same list feeds the row id.
fact_columns = [
    'ano_ocorrencia',
    'mes_ocorrencia',
    'id_uf',
    'id_municipio',
    'id_ocorrencia',
    'id_marca',
    'id_arma',
    'id_calibre',
    'mais_1000_mil_hab',
    'total',
]

# NOTE(review): inner joins discard any row without a dimension match; under
# polars' default join settings that presumably includes rows whose key is
# null - confirm no occurrences are lost here.
df_final = (
    df.join(df_uf, left_on='uf', right_on='desc_uf', how='inner')
    .join(df_region, left_on='municipio', right_on='desc_municipio', how='inner')
    .join(df_occurrence, left_on='tipo_ocorrencia', right_on='desc_ocorrencia', how='inner')
    .join(df_weapons_brand, left_on='marca_arma', right_on='desc_marca', how='inner')
    .join(df_weapons, left_on='especie_arma', right_on='desc_arma', how='inner')
    .join(df_weapons_caliber, left_on='calibre_arma', right_on='desc_calibre', how='inner')
)

# Row id: all fact columns are concatenated as text with no separator, hashed,
# and reduced to the first ten decimal digits of the hash as an Int64.
# NOTE(review): without a separator, adjacent values run together, so distinct
# rows can produce the same text; rows with identical values always share an
# id - confirm `id` is not expected to be unique.
df_final = df_final.select(
    pl.concat_str(fact_columns)
    .hash()
    .cast(pl.String)
    .str.slice(0, 10)
    .cast(pl.Int64)
    .alias('id'),
    *[pl.col(name) for name in fact_columns],
)

# Append the fact rows to refined.fat_ocorrencias, then preview the frame.
df_final.write_database(
    'refined.fat_ocorrencias',
    CONN_URL,
    if_table_exists='append',
    engine='sqlalchemy',
)
df_final.head()

id,ano_ocorrencia,mes_ocorrencia,id_uf,id_municipio,id_ocorrencia,id_marca,id_arma,id_calibre,mais_1000_mil_hab,total
i64,i32,i32,i64,i64,i64,i64,i64,i64,i32,i32
9909069664,2024,1,1205062640,7934730292,1191636506,5513301844,1747749307,3528128019,0,1
6854729849,2024,1,1205062640,7934730292,1191636506,8018495139,1385204371,2764471369,0,1
8311616951,2024,1,1205062640,4132942482,1191636506,8018495139,1385204371,1482489079,1,1
9967852303,2024,1,1205062640,4132942482,7004520911,8018495139,1385204371,1811035854,1,1
1504489775,2024,1,1205062640,4132942482,1225393204,8018495139,1385204371,1390595470,1,1
