# Importação de bibliotecas

In [3]:
import psycopg2
import polars as pl

# Leitura de dados

In [21]:
ITENS_PROVA_SCHEMA = {
    # -- Colunas Numéricas --
    "CO_POSICAO": pl.Int32,  # Tamanho 3 (mas podemos guardar em Int32)
    "CO_ITEM": pl.Int32,  # Tamanho 5
    "CO_HABILIDADE": pl.Int32,  # Tamanho 2
    "IN_ITEM_ABAN": pl.Int8,  # Tamanho 1 (0 ou 1)
    "NU_PARAM_A": pl.Float64,  # Tamanho 9 (pode ser decimal, então Float64)
    "NU_PARAM_B": pl.Float64,  # idem
    "NU_PARAM_C": pl.Float64,  # idem
    "CO_PROVA": pl.Int32,  # Tamanho 3
    "TP_LINGUA": pl.Int8,  # Tamanho 1 (0 ou 1)
    "IN_ITEM_ADAPTADO": pl.Int8,  # Tamanho 1 (0 ou 1)
    # -- Colunas Alfanuméricas --
    "SG_AREA": pl.Utf8,  # 2 caracteres (ex.: CH, CN, LC, MT)
    "TX_GABARITO": pl.Utf8,  # 1 caractere
    "TX_MOTIVO_ABAN": pl.Utf8,  # até 40 caracteres
    "TX_COR": pl.Utf8,  # até 7 caracteres (ex.: "AMARELA")
}

MICRODADOS_ENEM_SCHEMA = {
    # --- Numéricas (geralmente inteiros) ---
    "NU_INSCRICAO": pl.Int64,
    "NU_ANO": pl.Int16,
    "TP_FAIXA_ETARIA": pl.Int16,
    "TP_ESTADO_CIVIL": pl.Int8,
    "TP_COR_RACA": pl.Int8,
    "TP_NACIONALIDADE": pl.Int8,
    "TP_ST_CONCLUSAO": pl.Int8,
    "TP_ANO_CONCLUIU": pl.Int16,
    "TP_ESCOLA": pl.Int8,
    "TP_ENSINO": pl.Int8,
    "IN_TREINEIRO": pl.Int8,
    "CO_MUNICIPIO_ESC": pl.Int32,
    "CO_UF_ESC": pl.Int16,
    "TP_DEPENDENCIA_ADM_ESC": pl.Int8,
    "TP_LOCALIZACAO_ESC": pl.Int8,
    "TP_SIT_FUNC_ESC": pl.Int8,
    "CO_MUNICIPIO_PROVA": pl.Int32,
    # CO_UF_PROVA consta como Alfanumérica abaixo,
    # mas se for numérica, você pode usar pl.Int16 ou pl.Int8
    "TP_PRESENCA_CN": pl.Int8,
    "TP_PRESENCA_CH": pl.Int8,
    "TP_PRESENCA_LC": pl.Int8,
    "TP_PRESENCA_MT": pl.Int8,
    "CO_PROVA_CN": pl.Int32,
    "CO_PROVA_CH": pl.Int32,
    "CO_PROVA_LC": pl.Int32,
    "CO_PROVA_MT": pl.Int32,
    "TP_LINGUA": pl.Int8,
    "TP_STATUS_REDACAO": pl.Int8,
    # Notas (geralmente decimais)
    "NU_NOTA_CN": pl.Float64,
    "NU_NOTA_CH": pl.Float64,
    "NU_NOTA_LC": pl.Float64,
    "NU_NOTA_MT": pl.Float64,
    "NU_NOTA_COMP1": pl.Float64,
    "NU_NOTA_COMP2": pl.Float64,
    "NU_NOTA_COMP3": pl.Float64,
    "NU_NOTA_COMP4": pl.Float64,
    "NU_NOTA_COMP5": pl.Float64,
    "NU_NOTA_REDACAO": pl.Float64,
    # --- Alfanuméricas ---
    "TP_SEXO": pl.Utf8,
    "NO_MUNICIPIO_ESC": pl.Utf8,
    "SG_UF_ESC": pl.Utf8,
    "NO_MUNICIPIO_PROVA": pl.Utf8,
    "CO_UF_PROVA": pl.Utf8,  # segundo o dicionário está Alfanumérica
    "SG_UF_PROVA": pl.Utf8,
    "TX_RESPOSTAS_CN": pl.Utf8,
    "TX_RESPOSTAS_CH": pl.Utf8,
    "TX_RESPOSTAS_LC": pl.Utf8,
    "TX_RESPOSTAS_MT": pl.Utf8,
    "TX_GABARITO_CN": pl.Utf8,
    "TX_GABARITO_CH": pl.Utf8,
    "TX_GABARITO_LC": pl.Utf8,
    "TX_GABARITO_MT": pl.Utf8,
    # Questionário socioeconômico
    "Q001": pl.Utf8,
    "Q002": pl.Utf8,
    "Q003": pl.Utf8,
    "Q004": pl.Int8,
    "Q005": pl.Int8,
    "Q006": pl.Utf8,
    "Q007": pl.Utf8,
    "Q008": pl.Utf8,
    "Q009": pl.Utf8,
    "Q010": pl.Utf8,
    "Q011": pl.Utf8,
    "Q012": pl.Utf8,
    "Q013": pl.Utf8,
    "Q014": pl.Utf8,
    "Q015": pl.Utf8,
    "Q016": pl.Utf8,
    "Q017": pl.Utf8,
    "Q018": pl.Utf8,
    "Q019": pl.Utf8,
    "Q020": pl.Utf8,
    "Q021": pl.Utf8,
    "Q022": pl.Utf8,
    "Q023": pl.Utf8,
    "Q024": pl.Utf8,
    "Q025": pl.Utf8,
}

In [22]:
itens_prova = pl.read_csv(
    "data/DADOS/ITENS_PROVA_2023.csv",
    has_header=True,
    separator=";",
    null_values=[""],
    ignore_errors=True,
    schema_overrides=ITENS_PROVA_SCHEMA
)

micro_enem = pl.read_csv(
    'data/DADOS/MICRODADOS_ENEM_2023.csv',
    has_header=True,
    separator=";",
    null_values=[""],
    ignore_errors=True,
    schema_overrides=MICRODADOS_ENEM_SCHEMA
)

## Espiadas nos dados

### Itens de prova

In [23]:
itens_prova.head()

CO_POSICAO,SG_AREA,CO_ITEM,TX_GABARITO,CO_HABILIDADE,IN_ITEM_ABAN,TX_MOTIVO_ABAN,NU_PARAM_A,NU_PARAM_B,NU_PARAM_C,TX_COR,CO_PROVA,TP_LINGUA,IN_ITEM_ADAPTADO
i32,str,i32,str,i32,i8,str,f64,f64,f64,str,i32,i8,i8
21,"""LC""",141283,"""B""",14,0,,2.20125,0.82582,0.2119,"""AMARELA""",1286,,0
20,"""LC""",118144,"""C""",23,0,,2.43295,0.90091,0.13239,"""AMARELA""",1286,,0
11,"""LC""",140926,"""D""",26,0,,1.95105,-0.13372,0.19557,"""AMARELA""",1282,,0
12,"""LC""",36864,"""A""",12,0,,1.14733,0.38901,0.23027,"""ROSA""",1243,,0
5,"""LC""",140614,"""C""",8,0,,2.25894,1.00728,0.17726,"""AMARELA""",1286,0.0,0


In [24]:
itens_prova.describe()

statistic,CO_POSICAO,SG_AREA,CO_ITEM,TX_GABARITO,CO_HABILIDADE,IN_ITEM_ABAN,TX_MOTIVO_ABAN,NU_PARAM_A,NU_PARAM_B,NU_PARAM_C,TX_COR,CO_PROVA,TP_LINGUA,IN_ITEM_ADAPTADO
str,f64,str,f64,str,f64,f64,str,f64,f64,f64,str,f64,f64,f64
"""count""",5550.0,"""5550""",5550.0,"""5550""",5550.0,5550.0,"""0""",5526.0,5526.0,5526.0,"""5550""",5550.0,300.0,5550.0
"""null_count""",0.0,"""0""",0.0,"""0""",0.0,0.0,"""5550""",24.0,24.0,24.0,"""0""",0.0,5250.0,0.0
"""mean""",88.135135,,102106.838378,,14.978378,0.004324,,2.199091,1.096806,0.180877,,1253.072072,0.5,0.1
"""std""",53.186867,,45758.967585,,8.666,0.065623,,0.765569,0.889084,0.061941,,37.320439,0.500835,0.300027
"""min""",1.0,"""CH""",6954.0,"""A""",1.0,0.0,,0.71667,-1.14947,0.00556,"""AMARELA""",1191.0,0.0,0.0
"""25%""",42.0,,66330.0,,8.0,0.0,,1.63551,0.38977,0.14759,,1220.0,0.0,0.0
"""50%""",88.0,,125687.0,,15.0,0.0,,2.11251,0.99995,0.185,,1252.0,1.0,0.0
"""75%""",134.0,,140878.0,,22.0,0.0,,2.69465,1.75696,0.21091,,1285.0,1.0,0.0
"""max""",180.0,"""MT""",141814.0,"""X""",30.0,1.0,,5.9367,3.22024,0.49995,"""VERDE""",1318.0,1.0,1.0


In [27]:
itens_prova.shape

(5550, 14)

### Microdados Enem

In [28]:
micro_enem.head()

NU_INSCRICAO,NU_ANO,TP_FAIXA_ETARIA,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ENSINO,IN_TREINEIRO,CO_MUNICIPIO_ESC,NO_MUNICIPIO_ESC,CO_UF_ESC,SG_UF_ESC,TP_DEPENDENCIA_ADM_ESC,TP_LOCALIZACAO_ESC,TP_SIT_FUNC_ESC,CO_MUNICIPIO_PROVA,NO_MUNICIPIO_PROVA,CO_UF_PROVA,SG_UF_PROVA,TP_PRESENCA_CN,TP_PRESENCA_CH,TP_PRESENCA_LC,TP_PRESENCA_MT,CO_PROVA_CN,CO_PROVA_CH,CO_PROVA_LC,CO_PROVA_MT,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,TX_RESPOSTAS_CN,TX_RESPOSTAS_CH,…,TP_LINGUA,TX_GABARITO_CN,TX_GABARITO_CH,TX_GABARITO_LC,TX_GABARITO_MT,TP_STATUS_REDACAO,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO,Q001,Q002,Q003,Q004,Q005,Q006,Q007,Q008,Q009,Q010,Q011,Q012,Q013,Q014,Q015,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025
i64,i16,i16,str,i8,i8,i8,i8,i16,i8,i8,i8,i32,str,i16,str,i8,i8,i8,i32,str,str,str,i8,i8,i8,i8,i32,i32,i32,i32,f64,f64,f64,f64,str,str,…,i8,str,str,str,str,i8,f64,f64,f64,f64,f64,f64,str,str,str,i8,i8,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
210059085136,2023,14,"""M""",2,1,1,1,17,1,,0,,,,,,,,5300108,,"""53""","""DF""",0,0,0,0,,,,,,,,,,,…,0,,,,,,,,,,,,"""A""","""F""","""E""",,5,"""F""","""C""","""C""","""D""","""C""","""D""","""C""","""B""","""B""","""D""","""C""","""C""","""B""","""B""","""A""","""B""","""B""","""A""","""A""","""B"""
210059527735,2023,12,"""M""",2,1,0,1,16,1,,0,,,,,,,,5300108,,"""53""","""DF""",0,0,0,0,,,,,,,,,,,…,0,,,,,,,,,,,,"""F""","""E""","""E""",,3,"""H""","""A""","""B""","""C""","""C""","""A""","""B""","""B""","""B""","""A""","""B""","""A""","""B""","""B""","""A""","""A""","""C""","""A""","""D""","""B"""
210061103945,2023,6,"""F""",1,1,1,1,0,1,,0,,,,,,,,4305108,"""Caxias do Sul""","""43""","""RS""",1,1,1,1,1221.0,1193.0,1204.0,1211.0,502.0,498.9,475.6,363.2,"""DBEBDCECCBCEBBBBDBABDDBBAABCBA…","""ABDEADAADCDABDCADAEABCDDCBAADC…",…,1,"""DBEABDABDCACDBECDDDBCAAABBACCC…","""ACEEABAADCDAADEABCDABCDCABCBDA…","""DBABBAEBAAAACDACDEDAACADBADBCC…","""BCCDEEABCBEDCEABBEBDABDDADDADE…",1.0,140.0,200.0,100.0,120.0,140.0,700.0,"""H""","""E""","""C""",,5,"""C""","""A""","""B""","""D""","""B""","""A""","""B""","""A""","""B""","""A""","""B""","""A""","""A""","""B""","""A""","""A""","""A""","""A""","""A""","""B"""
210060214087,2023,2,"""F""",1,3,1,2,0,2,1.0,0,2304400.0,"""Fortaleza""",23.0,"""CE""",2.0,1.0,1.0,2304400,"""Fortaleza""","""23""","""CE""",1,1,1,1,1224.0,1192.0,1202.0,1214.0,459.0,508.5,507.2,466.7,"""DEEBEACCCEBDDBDCCCAEEDCBAAADBC…","""DDAAEEBCCDEADBCDDCBAECABEBDEBD…",…,0,"""CDDDABBABDBEABDECCEEEDCEDAEBAB…","""DBAADEADCDCABABCDDEBAEABAECABA…","""BBBDAABAEACCEEEDEACBCACAACAACA…","""EBDADDAEBEACBEDCECCBEABCADEBCC…",1.0,140.0,200.0,160.0,180.0,200.0,880.0,"""D""","""D""","""B""",,5,"""C""","""A""","""B""","""B""","""A""","""A""","""B""","""A""","""A""","""A""","""A""","""A""","""A""","""B""","""A""","""A""","""D""","""A""","""A""","""B"""
210059980948,2023,3,"""F""",1,3,1,2,0,2,1.0,0,2311306.0,,23.0,"""CE""",2.0,1.0,1.0,2311306,,"""23""","""CE""",1,1,1,1,1222.0,1191.0,1201.0,1212.0,402.5,379.2,446.9,338.3,"""AECCEAACDEABEEECDBAEEAAADDEABC…","""CADEBCEDDEBCBAEBADDCECACADBDEB…",…,0,"""CAAADCCCCDDDABDCACDBEEEDCEDAEE…","""CDAEECABAACEAADECBDAABCDCABADC…","""BBDABAAEBADACEEDCCDBADBDEDCCEB…","""DCECACCBDECBEEABEABDDAADDABBBC…",1.0,120.0,120.0,120.0,120.0,80.0,560.0,"""B""","""B""","""A""",,4,"""B""","""A""","""B""","""A""","""A""","""A""","""B""","""A""","""A""","""A""","""A""","""A""","""A""","""B""","""A""","""A""","""B""","""A""","""A""","""A"""


In [29]:
micro_enem.describe()

statistic,NU_INSCRICAO,NU_ANO,TP_FAIXA_ETARIA,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ENSINO,IN_TREINEIRO,CO_MUNICIPIO_ESC,NO_MUNICIPIO_ESC,CO_UF_ESC,SG_UF_ESC,TP_DEPENDENCIA_ADM_ESC,TP_LOCALIZACAO_ESC,TP_SIT_FUNC_ESC,CO_MUNICIPIO_PROVA,NO_MUNICIPIO_PROVA,CO_UF_PROVA,SG_UF_PROVA,TP_PRESENCA_CN,TP_PRESENCA_CH,TP_PRESENCA_LC,TP_PRESENCA_MT,CO_PROVA_CN,CO_PROVA_CH,CO_PROVA_LC,CO_PROVA_MT,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,TX_RESPOSTAS_CN,…,TP_LINGUA,TX_GABARITO_CN,TX_GABARITO_CH,TX_GABARITO_LC,TX_GABARITO_MT,TP_STATUS_REDACAO,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO,Q001,Q002,Q003,Q004,Q005,Q006,Q007,Q008,Q009,Q010,Q011,Q012,Q013,Q014,Q015,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025
str,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,str,f64,f64,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,…,f64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,str,str,str,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""count""",3933955.0,3933955.0,3933955.0,"""3933955""",3933955.0,3933955.0,3933955.0,3933955.0,3933955.0,3933955.0,1339081.0,3933955.0,958506.0,"""564552""",958506.0,"""958506""",958506.0,958506.0,958506.0,3933955.0,"""2343157""","""3933955""","""3933955""",3933955.0,3933955.0,3933955.0,3933955.0,2692427.0,2822643.0,2822643.0,2692427.0,2692427.0,2822643.0,2822643.0,2692427.0,"""2692427""",…,3933955.0,"""2692427""","""2822643""","""2822643""","""2692427""",2822643.0,2822643.0,2822643.0,2822643.0,2822643.0,2822643.0,2822643.0,"""3933955""","""3933955""","""3933955""",0.0,3933955.0,"""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955""","""3933955"""
"""null_count""",0.0,0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,2594874.0,0.0,2975449.0,"""3369403""",2975449.0,"""2975449""",2975449.0,2975449.0,2975449.0,0.0,"""1590798""","""0""","""0""",0.0,0.0,0.0,0.0,1241528.0,1111312.0,1111312.0,1241528.0,1241528.0,1111312.0,1111312.0,1241528.0,"""1241528""",…,0.0,"""1241528""","""1111312""","""1111312""","""1241528""",1111312.0,1111312.0,1111312.0,1111312.0,1111312.0,1111312.0,1111312.0,"""0""","""0""","""0""",3933955.0,0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0"""
"""mean""",210060000000.0,2023.0,5.096262,,1.043938,2.0582,1.028277,1.684697,2.445029,1.41581,1.005142,0.157619,3191200.0,,31.764971,,2.437952,1.037193,1.016525,3081900.0,,,,0.685532,0.719845,0.719845,0.685532,1222.664775,1192.657075,1202.655082,1212.664779,495.751547,523.354206,518.146637,533.8357,,…,0.457009,,,,,1.152402,121.545183,139.295228,118.509046,129.797016,108.651997,617.798482,,,,,3.688928,,,,,,,,,,,,,,,,,,,,
"""std""",1154500.0,0.0,3.871855,,0.412003,1.003314,0.212776,0.746792,4.450337,0.601823,0.071525,0.364384,1020300.0,,10.176427,,0.909037,0.189235,0.20397,999346.029034,,,,0.465513,0.45167,0.45167,0.465513,3.691471,3.591161,3.590874,3.691568,87.928725,88.571061,75.452427,131.647774,,…,0.498148,,,,,0.800145,35.654437,51.615625,43.408285,43.773256,61.602483,214.621071,,,,,1.409879,,,,,,,,,,,,,,,,,,,,
"""min""",210060000000.0,2023.0,1.0,"""F""",0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1100015.0,"""Abadia dos Dourados""",11.0,"""AC""",1.0,1.0,1.0,1100015.0,"""Abaetetuba""","""11""","""AC""",0.0,0.0,0.0,0.0,1221.0,1191.0,1201.0,1211.0,0.0,0.0,0.0,0.0,"""******************************…",…,0.0,"""ACCDAEACBACCCEEADCDEDDCCBEBEBB…","""ABCDCBDACDAEACEECABADBEABADEBA…","""AEDECECDEECDDEEBAEBECDEDCBDBBD…","""BADDDEADDACBEDCECCCEBDDBEDBAAC…",1.0,0.0,0.0,0.0,0.0,0.0,0.0,"""A""","""A""","""A""",,1.0,"""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A""","""A"""
"""25%""",210060000000.0,2023.0,2.0,,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,2401008.0,,24.0,,2.0,1.0,1.0,2403251.0,,,,0.0,0.0,0.0,0.0,1222.0,1192.0,1202.0,1212.0,440.5,467.8,471.4,431.2,,…,0.0,,,,,1.0,120.0,120.0,100.0,120.0,60.0,500.0,,,,,3.0,,,,,,,,,,,,,,,,,,,,
"""50%""",210060000000.0,2023.0,3.0,,1.0,2.0,1.0,2.0,0.0,1.0,1.0,0.0,3205002.0,,32.0,,2.0,1.0,1.0,3107307.0,,,,1.0,1.0,1.0,1.0,1223.0,1193.0,1203.0,1213.0,493.9,530.4,523.1,523.6,,…,0.0,,,,,1.0,120.0,140.0,120.0,120.0,120.0,620.0,,,,,4.0,,,,,,,,,,,,,,,,,,,,
"""75%""",210060000000.0,2023.0,7.0,,1.0,3.0,1.0,2.0,3.0,2.0,1.0,0.0,3550308.0,,35.0,,2.0,1.0,1.0,3548906.0,,,,1.0,1.0,1.0,1.0,1224.0,1194.0,1204.0,1214.0,551.2,584.9,570.3,630.1,,…,1.0,,,,,1.0,140.0,180.0,140.0,160.0,160.0,780.0,,,,,4.0,,,,,,,,,,,,,,,,,,,,
"""max""",210060000000.0,2023.0,20.0,"""M""",4.0,5.0,4.0,4.0,17.0,3.0,2.0,1.0,5300108.0,"""Zacarias""",53.0,"""TO""",4.0,2.0,4.0,5300108.0,"""Xique-Xique""","""53""","""TO""",2.0,2.0,2.0,2.0,1304.0,1274.0,1284.0,1294.0,868.4,823.0,820.8,958.6,"""EEEEEEEEEEEEEEEEEEEEEEEEEEEEEE…",…,1.0,"""DBEABDABDCACDBECDDDBCAAABBACCC…","""DEBCBBDABCECCBACEACABAADCDABAD…","""ECAEDDEEECAADEEDDCDACBABAACBAD…","""EBDEEBDADDABCCBCCABBADDBDDAEAE…",9.0,200.0,200.0,200.0,200.0,200.0,1000.0,"""H""","""H""","""F""",,20.0,"""Q""","""D""","""E""","""E""","""E""","""E""","""E""","""E""","""E""","""E""","""E""","""E""","""B""","""E""","""B""","""B""","""E""","""B""","""E""","""B"""


In [30]:
micro_enem.shape

(3933955, 76)