In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import chardet
import sys, os
import missingno as msno

In [55]:
%reload_ext watermark
%watermark -a "Leonardo da Silva Neves"

Author: Leonardo da Silva Neves



In [58]:
# Paths to the 2016 higher-education census inputs: the census CSV and the
# institution (IES) registry CSV. Both files are read in later cells.
censo_16 = 'censo_2016.csv'
file_ies = 'censo_ies/MICRODADOS_CADASTRO_IES_2016.CSV'

In [59]:
pd.set_option('display.max_columns', 200)

In [60]:
pd.set_option('display.max_rows', 500)

In [61]:
pd.set_option('display.max_colwidth', 100)

In [62]:
# Read the first 1,000,000 bytes of the census file in binary ('rb') mode.
# This raw sample is the input for the encoding detection that follows.
with open(censo_16, mode='rb') as raw_file:
    raw_data_16 = raw_file.read(1_000_000)

In [63]:
# Detect the dataset encoding with chardet's detect() on the raw byte sample.
result_23 = chardet.detect(raw_data_16)
cod_23 = result_23['encoding']
conf_23 = result_23['confidence']
# chardet reports confidence as a fraction (e.g. 0.73). The '%' format type
# multiplies by 100, so it prints "73%" instead of the misleading "0.73%".
print(f'codificação detectada: {cod_23} (confiança: {conf_23:.0%})')

codificação detectada: ISO-8859-1 (confiança: 0.73%)


In [64]:
# Load the census CSV: ';' separates fields, ',' is the decimal mark, and the
# encoding is the one detected above. low_memory=False is kept from the
# original call.
df_16 = pd.read_csv(
    censo_16,
    sep=';',
    decimal=',',
    encoding=cod_23,
    low_memory=False,
)
df_16.shape

(92866, 194)

In [65]:
# Columns to load from the institution (IES) registry: the CO_IES key followed
# by the region, state (UF), municipality and capital-flag fields.
cols = [
    'CO_IES',
    'NO_REGIAO_IES', 'CO_REGIAO_IES',
    'NO_UF_IES', 'SG_UF_IES', 'CO_UF_IES',
    'NO_MUNICIPIO_IES', 'CO_MUNICIPIO_IES',
    'IN_CAPITAL_IES',
]

In [66]:
# Load only the selected columns from the IES registry file.
# decimal=',' mirrors the census reader; the previous decimal=';' was the same
# character as the field separator, so it could never act as a decimal mark.
ies_16 = pd.read_csv(file_ies, sep=';', encoding=cod_23, low_memory=False, decimal=',', usecols=cols)
ies_16.shape

(2407, 9)

In [67]:
(df_16.isna().mean()*100).round(2)

NU_ANO_CENSO                     0.00
CO_IES                           0.00
NO_CURSO                         0.00
CO_CURSO                         0.00
TP_GRAU_ACADEMICO                0.36
IN_GRATUITO                      0.00
TP_MODALIDADE_ENSINO             0.00
TP_NIVEL_ACADEMICO               0.00
NO_CINE_AREA_GERAL               0.00
CO_CINE_AREA_GERAL               0.00
CO_CINE_AREA_ESPECIFICA          0.00
NO_CINE_AREA_ESPECIFICA          0.00
CO_CINE_AREA_DETALHADA           0.00
NO_CINE_AREA_DETALHADA           0.00
NO_REGIAO                        1.79
CO_REGIAO                        1.79
NO_UF                            1.79
SG_UF                            1.79
CO_UF                            1.79
NO_MUNICIPIO                     1.79
CO_MUNICIPIO                     1.79
IN_CAPITAL                       1.79
TP_DIMENSAO                      0.00
TP_ORGANIZACAO_ACADEMICA         0.00
TP_REDE                          0.00
TP_CATEGORIA_ADMINISTRATIVA      0.00
QT_CURSO    

In [68]:
# Collect the count columns, identified by the 'QT_' name prefix.
is_count_column = df_16.columns.str.startswith('QT_')
colunas_quantidade = df_16.columns[is_count_column].tolist()
len(colunas_quantidade)

168

In [69]:
# Replace missing values (NaN) in the count columns with 0 so that NaN does not
# propagate through later arithmetic. All listed columns are filled in a single
# vectorised assignment instead of a per-column loop.
df_16[colunas_quantidade] = df_16[colunas_quantidade].fillna(0)

Preenchendo os valores ausentes da coluna **'TP_GRAU_ACADEMICO'**

In [70]:
# 1. Build a boolean Series (mask) that is True where 'TP_GRAU_ACADEMICO' is NaN.
mask_na = df_16['TP_GRAU_ACADEMICO'].isna()

In [71]:
# 2. Apply the boolean mask to keep only the rows where it is True
#    (rows with a missing 'TP_GRAU_ACADEMICO').
g_acadna = df_16[mask_na]

Para começar a preencher os valores ausentes da coluna 'TP_GRAU_ACADEMICO', algumas decisões de análise precisarão ser tomadas.

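Para reduzir o número de valores ausentes na coluna 'TP_GRAU_ACADEMICO', o primeiro passo é analisar a variável 'TP_NIVEL_ACADEMICO' nos registros em que o grau acadêmico não foi informado.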

In [72]:
# 3. Verificando os valores únicos da variável 'TP_NIVEL_ACADEMICO'
g_acadna['TP_NIVEL_ACADEMICO'].unique()

array([2, 1], dtype=int64)

In [73]:
# 4. Boolean mask, built on the filtered frame g_acadna, that is True for rows
#    whose 'TP_NIVEL_ACADEMICO' equals 2.
mask_cs = g_acadna['TP_NIVEL_ACADEMICO'] == 2 # 2 = sequential specific-training course

In [74]:
# 5. Apply the mask to keep only the rows where 'TP_NIVEL_ACADEMICO' == 2.
c_fesp = g_acadna[mask_cs]

In [75]:
c_fesp.shape

(77, 194)

In [76]:
# 6. Verificando as incidências de valores da variável 'NO_CINE_AREA_GERAL'
c_fesp['NO_CINE_AREA_GERAL'].value_counts()

NO_CINE_AREA_GERAL
Negócios, administração e direito                             53
Computação e Tecnologias da Informação e Comunicação (TIC)     7
Serviços                                                       6
Artes e humanidades                                            6
Engenharia, produção e construção                              3
Saúde e bem-estar                                              2
Name: count, dtype: int64

In [77]:
# 7. Work on a copy of the DataFrame so the imputations below do not modify df_16.
df_2016 = df_16.copy()

In [78]:
# 7.1 Checando as dimensões do novo DF
df_2016.shape

(92866, 194)

In [79]:
# 7.2 Verificando os valores ausentes na variável alvo.
df_2016['TP_GRAU_ACADEMICO'].isna().sum()

332

In [80]:
# 7.3 Select the rows whose degree is missing AND whose academic level is 2,
# then assign the new value to the target column.
# Both masks are computed on df_2016 itself, so they share its index. The
# earlier version combined a full-length mask with one built on a filtered
# subset and depended on implicit index alignment to fill the missing labels.
condition = df_2016['TP_GRAU_ACADEMICO'].isna() & df_2016['TP_NIVEL_ACADEMICO'].eq(2)
df_2016.loc[condition, 'TP_GRAU_ACADEMICO'] = 5  # sequential specific-training course

In [81]:
# 7.4 Conferindo a redução no número valores ausentes na variável alvo
df_2016['TP_GRAU_ACADEMICO'].isna().sum()

255

In [82]:
# 8. New mask selecting the values still missing in 'TP_GRAU_ACADEMICO'.
mask1_na = df_2016['TP_GRAU_ACADEMICO'].isna()

In [83]:
# 8.1 New DataFrame filtered by the mask created in the previous step.
g_acadna1 = df_2016[mask1_na]

In [84]:
# 8.2 Verificação da dimensão do df olhando apenas para a quantidade de linhas
g_acadna1.shape[0]

255

In [85]:
# 8.3 Inspect a few sample rows to look for a pattern.
# A fixed random_state makes the same rows come back on every re-run.
g_acadna1.sample(5, random_state=42)

Unnamed: 0,NU_ANO_CENSO,CO_IES,NO_CURSO,CO_CURSO,TP_GRAU_ACADEMICO,IN_GRATUITO,TP_MODALIDADE_ENSINO,TP_NIVEL_ACADEMICO,NO_CINE_AREA_GERAL,CO_CINE_AREA_GERAL,CO_CINE_AREA_ESPECIFICA,NO_CINE_AREA_ESPECIFICA,CO_CINE_AREA_DETALHADA,NO_CINE_AREA_DETALHADA,NO_REGIAO,CO_REGIAO,NO_UF,SG_UF,CO_UF,NO_MUNICIPIO,CO_MUNICIPIO,IN_CAPITAL,TP_DIMENSAO,TP_ORGANIZACAO_ACADEMICA,TP_REDE,TP_CATEGORIA_ADMINISTRATIVA,QT_CURSO,QT_VG_TOTAL,QT_VG_TOTAL_DIURNO,QT_VG_TOTAL_NOTURNO,QT_VG_TOTAL_EAD,QT_VG_NOVA,QT_VG_PROC_SELETIVO,QT_VG_REMANESC,QT_VG_PROG_ESPECIAL,QT_INSCRITO_TOTAL,QT_INSCRITO_TOTAL_DIURNO,QT_INSCRITO_TOTAL_NOTURNO,QT_INSCRITO_TOTAL_EAD,QT_INSC_VG_NOVA,QT_INSC_PROC_SELETIVO,QT_INSC_VG_REMANESC,QT_INSC_VG_PROG_ESPECIAL,QT_ING,QT_ING_FEM,QT_ING_MASC,QT_ING_DIURNO,QT_ING_NOTURNO,QT_ING_VG_NOVA,QT_ING_VESTIBULAR,QT_ING_ENEM,QT_ING_AVALIACAO_SERIADA,QT_ING_SELECAO_SIMPLIFICA,QT_ING_EGR,QT_ING_OUTRO_TIPO_SELECAO,QT_ING_PROC_SELETIVO,QT_ING_VG_REMANESC,QT_ING_VG_PROG_ESPECIAL,QT_ING_OUTRA_FORMA,QT_ING_0_17,QT_ING_18_24,QT_ING_25_29,QT_ING_30_34,QT_ING_35_39,QT_ING_40_49,QT_ING_50_59,QT_ING_60_MAIS,QT_ING_BRANCA,QT_ING_PRETA,QT_ING_PARDA,QT_ING_AMARELA,QT_ING_INDIGENA,QT_ING_CORND,QT_MAT,QT_MAT_FEM,QT_MAT_MASC,QT_MAT_DIURNO,QT_MAT_NOTURNO,QT_MAT_0_17,QT_MAT_18_24,QT_MAT_25_29,QT_MAT_30_34,QT_MAT_35_39,QT_MAT_40_49,QT_MAT_50_59,QT_MAT_60_MAIS,QT_MAT_BRANCA,QT_MAT_PRETA,QT_MAT_PARDA,QT_MAT_AMARELA,QT_MAT_INDIGENA,QT_MAT_CORND,QT_CONC,QT_CONC_FEM,QT_CONC_MASC,QT_CONC_DIURNO,QT_CONC_NOTURNO,QT_CONC_0_17,QT_CONC_18_24,QT_CONC_25_29,QT_CONC_30_34,QT_CONC_35_39,QT_CONC_40_49,QT_CONC_50_59,QT_CONC_60_MAIS,QT_CONC_BRANCA,QT_CONC_PRETA,QT_CONC_PRETA.1,QT_CONC_AMARELA,QT_CONC_INDIGENA,QT_CONC_CORND,QT_ING_NACBRAS,QT_ING_NACESTRANG,QT_MAT_NACBRAS,QT_MAT_NACESTRANG,QT_CONC_NACBRAS,QT_CONC_NACESTRANG,QT_ALUNO_DEFICIENTE,QT_ING_DEFICIENTE,QT_MAT_DEFICIENTE,QT_CONC_DEFICIENTE,QT_ING_FINANC,QT_ING_FINANC_REEMB,QT_ING_FIES,QT_ING_RPFIES,QT_ING_FINANC_REEMB_OUTROS,QT_ING_FINANC_NREEMB,QT_ING_PROUNII,QT_IN
G_PROUNIP,QT_ING_NRPFIES,QT_ING_FINANC_NREEMB_OUTROS,QT_MAT_FINANC,QT_MAT_FINANC_REEMB,QT_MAT_FIES,QT_MAT_RPFIES,QT_MAT_FINANC_REEMB_OUTROS,QT_MAT_FINANC_NREEMB,QT_MAT_PROUNII,QT_MAT_PROUNIP,QT_MAT_NRPFIES,QT_MAT_FINANC_NREEMB_OUTROS,QT_CONC_FINANC,QT_CONC_FINANC_REEMB,QT_CONC_FIES,QT_CONC_RPFIES,QT_CONC_FINANC_REEMB_OUTROS,QT_CONC_FINANC_NREEMB,QT_CONC_PROUNII,QT_CONC_PROUNIP,QT_CONC_NRPFIES,QT_CONC_FINANC_NREEMB_OUTROS,QT_ING_RESERVA_VAGA,QT_ING_RVREDEPUBLICA,QT_ING_RVPDEF,QT_ING_RVSOCIAL_RF,QT_ING_RVOUTROS,QT_MAT_RESERVA_VAGA,QT_MAT_RVREDEPUBLICA,QT_MAT_RVPDEF,QT_MAT_RVSOCIAL_RF,QT_MAT_RVSOCIAL_RF.1,QT_CONC_RESERVA_VAGA,QT_CONC_RVREDEPUBLICA,QT_CONC_RVSOCIAL_RF,QT_CONC_RVOUTROS,QT_SIT_TRANCADA,QT_SIT_DESVINCULADO,QT_SIT_TRANSFERIDO,QT_SIT_FALECIDO,QT_ING_PROCESCPUBLICA,QT_ING_PROCESCPRIVADA,QT_ING_PROCNAOINFORMADA,QT_MAT_PROCESCPUBLICA,QT_MAT_PROCESCPRIVADA,QT_MAT_PROCNAOINFORMADA,QT_CONC_PROCESCPUBLICA,QT_CONC_PROCESCPRIVADA,QT_CONC_PROCNAOINFORMADA,QT_PARFOR,QT_ING_PARFOR,QT_MAT_PARFOR,QT_CONC_PARFOR,QT_APOIO_SOCIAL,QT_ING_APOIO_SOCIAL,QT_MAT_APOIO_SOCIAL,QT_CONC_APOIO_SOCIAL,QT_ATIV_EXTRACURRICULAR,QT_ING_ATIV_EXTRACURRICULAR,QT_MAT_ATIV_EXTRACURRICULAR,QT_CONC_ATIV_EXTRACURRICULAR,QT_MOB_ACADEMICA,QT_ING_MOB_ACADEMICA,QT_MAT_MOB_ACADEMICA,QT_CONC_MOB_ACADEMICA
24278,2016,571,ABI - FILOSOFIA,5001127,,1,1,1,Programas básicos,0,1,Programas básicos,11,Programas básicos,Sul,4.0,Paraná,PR,41.0,Curitiba,4106902.0,1.0,1,1,1,1,0.0,97.0,44.0,53.0,0.0,90.0,0.0,7.0,0.0,333.0,135.0,198.0,0.0,326.0,0.0,7.0,0.0,113.0,44.0,69.0,53.0,60.0,106.0,78.0,25.0,0.0,3.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,81.0,15.0,5.0,7.0,1.0,3.0,1.0,70.0,7.0,13.0,1.0,1.0,21.0,205.0,71.0,134.0,154.0,51.0,0.0,135.0,26.0,17.0,14.0,4.0,7.0,2.0,146.0,7.0,30.0,2.0,1.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112.0,1.0,205.0,0.0,0.0,0.0,4.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.0,51.0,0.0,18.0,0.0,77.0,77.0,0.0,28.0,28.0,0.0,0.0,0.0,0.0,23.0,35.0,4.0,0.0,63.0,50.0,0.0,119.0,86.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,12.0,24.0,0.0,6.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0
15654,2016,56,ABI - LETRAS,5000739,,1,1,1,Programas básicos,0,1,Programas básicos,11,Programas básicos,Sudeste,3.0,São Paulo,SP,35.0,Araraquara,3503208.0,0.0,1,1,1,2,0.0,143.0,72.0,71.0,0.0,120.0,0.0,23.0,0.0,742.0,444.0,298.0,0.0,730.0,0.0,12.0,0.0,173.0,119.0,54.0,78.0,95.0,117.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0,0.0,0.0,0.0,133.0,25.0,8.0,3.0,3.0,1.0,0.0,147.0,7.0,18.0,1.0,0.0,0.0,160.0,115.0,45.0,71.0,89.0,0.0,130.0,20.0,3.0,3.0,3.0,1.0,0.0,133.0,7.0,18.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173.0,0.0,160.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.0,28.0,0.0,0.0,0.0,46.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,22.0,115.0,0.0,74.0,99.0,0.0,66.0,94.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,14.0,15.0,0.0,57.0,27.0,26.0,0.0,1.0,0.0,0.0,0.0
24288,2016,571,ABI - QUÍMICA,5001140,,1,1,1,Programas básicos,0,1,Programas básicos,11,Programas básicos,Sul,4.0,Paraná,PR,41.0,Curitiba,4106902.0,1.0,1,1,1,1,0.0,81.0,81.0,0.0,0.0,66.0,0.0,15.0,0.0,282.0,282.0,0.0,0.0,267.0,0.0,15.0,0.0,84.0,49.0,35.0,84.0,0.0,69.0,48.0,21.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,3.0,73.0,4.0,2.0,2.0,0.0,0.0,0.0,56.0,4.0,16.0,1.0,1.0,6.0,204.0,128.0,76.0,204.0,0.0,3.0,184.0,8.0,6.0,2.0,0.0,1.0,0.0,150.0,7.0,30.0,7.0,1.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84.0,0.0,204.0,0.0,0.0,0.0,4.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,35.0,0.0,18.0,0.0,65.0,63.0,0.0,31.0,31.0,0.0,0.0,0.0,0.0,12.0,32.0,6.0,0.0,44.0,40.0,0.0,96.0,108.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,19.0,46.0,0.0,31.0,2.0,30.0,0.0,1.0,0.0,1.0,0.0
25251,2016,9,ABI - CIÊNCIAS SOCIAIS,5001312,,1,1,1,Programas básicos,0,1,Programas básicos,11,Programas básicos,Sul,4.0,Paraná,PR,41.0,Londrina,4113700.0,0.0,1,1,1,2,0.0,153.0,75.0,78.0,0.0,100.0,0.0,53.0,0.0,1006.0,436.0,570.0,0.0,1001.0,0.0,5.0,0.0,94.0,51.0,43.0,41.0,53.0,93.0,52.0,41.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,82.0,3.0,4.0,4.0,1.0,0.0,0.0,60.0,16.0,16.0,2.0,0.0,0.0,144.0,79.0,65.0,71.0,73.0,0.0,128.0,5.0,5.0,5.0,1.0,0.0,0.0,96.0,19.0,24.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,18.0,0.0,0.0,0.0,52.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,5.0,0.0,0.0,60.0,34.0,0.0,98.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,8.0,12.0,0.0,58.0,28.0,56.0,0.0,0.0,0.0,0.0,0.0
12763,2016,573,ABI - HISTÓRIA,5000636,,1,1,1,Programas básicos,0,1,Programas básicos,11,Programas básicos,Sudeste,3.0,Espírito Santo,ES,32.0,Vitória,3205309.0,1.0,1,1,1,1,0.0,88.0,48.0,40.0,0.0,80.0,0.0,8.0,0.0,470.0,263.0,207.0,0.0,406.0,0.0,64.0,0.0,88.0,32.0,56.0,48.0,40.0,79.0,79.0,79.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,1.0,0.0,66.0,11.0,4.0,1.0,4.0,2.0,0.0,46.0,9.0,31.0,0.0,0.0,2.0,201.0,78.0,123.0,162.0,39.0,0.0,157.0,23.0,6.0,4.0,5.0,4.0,2.0,100.0,18.0,76.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88.0,0.0,201.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39.0,8.0,0.0,8.0,12.0,91.0,13.0,0.0,18.0,18.0,0.0,0.0,0.0,0.0,2.0,16.0,31.0,0.0,43.0,45.0,0.0,89.0,92.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,20.0,63.0,0.0,20.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0


In [86]:
# 8.4 Analisando algumas estatísticas das variáveis não númericas do DataFrame
g_acadna1.describe(include = 'object')

Unnamed: 0,NO_CURSO,NO_CINE_AREA_GERAL,NO_CINE_AREA_ESPECIFICA,NO_CINE_AREA_DETALHADA,NO_REGIAO,NO_UF,SG_UF,NO_MUNICIPIO
count,255,255,255,255,255,255,255,255
unique,82,1,1,1,5,16,16,54
top,ABI - CIÊNCIAS BIOLÓGICAS,Programas básicos,Programas básicos,Programas básicos,Sudeste,São Paulo,SP,Brasília
freq,25,255,255,255,152,71,71,18


In [87]:
#  8.4.1 Verificando as regiões do Brasil que não preencheram a variável ´TP_GRAU_ACADEMICO´ para o cursos diferentes de ...
# 'Sequencial de formação especifica"
g_acadna1['NO_REGIAO'].value_counts()

NO_REGIAO
Sudeste         152
Sul              40
Centro-Oeste     34
Nordeste         23
Norte             6
Name: count, dtype: int64

In [88]:
# 8.4.2 Verificando insidência de valores ausentes por tipo de Rede (Pública e Privada)
g_acadna1['TP_REDE'].value_counts()

TP_REDE
1    253
2      2
Name: count, dtype: int64

In [89]:
# 8.4.3 Visualizando a ocorrência dos cursos para determinar quais são bacharelados por definição e quais não são.
g_acadna1['NO_CURSO'].value_counts()

NO_CURSO
ABI - CIÊNCIAS BIOLÓGICAS                                       25
ABI - CIÊNCIAS SOCIAIS                                          16
ABI - MATEMÁTICA                                                14
ABI - GEOGRAFIA                                                 14
ABI - QUÍMICA                                                   13
ABI - HISTÓRIA                                                  13
ABI - FÍSICA                                                    13
ABI - FILOSOFIA                                                 12
ABI - LETRAS                                                    12
ABI - ENGENHARIA                                                 6
ABI - ARTES VISUAIS                                              5
ABI - ARTES CÊNICAS                                              5
ABI - MÚSICA                                                     5
ABI - EDUCAÇÃO FÍSICA                                            4
ABI - CIÊNCIAS EXATAS                                

In [90]:
g_acadna1['NO_CURSO'].unique()

array(['ABI - LICENCIATURA INTERCULTURAL', 'ABI - CIÊNCIAS SOCIAIS',
       'ABI - FILOSOFIA', 'ABI - FÍSICA', 'ABI - LETRAS',
       'ABI - ENGENHARIA DE ENERGIAS E MEIO AMBIENTE', 'ABI - ENGENHARIA',
       'ABI - LICENCIATURAS INTERDISCIPLINARES',
       'ABI - CIÊNCIAS BIOLÓGICAS', 'ABI - COMPOSIÇÃO E REGÊNCIA',
       'ABI - DANÇA', 'ABI - ENGENHARIA DE MINAS', 'ABI - GEOGRAFIA',
       'ABI - HISTÓRIA', 'ABI - INSTRUMENTO', 'ABI - LETRAS VERNÁCULAS',
       'ABI - LETRAS VERNÁCULAS E LÍNGUA ESTRANGEIRA MODERNA',
       'ABI - LÍNGUA ESTRANGEIRA',
       'ABI - LÍNGUA ESTRANGEIRA MODERNA OU CLÁSSICA', 'ABI - MATEMÁTICA',
       'ABI - MÚSICA POPULAR', 'ABI - QUÍMICA', 'ABI - ARTES VISUAIS',
       'ABI - MÚSICA', 'ABI - TEATRO',
       'GEOGRAFIA - LICENCIATURA OU BACHARELADO',
       'HISTÓRIA - LICENCIATURA OU BACHARELADO', 'ABI - EDUCAÇÃO FÍSICA',
       'ABI - ENFERMAGEM',
       'CIÊNCIAS SOCIAIS - LICENCIATURA OU BACHARELADO',
       'EDUCAÇÃO FÍSICA - LICENCIATURA OU BACHAR

In [91]:
# 8.4.4 Course names to be classified as bachelor's degrees. The list is used
# with .isin() on 'NO_CURSO' to build the assignment condition.
bacharelado = [
    'ABI - ENGENHARIA DE ENERGIAS E MEIO AMBIENTE',
    'ABI - ENGENHARIA',
    'ABI - ENGENHARIA DE MINAS',
    'ABI - ENGENHARIA MECÂNICA',
    'ABI - ENGENHARIA ELÉTRICA',
    'ABI - CICLO BÁSICO MATERIAIS/METALÚRGICA',
    'ABI - COMUNICAÇÃO SOCIAL',
    'ABI - DESIGN',
    'ABI - CIÊNCIAS BIOMÉDICAS',
    'ABI - ECONOMIA EMPRESARIAL E CONTROLADORIA',
    'ABI - MEDICINA',
    'ABI - PSICOLOGIA',
    'ABI - CIÊNCIA DA COMPUTAÇÃO',
    'ABI - ENFERMAGEM',
    'ENFERMAGEM - LICENCIATURA OU BACHARELADO'
]
len(bacharelado)

15

In [92]:
# 8.4.4.2 Course names to be classified as "bachelor's or teaching degree"
# (licenciatura ou bacharelado). The list is used with .isin() on 'NO_CURSO'
# to build the second assignment condition.
lic_bac = [
    'ABI - LICENCIATURA INTERCULTURAL',
    'ABI - LICENCIATURAS INTERDISCIPLINARES',
    'ABI - CIÊNCIAS BIOLÓGICAS',
    'ABI - CIÊNCIAS SOCIAIS',
    'ABI - FILOSOFIA',
    'ABI - FÍSICA',
    'ABI - GEOGRAFIA',
    'ABI - HISTÓRIA',
    'ABI - MATEMÁTICA',
    'ABI - QUÍMICA',
    'ABI - LETRAS',
    'ABI - LETRAS VERNÁCULAS',
    'ABI - LETRAS VERNÁCULAS E LÍNGUA ESTRANGEIRA MODERNA',
    'ABI - LÍNGUA ESTRANGEIRA',
    'ABI - LÍNGUA ESTRANGEIRA MODERNA OU CLÁSSICA',
    'ABI - ARTES VISUAIS',
    'ABI - COMPOSIÇÃO E REGÊNCIA',
    'ABI - DANÇA',
    'ABI - INSTRUMENTO',
    'ABI - MÚSICA POPULAR',
    'ABI - MÚSICA',
    'ABI - TEATRO',
    'ABI - EDUCAÇÃO FÍSICA',
    'ABI - CIÊNCIAS DA NATUREZA',
    'ABI - CIÊNCIAS EXATAS',
    'ABI - ARTES CÊNICAS',
    'ABI - EDUCAÇÃO FÍSICA E ESPORTE',
    'ABI - MATEMÁTICA APLICADA',
    'ABI - MATEMÁTICA APLICADA E COMPUTACIONAL',
    'ABI - EDUCAÇÃO INTERCULTURAL',
    'ABI - ARTES PLÁSTICAS',
    'ABI - FÍSICA, MATEMÁTICA/MATEMÁTICA APLICADA E COMPUTACIONAL',
    'ABI - LETRAS PORTUGUÊS',
    'ABI - LETRAS FRANCÊS',
    'ABI - LETRAS INGLÊS',
    'ABI - LETRAS PORTUGUÊS E ESPANHOL',
    'ABI - LETRAS ALEMÂO',
    'ABI - LETRAS ESPANHOL',
    'ABI - LETRAS ITALIANO',
    'ABI - LETRAS LÍNGUA PORTUGUESA',
    'ABI - LETRAS - LÍNGUA PORTUGUESA',
    'ABI - LETRAS - PORTUGUÊS - ARABE',
    'ABI - LETRAS - PORTUGUÊS - ESPANHOL',
    'ABI - LETRAS - PORTUGUÊS - FRANCES',
    'ABI - LETRAS - PORTUGUÊS - GREGO',
    'ABI - LETRAS - PORTUGUÊS - HEBRAICO',
    'ABI - LETRAS - PORTUGUÊS - INGLES',
    'ABI - LETRAS - PORTUGUÊS - ITALIANO',
    'ABI - LETRAS - PORTUGUÊS - JAPONES',
    'ABI - LETRAS - PORTUGUÊS - LATIM',
    'ABI - LETRAS - PORTUGUÊS - RUSSO',
    'ABI - LETRAS PORTUGUÊS - ALEMÃO',
    'ABI - LITERATURAS DE LÍNGUA PORTUGUESA',
    'LETRAS - JAPONÊS',
    'LETRAS - POLONÊS',
    'LETRAS - PORTUGUÊS, E/OU ALEMÃO, E/OU GREGO, E/OU LATIM',
    'GEOGRAFIA - LICENCIATURA OU BACHARELADO',
    'HISTÓRIA - LICENCIATURA OU BACHARELADO',
    'CIÊNCIAS SOCIAIS - LICENCIATURA OU BACHARELADO',
    'EDUCAÇÃO FÍSICA - LICENCIATURA OU BACHARELADO',
    'FILOSOFIA - LICENCIATURA OU BACHARELADO',
    'MATEMÁTICA - LICENCIATURA OU BACHARELADO',
    'MÚSICA - LICENCIATURA OU BACHARELADO',
    'TEATRO - LICENCIATURA OU BACHARELADO',
    'CIÊNCIAS BIOLÓGICAS - LICENCIATURA OU BACHARELADO',
    'ECONOMIA DOMÉSTICA - LICENCIATURA OU BACHARELADO',
    'QUÍMICA - LICENCIATURA OU BACHARELADO'
]

In [93]:
# 8.4.4.1 Boolean conditions that drive the value assignments.
# mask3 flags rows still missing a degree; mask2 / mask4 flag rows whose course
# name appears in the `bacharelado` / `lic_bac` lists respectively.
mask3 = df_2016['TP_GRAU_ACADEMICO'].isna()
course_name = df_2016['NO_CURSO']
mask2, mask4 = course_name.isin(bacharelado), course_name.isin(lic_bac)
condiction1 = mask3 & mask2
condiction2 = mask3 & mask4

In [94]:
# Conferindo número de atribuições da condiction1
condiction1.sum()

25

In [95]:
df_2016['TP_GRAU_ACADEMICO'].isna().sum()

255

In [96]:
# 8.4.4.3 Assign the value for the rows matching condiction1
# (missing degree and course name in the `bacharelado` list).
df_2016.loc[condiction1, 'TP_GRAU_ACADEMICO'] = 1 # bachelor's degree (bacharelado)

In [97]:
df_2016['TP_GRAU_ACADEMICO'].isna().sum()

230

In [98]:
# 8.4.4.4 Assign the value for the rows matching condiction2
# (missing degree and course name in the `lic_bac` list).
df_2016.loc[condiction2, 'TP_GRAU_ACADEMICO'] = 4 # bachelor's or teaching degree

In [99]:
# Verificando o número de atribuições da condiction2
condiction2.sum()

230

In [101]:
# 9. Verificação da incidência de valores nulos na variável 'TP_GRAU_ACADEMICO'
df_2016['TP_GRAU_ACADEMICO'].isna().sum()

0

In [102]:
df_tgau = df_2016['TP_GRAU_ACADEMICO'].isna()

**Tratamento das demais variáveis com valores ausentes**

['NO_REGIAO', 'CO_REGIAO', 'NO_UF', 'SG_UF', 'CO_UF', 'NO_MUNICIPIO', 'CO_MUNICIPIO', 'IN_CAPITAL']

In [103]:
(df_2016.isna().mean()*100).round(2)

NU_ANO_CENSO                    0.00
CO_IES                          0.00
NO_CURSO                        0.00
CO_CURSO                        0.00
TP_GRAU_ACADEMICO               0.00
IN_GRATUITO                     0.00
TP_MODALIDADE_ENSINO            0.00
TP_NIVEL_ACADEMICO              0.00
NO_CINE_AREA_GERAL              0.00
CO_CINE_AREA_GERAL              0.00
CO_CINE_AREA_ESPECIFICA         0.00
NO_CINE_AREA_ESPECIFICA         0.00
CO_CINE_AREA_DETALHADA          0.00
NO_CINE_AREA_DETALHADA          0.00
NO_REGIAO                       1.79
CO_REGIAO                       1.79
NO_UF                           1.79
SG_UF                           1.79
CO_UF                           1.79
NO_MUNICIPIO                    1.79
CO_MUNICIPIO                    1.79
IN_CAPITAL                      1.79
TP_DIMENSAO                     0.00
TP_ORGANIZACAO_ACADEMICA        0.00
TP_REDE                         0.00
TP_CATEGORIA_ADMINISTRATIVA     0.00
QT_CURSO                        0.00
Q

Aplicação da função .map() para preencher os valores ausentes da variável **'NO_REGIAO'**

In [104]:
# 1. Check the dimensions of the lookup DataFrame (IES registry, the mapping source).
ies_16.shape

(2407, 9)

In [105]:
# 1.2 Verificando se existe dados duplicados na variável chave do df lookup
ies_16['CO_IES'].duplicated().sum()

0

In [106]:
# 2. Build the lookup Series: index = CO_IES (key), values = 'NO_REGIAO_IES'.
map_cat1 = ies_16.set_index('CO_IES')['NO_REGIAO_IES']
# Series.map() needs a lookup with a unique index. No de-duplication is applied
# here because the duplicate check on ies_16['CO_IES'] returned 0.

In [107]:
map_cat1.head()

CO_IES
1    Centro-Oeste
2    Centro-Oeste
3        Nordeste
4           Norte
5        Nordeste
Name: NO_REGIAO_IES, dtype: object

In [108]:
# Build a Series aligned with df_2016 holding the looked-up value for each CO_IES.
cat_map = df_2016['CO_IES'].map(map_cat1)

In [109]:
# Fill only the missing 'NO_REGIAO' entries with the looked-up values;
# fillna leaves entries that already have a value unchanged.
df_2016['NO_REGIAO'] = df_2016['NO_REGIAO'].fillna(cat_map)

In [110]:
df_2016['NO_REGIAO'].isna().sum()

0

Aplicando a função na variável **'CO_REGIAO'**

In [111]:
# 1. Criação da série de mapeamentos (lookup)
map_cat2 = ies_16.set_index('CO_IES')['CO_REGIAO_IES']

In [112]:
# 2. Gerando série com os valores do mapeamento
cat_map2 = df_2016['CO_IES'].map(map_cat2) 

In [113]:
# 3. Preechimento de valores com .fillna()
df_2016['CO_REGIAO'] = df_2016['CO_REGIAO'].fillna(cat_map2)

In [114]:
# 4. Validando a aplicação
df_2016['CO_REGIAO'].isna().sum()

0

Aplicando a função na variável **'NO_UF'**

In [115]:
# 1. Criação da série de mapeamento (lookup)
map_cat3 = ies_16.set_index('CO_IES')['NO_UF_IES']

In [116]:
# 2. Gerando série com os valores do mapeamento
cat_map3 = df_2016['CO_IES'].map(map_cat3)

In [117]:
# 3. Preenchimento de valores com .fillna()
df_2016['NO_UF'] = df_2016['NO_UF'].fillna(cat_map3)

In [118]:
# 4. Validação da aplicação
df_2016['NO_UF'].isna().sum()

0

Aplicando a função na variável **'SG_UF'**

In [119]:
# 1. Criação da série de mapeamento (lookup)
map_cat4 = ies_16.set_index('CO_IES')['SG_UF_IES']

In [120]:
# 2. Gerando série com os valores do mapeamento
cat_map4 = df_2016['CO_IES'].map(map_cat4)

In [121]:
# 3. Preenchimento de valores com .fillna()
df_2016['SG_UF'] = df_2016['SG_UF'].fillna(cat_map4)

In [122]:
# 4. Validação da aplicação
df_2016['SG_UF'].isna().sum()

0

Aplicando a função na variável **'CO_UF'**

In [123]:
df_2016.CO_UF.isna().sum()

1664

In [124]:
# 1. Criação de série de mapeamento (lookup)
map_cat5 = ies_16.set_index('CO_IES')['CO_UF_IES']

In [125]:
# 2. Gerando série com os valores do mapemento
cat_map5 = df_2016['CO_IES'].map(map_cat5)

In [127]:
# 3. Preechimento de valores com .fillna()
df_2016['CO_UF'] = df_2016['CO_UF'].fillna(cat_map5)

In [128]:
# 4. Validação da aplicação
df_2016.CO_UF.isna().sum()

0

Aplicando a função na variável **'NO_MUNICIPIO'**

In [129]:
df_2016.NO_MUNICIPIO.isna().sum()

1664

In [130]:
# 1. Criação de série de mapeamento (lookup)
map_cat6 = ies_16.set_index('CO_IES')['NO_MUNICIPIO_IES']

In [131]:
# 2. Gerando série com os valores do mapeamento
cat_map6 = df_2016['CO_IES'].map(map_cat6)

In [132]:
# 3. Preenchimento de valores com fillna()
df_2016['NO_MUNICIPIO'] = df_2016['NO_MUNICIPIO'].fillna(cat_map6)

In [133]:
# 4. Validação
df_2016['NO_MUNICIPIO'].isna().sum()

0

Aplicando a função na variável **'CO_MUNICIPIO'**

In [134]:
df_2016.CO_MUNICIPIO.isna().sum()

1664

In [135]:
# 1. Criação de série de mapeamento (lookup)
map_cat7 = ies_16.set_index('CO_IES')['CO_MUNICIPIO_IES']

In [136]:
# 2. Gerando série com os valores do mapeamento
cat_map7 = df_2016['CO_IES'].map(map_cat7)

In [137]:
# 3. Preenchimento de valores com fillna()
df_2016['CO_MUNICIPIO'] = df_2016['CO_MUNICIPIO'].fillna(cat_map7)

In [138]:
# 4. Validação
df_2016['CO_MUNICIPIO'].isna().sum()

0

Aplicando a função na variável **'IN_CAPITAL'**

In [139]:
df_2016.IN_CAPITAL.isna().sum()

1664

In [140]:
# 1. Criação de série de mapeamento (lookup)
map_cat8 = ies_16.set_index('CO_IES')['IN_CAPITAL_IES']

In [141]:
# 2. Gerando série com os valores do mapeamento
cat_map8 = df_2016['CO_IES'].map(map_cat8)

In [142]:
# 3. Preenchimento de valores com fillna()
df_2016['IN_CAPITAL'] = df_2016['IN_CAPITAL'].fillna(cat_map8)

In [90]:
# 4. Validação
df_2016['IN_CAPITAL'].isna().sum()

0

In [143]:
# Verificando demais variáveis com valores nulos
(df_2016.isna().mean()*100).round(2).head()

NU_ANO_CENSO         0.0
CO_IES               0.0
NO_CURSO             0.0
CO_CURSO             0.0
TP_GRAU_ACADEMICO    0.0
dtype: float64

#### Aplicação de imputação condicional para preencher os valores ausentes da variável 'TP_REDE'

In [92]:
# 1. Definindo os valores da variável que serão usadas para validação
#ies_publicas = [1, 2, 3] # TP_CATEGORIA_ADMINISTRATIVA (PÚBLICA)
# 2. Definindo a condição de nulidade da variável TP_REDE
#cond_na = df_2013['TP_REDE'].isna()
# 3. Definindo a condição da coluna secundária
#cond_sec = df_2013['TP_CATEGORIA_ADMINISTRATIVA'].isin(ies_publicas)
# 4. Combinando as duas condições (NA e [1, 2, 3])
#conditions = (cond_na & cond_sec)
# 5. Aplicando a alteração (imputação condicional)
#valor = 1
#df_2013.loc[conditions, 'TP_REDE'] = valor

In [93]:
# print(f" Aplicado em {conditions.sum()} registros")

In [96]:
# 1. Definindo  os valores da variável que será usada para validação
#ies_privadas = [4, 5, 6, 7, 8, 9]
# 2. Condição de nulidade
#cond_na2 = df_2013['TP_REDE'].isna()
# 3. Coluna segundária
#cond_sec2 = df_2013['TP_CATEGORIA_ADMINISTRATIVA'].isin(ies_privadas)
# 4. Combinando as duas condições
#condictions2 = (cond_na2 & cond_sec2)
# 5. Aplicar a Alteração
#valor2 = 2
#df_2013.loc[condictions2, 'TP_REDE'] = valor2

In [98]:
#print(f"Aplicado em {condictions2.sum()} registros")

Aplicado em 52291 registros


In [100]:
# Restam valores ausentes?
#df_2013['TP_REDE'].isna().sum()

0

In [144]:
# Existem outras variáveis com valores ausentes?
(df_2016.isna().mean()*100).round(2)

NU_ANO_CENSO                    0.0
CO_IES                          0.0
NO_CURSO                        0.0
CO_CURSO                        0.0
TP_GRAU_ACADEMICO               0.0
IN_GRATUITO                     0.0
TP_MODALIDADE_ENSINO            0.0
TP_NIVEL_ACADEMICO              0.0
NO_CINE_AREA_GERAL              0.0
CO_CINE_AREA_GERAL              0.0
CO_CINE_AREA_ESPECIFICA         0.0
NO_CINE_AREA_ESPECIFICA         0.0
CO_CINE_AREA_DETALHADA          0.0
NO_CINE_AREA_DETALHADA          0.0
NO_REGIAO                       0.0
CO_REGIAO                       0.0
NO_UF                           0.0
SG_UF                           0.0
CO_UF                           0.0
NO_MUNICIPIO                    0.0
CO_MUNICIPIO                    0.0
IN_CAPITAL                      0.0
TP_DIMENSAO                     0.0
TP_ORGANIZACAO_ACADEMICA        0.0
TP_REDE                         0.0
TP_CATEGORIA_ADMINISTRATIVA     0.0
QT_CURSO                        0.0
QT_VG_TOTAL                 

In [145]:
# Save the treated DataFrame as a ';'-separated CSV using the detected encoding.
# NOTE(review): this path is the same string assigned to `censo_16`, so the
# file read at the top of the notebook is overwritten. to_csv writes floats
# with '.' by default while the reader uses decimal=',' — TODO confirm that a
# Restart & Run All still parses the QT_ columns as numbers.
df_2016.to_csv('censo_2016.csv', sep=';', encoding=cod_23, index=False)

### Fim