In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the ENADE 2019 microdata text file into a DataFrame.
# Fields are separated by ';' and decimal numbers use ',' as the decimal mark.
enade = pd.read_csv(
    filepath_or_buffer='./enade2019/microdados_enade_2019/2019/3.DADOS/microdados_enade_2019.txt',
    sep=';',
    decimal=',',
)

In [None]:
##### Analysing #####

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of NT_GER, the general grade.
enade['NT_GER'].describe()

count    390091.000000
mean         44.076610
std          14.542059
min           0.000000
25%          33.200000
50%          43.800000
75%          54.800000
max          93.000000
Name: NT_GER, dtype: float64

In [4]:
# How many rows have no NT_GER value.
enade['NT_GER'].isna().sum()

43839

In [5]:
# Share of rows with a missing NT_GER:
# the null count divided by the total number of rows in the frame.
enade['NT_GER'].isna().sum() / len(enade)

0.10102781554628627

In [6]:
# NT_GER (general grade) summary restricted to the northeast region,
# i.e. rows whose CO_REGIAO_CURSO code equals 2.
enade.loc[enade['CO_REGIAO_CURSO'].eq(2), 'NT_GER'].describe()

count    82944.000000
mean        43.788555
std         14.622670
min          0.000000
25%         32.800000
50%         43.400000
75%         54.600000
max         91.600000
Name: NT_GER, dtype: float64

In [7]:
# NT_GER (general grade) summary keeping only strictly positive grades,
# so zero grades are left out.
enade.loc[enade['NT_GER'].gt(0), 'NT_GER'].describe()

count    389730.000000
mean         44.117437
std          14.486758
min           0.300000
25%          33.300000
50%          43.800000
75%          54.800000
max          93.000000
Name: NT_GER, dtype: float64

In [10]:
# Students younger than 17 or older than 50 are exceptions at the university,
# so this NT_GER summary keeps only ages from 17 to 50, both ends included.
enade.loc[enade['NU_IDADE'].between(17, 50), 'NT_GER'].describe()

count    385978.000000
mean         44.149186
std          14.527992
min           0.000000
25%          33.300000
50%          43.900000
75%          54.800000
max          93.000000
Name: NT_GER, dtype: float64

In [11]:
# Number of rows per TP_SEXO value (men and women).
enade['TP_SEXO'].value_counts()

F    233279
M    200651
Name: TP_SEXO, dtype: int64

In [5]:
# Each TP_SEXO count expressed as a fraction of all rows in the frame.
enade['TP_SEXO'].value_counts().div(len(enade))

F    0.537596
M    0.462404
Name: TP_SEXO, dtype: float64

In [3]:
# Side-by-side summary statistics of the three grade types:
# NT_GER (general grade), NT_FG (general formation) and NT_CE (specific component).
enade.loc[:, ['NT_GER', 'NT_FG', 'NT_CE']].describe()

Unnamed: 0,NT_GER,NT_FG,NT_CE
count,390091.0,390091.0,390091.0
mean,44.07661,39.837949,45.472874
std,14.542059,16.241178,16.641654
min,0.0,0.0,0.0
25%,33.2,28.5,32.9
50%,43.8,39.3,45.2
75%,54.8,51.3,57.8
max,93.0,97.8,100.0


In [7]:
# Mean of each grade type per region: group the rows by CO_REGIAO_CURSO,
# keep the three grade columns and average each one within its group.
enade.groupby('CO_REGIAO_CURSO')[['NT_GER', 'NT_FG', 'NT_CE']].mean()

Unnamed: 0_level_0,NT_GER,NT_FG,NT_CE
CO_REGIAO_CURSO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,40.282753,37.119037,41.320522
2,43.788555,40.057649,45.015455
3,44.016319,40.144793,45.290297
4,45.747187,40.082428,47.618845
5,44.604609,39.180022,46.396032


In [None]:
##### Transforming #####
# Fields selected to work with: CO_IES, CO_CATEGAD, CO_GRUPO, CO_MODALIDADE, CO_UF_CURSO, CO_REGIAO_CURSO, NU_IDADE, TP_SEXO, NT_GER, NT_FG, NT_CE

In [4]:
# CO_CATEGAD (administrative category) has 15 codes. They are collapsed into a new
# DESCR_CATEGORIA column holding only two labels: 'Público' (public) and 'Privado' (private).
# np.select takes the label of the first matching condition; a row whose code is in
# neither list gets an empty string.
enade['DESCR_CATEGORIA'] = np.select(
    [
        enade['CO_CATEGAD'].isin([93, 115, 116, 10001, 10002, 10003]),
        enade['CO_CATEGAD'].isin([118, 120, 121, 10005, 10006, 10007, 10008, 10009, 17634]),
    ],
    ['Público', 'Privado'],
    default="",
)

In [5]:
# Number of rows per DESCR_CATEGORIA label.
enade['DESCR_CATEGORIA'].value_counts()

Privado    332879
Público    101051
Name: DESCR_CATEGORIA, dtype: int64