# **Preparing Brazil's National Exam (ENEM) 2022 microdata for analysis**

## **Data loading**

In [3]:
import os
import pandas as pd
import sys
import warnings
from IPython.display import display
# NOTE(review): this silences every warning for the rest of the notebook, including
# pandas deprecation warnings -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Folder that holds the .csv files: ../data/microdados_enem_2022/DADOS (relative to the working directory)
data_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'microdados_enem_2022', 'DADOS'))
files = {
    'microdata': 'MICRODADOS_ENEM_2022.csv',
    'questionnaire': 'QUEST_HAB_ESTUDO.csv',
    'exam_items': 'ITENS_PROVA_2022.csv'
}


def _read_enem_csv(file_key):
    """Reads the file registered under `file_key` in `files` from `data_path`.

    All three files are read the same way: ';' as separator and ISO-8859-1 encoding.
    Raises FileNotFoundError (from pandas) when the file is missing.
    """
    return pd.read_csv(os.path.join(data_path, files[file_key]), sep=';', encoding='ISO-8859-1')


try:
    df_microdata = _read_enem_csv('microdata')
    df_questionnaire = _read_enem_csv('questionnaire')
    df_exam_items = _read_enem_csv('exam_items')
except FileNotFoundError as err:
    # Fails loudly with the searched folder instead of calling sys.exit(), so the
    # notebook stops with an actionable error message
    raise FileNotFoundError(
        f"Error: Files not found in '{data_path}'. "
        "Did you forget to run 'scripts/download_microdados_2022.py'?"
    ) from err

# Renames study habits questionnaire columns starting with 'Q', adding the '_hab' suffix,
# so they do not clash with the 'Q...' columns of the microdata file.
# Columns that do not start with 'Q' (e.g. 'TP_RESPOSTA') keep their original names.
df_questionnaire = df_questionnaire.rename(
    columns={col: col + '_hab' for col in df_questionnaire.columns if col.startswith('Q')}
)

# Merges microdata and questionnaire based on the 'NU_INSCRICAO' column.
# how='left' keeps every microdata row, even without questionnaire answers.
n_microdata_rows = len(df_microdata)
df = df_microdata.merge(df_questionnaire, on='NU_INSCRICAO', how='left')

# A left merge only grows when the right-hand key is duplicated; report it instead of
# silently carrying duplicated participants into the analysis
if len(df) != n_microdata_rows:
    print(
        f'Warning: the merge changed the row count from {n_microdata_rows} to {len(df)}; '
        "check 'NU_INSCRICAO' for duplicates in the questionnaire file."
    )

# Frees the two source frames, which are no longer needed after the merge
del df_microdata, df_questionnaire

display(df_exam_items)
display(df)

Unnamed: 0,CO_POSICAO,SG_AREA,CO_ITEM,TX_GABARITO,CO_HABILIDADE,IN_ITEM_ABAN,TX_MOTIVO_ABAN,NU_PARAM_A,NU_PARAM_B,NU_PARAM_C,TX_COR,CO_PROVA,TP_LINGUA,IN_ITEM_ADAPTADO
0,46,CH,140506,D,26,0,,0.98121,1.34897,0.06218,AZUL,1055,,0
1,90,CH,111984,A,3,0,,1.90716,1.36087,0.22521,AZUL,1055,,0
2,89,CH,140190,C,5,0,,1.94428,0.27387,0.14788,AZUL,1055,,0
3,88,CH,96516,E,2,0,,1.96743,2.20588,0.08863,AZUL,1055,,0
4,87,CH,140572,B,19,0,,2.46583,0.73772,0.17053,AZUL,1055,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5360,96,CN,83901,B,8,0,,0.40583,2.51676,0.00918,CINZA,1190,,0
5361,95,CN,88655,D,4,0,,3.82953,0.85524,0.11511,CINZA,1190,,0
5362,94,CN,141503,C,3,0,,2.86161,0.49480,0.16060,CINZA,1190,,0
5363,93,CN,82765,E,1,0,,0.98394,1.03388,0.00428,CINZA,1190,,0


Unnamed: 0,NU_INSCRICAO,NU_ANO,TP_FAIXA_ETARIA,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,...,Q033B_hab,Q033C_hab,Q033D_hab,Q033E_hab,Q033F_hab,Q033G_hab,Q033H_hab,Q033I_hab,Q033J_hab,Q034_hab
0,210057943671,2022,14,M,2,2,1,1,2,1,...,,,,,,,,,,
1,210057516120,2022,14,M,2,1,1,1,16,1,...,,,,,,,,,,
2,210057280536,2022,5,F,1,2,1,1,2,1,...,,,,,,,,,,
3,210055724397,2022,6,M,1,3,1,1,2,1,...,,,,,,,,,,C
4,210055097896,2022,4,M,0,3,1,1,1,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3476103,210056389903,2022,3,M,1,1,1,2,0,2,...,,,,,,,,,,
3476104,210057205644,2022,14,F,2,1,1,2,0,2,...,,,,,,,,,,
3476105,210056699189,2022,2,M,1,1,1,2,0,2,...,,,,,,,,,,
3476106,210056983033,2022,3,M,1,3,1,2,0,3,...,,,,,,,,,,


## **Preprocessing**

In [4]:
# Keeps only the most relevant columns, preserving their original order.
#
# Left out on purpose:
#   NU_ANO                                         -- irrelevant, all values are 2022
#   CO_MUNICIPIO_ESC, CO_UF_ESC,
#   CO_MUNICIPIO_PROVA, CO_UF_PROVA                -- irrelevant IDs
#   CO_PROVA_CN / _CH / _LC / _MT                  -- irrelevant IDs
#   TX_RESPOSTAS_CN / _CH / _LC / _MT              -- answer vectors are not needed as we have the final scores
#   TX_GABARITO_CN / _CH / _LC / _MT               -- answer keys are not needed as we have the final scores
#   NU_NOTA_COMP1 .. NU_NOTA_COMP5                 -- detailed essay score components will not be used

# Participant, school, exam location, attendance and score columns
participant_cols = [
    'NU_INSCRICAO',
    'TP_FAIXA_ETARIA', 'TP_SEXO', 'TP_ESTADO_CIVIL', 'TP_COR_RACA', 'TP_NACIONALIDADE',
    'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA', 'TP_ENSINO', 'IN_TREINEIRO',
    'NO_MUNICIPIO_ESC', 'SG_UF_ESC',
    'TP_DEPENDENCIA_ADM_ESC', 'TP_LOCALIZACAO_ESC', 'TP_SIT_FUNC_ESC',
    'NO_MUNICIPIO_PROVA', 'SG_UF_PROVA',
    'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC', 'TP_PRESENCA_MT',
    'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT',
    'TP_LINGUA',
    'TP_STATUS_REDACAO',
    'NU_NOTA_REDACAO',
]

# Socioeconomic questionnaire answers: Q001 .. Q025
socio_cols = [f'Q{number:03d}' for number in range(1, 26)]

# Study habits during pandemic questionnaire answers: 'TP_RESPOSTA' followed by
# Q001_hab .. Q034_hab. Questions listed below are split into lettered sub-items
# (e.g. Q025A_hab .. Q025F_hab); every other question is a single column.
habits_sub_item_letters = {
    25: 'ABCDEF',
    26: 'ABCDEFGH',
    28: 'ABCDEFGHIJKLMNOPQR',
    30: 'ABCDEFG',
    32: 'ABCDEFG',
    33: 'ABCDEFGHIJ',
}
habits_cols = ['TP_RESPOSTA']
for number in range(1, 35):
    # A single empty "letter" yields the plain column name, e.g. 'Q027_hab'
    for letter in habits_sub_item_letters.get(number, ('',)):
        habits_cols.append(f'Q{number:03d}{letter}_hab')

df = df[participant_cols + socio_cols + habits_cols]

display(df.shape)

(3476108, 139)

In [5]:
# Code -> label tables for the categorical columns of the microdata file.
# `mappings` is keyed by column name; each value translates the raw codes of that column.

# Age bracket: codes 2-10 are single ages (17 to 25), codes 11-19 are five-year ranges
age_labels = {1: 'Under 17 years old', 20: 'Over 70 years old'}
age_labels.update({code: f'{code + 15} years old' for code in range(2, 11)})
age_labels.update({
    code: f'Between {5 * code - 29} and {5 * code - 25} years old'
    for code in range(11, 20)
})

# High School graduation year: code 1 is 2021 and each following code is one year earlier
graduation_year_labels = {0: 'Not informed', 16: 'Before 2007'}
graduation_year_labels.update({code: str(2022 - code) for code in range(1, 16)})

# The four exam areas share the same attendance codes
presence_labels = {0: 'Absent', 1: 'Present', 2: 'Disqualified'}

mappings = {
    'TP_FAIXA_ETARIA': age_labels,
    'TP_SEXO': {'M': 'Male', 'F': 'Female'},
    'TP_ESTADO_CIVIL': {
        0: 'Not informed',
        1: 'Single',
        2: 'Married/Living with partner',
        3: 'Divorced/Separated',
        4: 'Widowed',
    },
    'TP_COR_RACA': {
        0: 'Not declared',
        1: 'White',
        2: 'Black',
        3: 'Mixed (Pardo)',
        4: 'Asian',
        5: 'Indigenous',
        6: 'Information not available',
    },
    'TP_NACIONALIDADE': {
        0: 'Not informed',
        1: 'Brazilian',
        2: 'Naturalized Brazilian',
        3: 'Foreigner',
        4: 'Brazilian born abroad',
    },
    'TP_ST_CONCLUSAO': {
        1: 'High School completed',
        2: 'Completing High School in 2022',
        3: 'Completing High School after 2022',
        4: 'High School not completed and not attending',
    },
    'TP_ANO_CONCLUIU': graduation_year_labels,
    'TP_ESCOLA': {1: 'No response', 2: 'Public', 3: 'Private'},
    'TP_ENSINO': {1: 'Regular Education', 2: 'Special Education'},
    'IN_TREINEIRO': {1: 'Yes', 0: 'No'},
    'TP_DEPENDENCIA_ADM_ESC': {1: 'Federal', 2: 'State', 3: 'Municipal', 4: 'Private'},
    'TP_LOCALIZACAO_ESC': {1: 'Urban', 2: 'Rural'},
    'TP_SIT_FUNC_ESC': {1: 'Operating', 2: 'Suspended', 3: 'Extinct', 4: 'Extinct in previous years'},
    'TP_LINGUA': {0: 'English', 1: 'Spanish'},
    'TP_STATUS_REDACAO': {
        1: 'No issues',
        2: 'Nullified',
        3: 'Plagiarism of prompt text',
        4: 'Blank',
        6: 'Off-topic',
        7: 'Incorrect text type',
        8: 'Insufficient text',
        9: 'Disconnected part',
    },
}
for area in ('CN', 'CH', 'LC', 'MT'):
    mappings[f'TP_PRESENCA_{area}'] = dict(presence_labels)

# Translates the codes column by column; values without an entry (including NaN) are left untouched
df.replace(to_replace=mappings, inplace=True)
df

Unnamed: 0,NU_INSCRICAO,TP_FAIXA_ETARIA,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ENSINO,...,Q033B_hab,Q033C_hab,Q033D_hab,Q033E_hab,Q033F_hab,Q033G_hab,Q033H_hab,Q033I_hab,Q033J_hab,Q034_hab
0,210057943671,Between 41 and 45 years old,Male,Married/Living with partner,Black,Brazilian,High School completed,2020,No response,,...,,,,,,,,,,
1,210057516120,Between 41 and 45 years old,Male,Married/Living with partner,White,Brazilian,High School completed,Before 2007,No response,,...,,,,,,,,,,
2,210057280536,20 years old,Female,Single,Black,Brazilian,High School completed,2020,No response,,...,,,,,,,,,,
3,210055724397,21 years old,Male,Single,Mixed (Pardo),Brazilian,High School completed,2020,No response,,...,,,,,,,,,,C
4,210055097896,19 years old,Male,Not informed,Mixed (Pardo),Brazilian,High School completed,2021,No response,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3476103,210056389903,18 years old,Male,Single,White,Brazilian,Completing High School in 2022,Not informed,Public,Regular Education,...,,,,,,,,,,
3476104,210057205644,Between 41 and 45 years old,Female,Married/Living with partner,White,Brazilian,Completing High School in 2022,Not informed,Public,,...,,,,,,,,,,
3476105,210056699189,17 years old,Male,Single,White,Brazilian,Completing High School in 2022,Not informed,Public,Regular Education,...,,,,,,,,,,
3476106,210056983033,18 years old,Male,Single,Mixed (Pardo),Brazilian,Completing High School in 2022,Not informed,Private,Regular Education,...,,,,,,,,,,


In [6]:
# Answer -> label tables for the two questionnaires, applied to `df` at the end of the cell.
# Only columns that were kept by the column selection above get an entry: labels for
# columns that are not in `df` are silently ignored by `DataFrame.replace`.

# ---------------------------------------------------------------------------
# Socioeconomic questionnaire (Q001 .. Q025)
# ---------------------------------------------------------------------------

# Q001 and Q002 share the same schooling scale
schooling_labels = {
    'A': 'Never studied.',
    'B': 'Did not complete 4th grade/5th year of Elementary School.',
    'C': 'Completed 4th grade/5th year, but did not complete 8th grade/9th year of Elementary School.',
    'D': 'Completed 8th grade/9th year of Elementary School, but did not complete High School.',
    'E': 'Completed High School, but did not complete College.',
    'F': 'Completed College, but did not complete Graduate School.',
    'G': 'Completed Graduate School.',
    'H': 'Unknown.'
}

# Q003 and Q004 share the same occupation groups
occupation_labels = {
    'A': 'Group 1: Farmer, fisherman, extractive worker.',
    'B': 'Group 2: Domestic worker, security guard, doorman, salesperson, cashier.',
    'C': 'Group 3: Cook, factory worker, driver, electrician.',
    'D': 'Group 4: Teacher, technician, police officer, manager, micro-entrepreneur.',
    'E': 'Group 5: Doctor, engineer, lawyer, judge, company director.',
    'F': 'Unknown.'
}

# Mapping dictionary for categorical columns of the socioeconomic questionnaire
socio_mappings = {
    'Q001': dict(schooling_labels),
    'Q002': dict(schooling_labels),
    'Q003': dict(occupation_labels),
    'Q004': dict(occupation_labels),
    'Q005': {i: str(i) for i in range(1, 21)},  # Number of people living in the household
    'Q006': {
        'A': 'No income',
        'B': 'Up to R$ 1,212.00',
        'C': 'From R$ 1,212.01 to R$ 1,818.00.',
        'D': 'From R$ 1,818.01 to R$ 2,424.00.',
        'E': 'From R$ 2,424.01 to R$ 3,030.00.',
        'F': 'From R$ 3,030.01 to R$ 3,636.00.',
        'G': 'From R$ 3,636.01 to R$ 4,848.00.',
        'H': 'From R$ 4,848.01 to R$ 6,060.00.',
        'I': 'From R$ 6,060.01 to R$ 7,272.00.',
        'J': 'From R$ 7,272.01 to R$ 8,484.00.',
        'K': 'From R$ 8,484.01 to R$ 9,696.00.',
        'L': 'From R$ 9,696.01 to R$ 10,908.00.',
        'M': 'From R$ 10,908.01 to R$ 12,120.00.',
        'N': 'From R$ 12,120.01 to R$ 14,544.00.',
        'O': 'From R$ 14,544.01 to R$ 18,180.00.',
        'P': 'From R$ 18,180.01 to R$ 24,240.00.',
        'Q': 'Above R$ 24,240.00.'
    },
    'Q007': {
        'A': 'No.',
        'B': 'Yes, one or two days a week.',
        'C': 'Yes, three or four days a week.',
        'D': 'Yes, at least five days a week.'
    }
}

# Maps consumer goods questions (Q008 to Q025).
# The range stops at Q025 because it is the last socioeconomic column kept in `df`;
# the previous labels for Q026..Q036 targeted columns that do not exist and were never applied.
for i in range(8, 26):
    socio_mappings[f'Q{i:03d}'] = {
        'A': 'No.',
        'B': 'Yes.',
        'C': 'Yes, two.',
        'D': 'Yes, three.',
        'E': 'Yes, four or more.'
    }

# ---------------------------------------------------------------------------
# Study habits questionnaire ('TP_RESPOSTA' and Q001_hab .. Q034_hab)
# ---------------------------------------------------------------------------

# Kept as a standalone name for backward compatibility; applied through `habits_mappings`
tp_response_mapping = {
    0: 'No (Participant opted not to answer the questionnaire).',
    1: 'Yes (Participant opted to answer the questionnaire).',
    2: 'Answer later'
}

yes_no_labels = {'A': 'Yes.', 'B': 'No.'}
frequency_labels = {
    'A': 'Never.',
    'B': 'A few times.',
    'C': 'Many times.',
    'D': 'Every time.'
}

# Mapping dictionary for categorical columns of the study habits questionnaire
habits_mappings = {
    # The column is named 'TP_RESPOSTA' (no '_hab' suffix): only questionnaire columns
    # starting with 'Q' were renamed when the files were loaded
    'TP_RESPOSTA': tp_response_mapping,
    'Q001_hab': {
        'A': 'I was enrolled in regular High School.',
        'B': 'I was enrolled in High School, in an EJA (Adult Education) class.',
        'C': 'I was enrolled in High School integrated with Professional Education.',
        'D': 'I completed High School before 2021.',
        'E': 'I did not complete High School and was not enrolled during this period.'
    },
    'Q002_hab': {
        'A': 'I continued my studies without interrupting the school year.',
        'B': 'I decided to interrupt my studies in the first year of the pandemic, but re-enrolled in 2021.',
        'C': 'I was forced to interrupt my studies in the first year of the pandemic due to lack of school support, but re-enrolled in 2021.'
    },
    'Q003_hab': {
        'A': 'I only had in-person classes during this period.',
        'B': 'I only studied at home remotely during this period.',
        'C': 'I studied in a hybrid format, both at home remotely and in-person classes.',
        'D': 'I performed various study activities, but without being enrolled in High School.',
        'E': 'I was not regularly enrolled, nor did I perform study activities.'
    },
    'Q004_hab': {
        'A': 'I learned more through remote learning (studying only at home).',
        'B': 'I learned more through hybrid learning.',
        'C': 'I learned more through in-person learning.',
        'D': 'I learned as much through remote or hybrid learning as I did in-person.',
        'E': 'I was not enrolled in 2021, but I learned by studying on my own.',
        'F': 'I was not enrolled in 2021 and did not study.'
    },
}

# Q005_hab .. Q024_hab use the frequency scale
for i in range(5, 25):
    habits_mappings[f'Q{i:03d}_hab'] = dict(frequency_labels)

# Yes/No questions split into lettered sub-items. The letters mirror the columns kept
# in `df`; in particular Q028 goes up to 'R', so 'Q028R_hab' is decoded as well.
yes_no_sub_items = {
    25: 'ABCDEF',
    26: 'ABCDEFGH',
    28: 'ABCDEFGHIJKLMNOPQR',
    30: 'ABCDEFG',
    32: 'ABCDEFG',
    33: 'ABCDEFGHIJ',
}
for i, letters in yes_no_sub_items.items():
    for sub in letters:
        habits_mappings[f'Q{i:03d}{sub}_hab'] = dict(yes_no_labels)

# Single-column Yes/No questions
for i in (27, 29):
    habits_mappings[f'Q{i:03d}_hab'] = dict(yes_no_labels)

habits_mappings['Q031_hab'] = {
    'A': 'Yes.',
    'B': 'No.',
    'C': 'I needed help to study, but had no one to assist me.'
}
habits_mappings['Q034_hab'] = {
    'A': 'Not prepared at all.',
    'B': 'A little prepared.',
    'C': 'Well prepared.',
    'D': 'Very prepared.',
    'E': 'Fully prepared.'
}

# Applies mappings.
# Both calls act on the whole frame: a chained `df[col].replace(..., inplace=True)` would
# only modify a temporary Series under pandas Copy-on-Write and leave `df` unchanged.
# `inplace=True` is kept here to avoid holding a second copy of this large frame.
df.replace(socio_mappings, inplace=True)
df.replace(habits_mappings, inplace=True)
df

Unnamed: 0,NU_INSCRICAO,TP_FAIXA_ETARIA,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ENSINO,...,Q033B_hab,Q033C_hab,Q033D_hab,Q033E_hab,Q033F_hab,Q033G_hab,Q033H_hab,Q033I_hab,Q033J_hab,Q034_hab
0,210057943671,Between 41 and 45 years old,Male,Married/Living with partner,Black,Brazilian,High School completed,2020,No response,,...,,,,,,,,,,
1,210057516120,Between 41 and 45 years old,Male,Married/Living with partner,White,Brazilian,High School completed,Before 2007,No response,,...,,,,,,,,,,
2,210057280536,20 years old,Female,Single,Black,Brazilian,High School completed,2020,No response,,...,,,,,,,,,,
3,210055724397,21 years old,Male,Single,Mixed (Pardo),Brazilian,High School completed,2020,No response,,...,,,,,,,,,,Well prepared.
4,210055097896,19 years old,Male,Not informed,Mixed (Pardo),Brazilian,High School completed,2021,No response,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3476103,210056389903,18 years old,Male,Single,White,Brazilian,Completing High School in 2022,Not informed,Public,Regular Education,...,,,,,,,,,,
3476104,210057205644,Between 41 and 45 years old,Female,Married/Living with partner,White,Brazilian,Completing High School in 2022,Not informed,Public,,...,,,,,,,,,,
3476105,210056699189,17 years old,Male,Single,White,Brazilian,Completing High School in 2022,Not informed,Public,Regular Education,...,,,,,,,,,,
3476106,210056983033,18 years old,Male,Single,Mixed (Pardo),Brazilian,Completing High School in 2022,Not informed,Private,Regular Education,...,,,,,,,,,,


In [7]:
# Persists the preprocessed dataframe so the next stage (EDA) can load it.
# Written as a ';'-separated, ISO-8859-1 encoded file without the index column.
prep_output_file = '../data/df_prep.csv'
df.to_csv(prep_output_file, index=False, sep=';', encoding='iso-8859-1')