#### Install required modules

In [None]:
%pip install pandas
%pip install boto3

#### Import required modules

In [2]:
import boto3
import os
import pandas as pd
import urllib.request
import warnings

#### Ignore warnings

In [3]:
warnings.filterwarnings("ignore")

#### Create the necessary folders to manipulate the files

In [4]:
!echo "" 
!echo "Delete _datasets folder if exists."
%rm -rf _datasets
!echo "Create _datasets _datasets/csv _datasets/parquet _datasets/zipped."
%mkdir _datasets
%mkdir _datasets/csv
%mkdir _datasets/parquet
%mkdir _datasets/zipped


Delete _datasets folder if exists.
Create _datasets _datasets/csv _datasets/parquet _datasets/zipped.


#### Build a function to extract ZIP from URL

In [5]:
def extract_data_from_enem(year : str) -> bool:
    """
    Extract data about Exame Nacional do Ensino Médio (ENEM) during an specific given period.

    Args: 
        year (str): given period.

    Returns:
        0 if any error has been generated during downloading process. 
        1 if no error has been generated during downloading process.
    """
    try:
        url = "https://download.inep.gov.br/microdados/microdados_enem_{year}.zip".format(year=year)
        output = "_datasets/zipped/micro-dados-enem-{year}.zip".format(year=year)
        print(f"\nDownloading zipped file from {url}...")
        urllib.request.urlretrieve(url, output)
        print(f"Zipped file wrote in {output}.")
        return True
    except:
        print("\nError during zipped file downloading proccess.")
        return False

#### Download the ZIP file using the function created above

In [6]:
[extract_data_from_enem(year=year) for year in ["2020", "2021", "2022", "2023"]]


Downloading zipped file from https://download.inep.gov.br/microdados/microdados_enem_2020.zip...
Zipped file wrote in _datasets/zipped/micro-dados-enem-2020.zip.

Downloading zipped file from https://download.inep.gov.br/microdados/microdados_enem_2021.zip...
Zipped file wrote in _datasets/zipped/micro-dados-enem-2021.zip.

Downloading zipped file from https://download.inep.gov.br/microdados/microdados_enem_2022.zip...
Zipped file wrote in _datasets/zipped/micro-dados-enem-2022.zip.

Downloading zipped file from https://download.inep.gov.br/microdados/microdados_enem_2023.zip...
Zipped file wrote in _datasets/zipped/micro-dados-enem-2023.zip.


[True, True, True, True]

#### Unzip the CSV file from ZIP

In [7]:
%%bash
for YEAR in {2020..2023}
do
    echo ""
    echo "Unzipping file micro-dados-enem-$YEAR.zip from _datasets/zipped..."
    unzip -qq "_datasets/zipped/micro-dados-enem-$YEAR.zip"
    echo "Moving file micro-dados-enem-$YEAR.csv to _datasets/csv..."
    mv "DADOS/MICRODADOS_ENEM_$YEAR.csv" "_datasets/csv/micro-dados-enem-$YEAR.csv"
    echo "CSV file moved to _datasets/csv/micro-dados-enem-$YEAR.csv."
    rm -rf "DADOS" "DICIONÁRIO" "INPUTS" "LEIA-ME E DOCUMENTOS TÉCNICOS" "PROVAS E GABARITOS"
    clear
done


Unzipping file micro-dados-enem-2020.zip from _datasets/zipped...
Moving file micro-dados-enem-2020.csv to _datasets/csv...
CSV file moved to _datasets/csv/micro-dados-enem-2020.csv.
[H[2J
Unzipping file micro-dados-enem-2021.zip from _datasets/zipped...
Moving file micro-dados-enem-2021.csv to _datasets/csv...
CSV file moved to _datasets/csv/micro-dados-enem-2021.csv.
[H[2J
Unzipping file micro-dados-enem-2022.zip from _datasets/zipped...
Moving file micro-dados-enem-2022.csv to _datasets/csv...
CSV file moved to _datasets/csv/micro-dados-enem-2022.csv.
[H[2J
Unzipping file micro-dados-enem-2023.zip from _datasets/zipped...
Moving file micro-dados-enem-2023.csv to _datasets/csv...
CSV file moved to _datasets/csv/micro-dados-enem-2023.csv.
[H[2J

#### Transforming the CSV to PARQUET files

In [8]:
csvFiles = sorted(os.listdir("_datasets/csv"))

for file in csvFiles:
    print(f"\nReading CSV file {file} from _datasets/csv...")
    chunks = pd.read_csv(f"_datasets/csv/{file}", delimiter=";", encoding="latin-1", chunksize=1_000_000)
    for i, chunk in enumerate(chunks):
        print(f"Transforming {file} to {str(file)[:21]}-0{i}.parquet...")
        chunk.to_parquet(f"_datasets/parquet/{str(file)[:21]}-0{i}.parquet", engine="pyarrow", compression="snappy", index=False)
    print(f"Parquet file wrote in _datasets/parquet/{str(file)[:21]}.parquet.")


Reading CSV file micro-dados-enem-2020.csv from _datasets/csv...
Transforming micro-dados-enem-2020.csv to micro-dados-enem-2020-00.parquet...
Transforming micro-dados-enem-2020.csv to micro-dados-enem-2020-01.parquet...
Transforming micro-dados-enem-2020.csv to micro-dados-enem-2020-02.parquet...
Transforming micro-dados-enem-2020.csv to micro-dados-enem-2020-03.parquet...
Transforming micro-dados-enem-2020.csv to micro-dados-enem-2020-04.parquet...
Transforming micro-dados-enem-2020.csv to micro-dados-enem-2020-05.parquet...
Parquet file wrote in _datasets/parquet/micro-dados-enem-2020.parquet.

Reading CSV file micro-dados-enem-2021.csv from _datasets/csv...
Transforming micro-dados-enem-2021.csv to micro-dados-enem-2021-00.parquet...
Transforming micro-dados-enem-2021.csv to micro-dados-enem-2021-01.parquet...
Transforming micro-dados-enem-2021.csv to micro-dados-enem-2021-02.parquet...
Transforming micro-dados-enem-2021.csv to micro-dados-enem-2021-03.parquet...
Parquet file wrot