#### Install required modules

In [None]:
# Install the third-party packages imported by the next cell.
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): no versions are pinned, so a fresh run may pull newer releases.
%pip install boto3
%pip install pandas
%pip install pyarrow

#### Import required modules

In [1]:
import boto3
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import urllib.request
import warnings

#### Ignore warnings

In [2]:
# Silence every Python warning for the rest of the session: no category,
# module or message filter is given, so nothing is exempt.
warnings.filterwarnings("ignore")

#### Create the necessary folders to manipulate the files

In [3]:
%%bash
# Rebuild the local workspace from scratch: any previous _datasets tree is
# removed, then the root folder and its three stage folders are created.
echo ""
echo "Delete _datasets folder if exists."
rm -rf _datasets

echo "Create _datasets _datasets/csv _datasets/parquet _datasets/zipped."
mkdir _datasets
for subfolder in csv parquet zipped
do
    mkdir "_datasets/$subfolder"
done


Delete _datasets folder if exists.
Create _datasets _datasets/csv _datasets/parquet _datasets/zipped.


#### Build a function to download the ENEM ZIP file from its URL

In [4]:
def extract_data_from_enem(year : str) -> bool:
    """
    Download the Exame Nacional do Ensino Médio (ENEM) microdata ZIP file of a given year.

    The archive is fetched from download.inep.gov.br and stored as
    _datasets/zipped/micro-dados-enem-{year}.zip. The _datasets/zipped folder
    must already exist.

    Args: 
        year (str): edition of the exam to download, e.g. "2023".

    Returns:
        False if any error has been generated during downloading process. 
        True if no error has been generated during downloading process.
    """
    url = f"https://download.inep.gov.br/microdados/microdados_enem_{year}.zip"
    output = f"_datasets/zipped/micro-dados-enem-{year}.zip"
    # Download next to the final name first, so an interrupted transfer never
    # leaves a truncated file under the name the later steps look for.
    partialOutput = f"{output}.part"

    try:
        print(f"\nDownloading zipped file from {url}...")
        urllib.request.urlretrieve(url, partialOutput)
        os.replace(partialOutput, output)
        print(f"Zipped file wrote in {output}.")
        return True
    except Exception as error:
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, so the user can still stop a long download.
        print(f"\nError during zipped file downloading proccess: {error}")
        return False
    finally:
        # After a successful os.replace the partial file no longer exists;
        # in every other case remove whatever was written so far.
        if os.path.exists(partialOutput):
            os.remove(partialOutput)

#### Download the ZIP file using the function created above

In [5]:
# Download one archive per ENEM edition. The list of True/False results is the
# cell's displayed value, one entry per year in the order given.
[extract_data_from_enem(year=year) for year in ["2020", "2021", "2022", "2023"]]


Downloading zipped file from https://download.inep.gov.br/microdados/microdados_enem_2020.zip...
Zipped file wrote in _datasets/zipped/micro-dados-enem-2020.zip.

Downloading zipped file from https://download.inep.gov.br/microdados/microdados_enem_2021.zip...
Zipped file wrote in _datasets/zipped/micro-dados-enem-2021.zip.

Downloading zipped file from https://download.inep.gov.br/microdados/microdados_enem_2022.zip...
Zipped file wrote in _datasets/zipped/micro-dados-enem-2022.zip.

Downloading zipped file from https://download.inep.gov.br/microdados/microdados_enem_2023.zip...
Zipped file wrote in _datasets/zipped/micro-dados-enem-2023.zip.


[True, True, True, True]

#### Unzip the CSV file from ZIP

In [6]:
%%bash
# Extract DADOS/MICRODADOS_ENEM_<year>.csv from every downloaded archive and
# store it as _datasets/csv/micro-dados-enem-<year>.csv.

# Without nullglob an empty folder would hand the literal pattern to the loop.
shopt -s nullglob

failures=0

for archive in _datasets/zipped/micro-dados-enem-*.zip
do
    # micro-dados-enem-2020.zip -> 2020
    year=$(basename "$archive" .zip)
    year=${year#micro-dados-enem-}

    echo ""
    echo "Unzipping file micro-dados-enem-$year.zip from _datasets/zipped..."
    # -o: overwrite leftovers from an earlier run instead of prompting, which
    # cannot be answered from a notebook cell.
    unzip -qq -o "$archive"

    echo "Moving file micro-dados-enem-$year.csv to _datasets/csv..."
    # Success is judged by the move, not by unzip's exit status.
    if mv "DADOS/MICRODADOS_ENEM_$year.csv" "_datasets/csv/micro-dados-enem-$year.csv"
    then
        echo "CSV file moved to _datasets/csv/micro-dados-enem-$year.csv."
    else
        echo "Error: DADOS/MICRODADOS_ENEM_$year.csv was not extracted from micro-dados-enem-$year.zip." >&2
        failures=$((failures + 1))
    fi

    # Drop everything else the archive unpacked into the working directory.
    rm -rf "DADOS" "DICIONÁRIO" "INPUTS" "LEIA-ME E DOCUMENTOS TÉCNICOS" "PROVAS E GABARITOS"
done

# A non-zero exit makes the notebook cell fail visibly when any year failed.
if [ "$failures" -gt 0 ]
then
    exit 1
fi


Unzipping file micro-dados-enem-2020.zip from _datasets/zipped...
Moving file micro-dados-enem-2020.csv to _datasets/csv...
CSV file moved to _datasets/csv/micro-dados-enem-2020.csv.
[H[2J
Unzipping file micro-dados-enem-2021.zip from _datasets/zipped...
Moving file micro-dados-enem-2021.csv to _datasets/csv...
CSV file moved to _datasets/csv/micro-dados-enem-2021.csv.
[H[2J
Unzipping file micro-dados-enem-2022.zip from _datasets/zipped...
Moving file micro-dados-enem-2022.csv to _datasets/csv...
CSV file moved to _datasets/csv/micro-dados-enem-2022.csv.
[H[2J
Unzipping file micro-dados-enem-2023.zip from _datasets/zipped...
Moving file micro-dados-enem-2023.csv to _datasets/csv...
CSV file moved to _datasets/csv/micro-dados-enem-2023.csv.
[H[2J

#### Convert the CSV files to Parquet files

In [7]:
def write_parquet(data : pd.DataFrame, filePath : str) -> None:
    """
    Store a dataframe in a snappy-compressed parquet file, keeping any rows already there.

    Every column is cast to str before the conversion to an Arrow table. When
    filePath already exists, its whole content is read back, the new rows are
    concatenated after it and the file is written again in full.

    Args: 
        data (pd.DataFrame): rows to store.
        filePath (str): path of the parquet file to create or extend.
    """
    table = pa.Table.from_pandas(data.astype(str))

    if os.path.exists(filePath):
        # Existing rows come first, so the on-disk order follows call order.
        table = pa.concat_tables([pq.read_table(filePath), table])

    pq.write_table(table, filePath, compression="snappy")

In [8]:
def convert_csv_to_parquet(csvFilePath : str, parquetFilePath : str, chunksize : int = 1_000_000) -> bool:
    """
    Read a CSV file in chunks and stream every chunk into a single parquet file.

    The CSV is read with ";" as delimiter and latin-1 encoding. Each chunk is
    cast to str and written as it is read through one pyarrow ParquetWriter,
    so the file is written once and only one chunk is held in memory at a time.
    A parquet file already present at parquetFilePath is replaced, not
    appended to, which keeps repeated runs from duplicating rows. If the
    conversion fails, the partially written parquet file is deleted.

    Args: 
        csvFilePath (str): path of the csv file to read.
        parquetFilePath (str): path of the parquet file to write.
        chunksize (int): number of csv rows converted per step.

    Returns:
        False if any error has been generated during converting process. 
        True if no error has been generated during converting process.
    """
    writer = None
    outputTouched = False

    try:
        print(f"\nReading {csvFilePath} from _datasets/csv...")
        # The context manager closes the CSV handle even when a chunk fails.
        with pd.read_csv(csvFilePath, delimiter=";", encoding="latin-1", chunksize=chunksize) as chunks:
            # Only discard an older output once the CSV has been opened.
            if os.path.exists(parquetFilePath):
                os.remove(parquetFilePath)
            outputTouched = True

            try:
                for i, chunk in enumerate(chunks):
                    print(f"Converting the {i * chunksize} lines from csv to parquet...")
                    # Casting to str gives every chunk the same all-string schema.
                    # NOTE(review): astype(str) also stringifies missing values
                    # (likely as "nan") - confirm downstream consumers expect it.
                    table = pa.Table.from_pandas(chunk.astype(str), preserve_index=False)
                    if writer is None:
                        # The first chunk defines the schema of the whole file.
                        writer = pq.ParquetWriter(parquetFilePath, table.schema, compression="snappy")
                    writer.write_table(table)
            finally:
                # Closing the writer finalizes the file.
                if writer is not None:
                    writer.close()

        print(f"Parquet file wrote in {parquetFilePath}.")
        return True
    except Exception as e:
        print(f"Error {e} during CSV to parquet conversion.")
        # Never leave a half-written file where a later step could pick it up.
        if outputTouched and os.path.exists(parquetFilePath):
            os.remove(parquetFilePath)
        return False

In [9]:
# Convert the CSV of each ENEM edition to its own parquet file. The list of
# True/False results is the cell's displayed value, one entry per year.
[convert_csv_to_parquet(f"_datasets/csv/micro-dados-enem-{year}.csv", f"_datasets/parquet/micro-dados-enem-{year}.parquet") for year in ["2020", "2021", "2022", "2023"]]


Reading _datasets/csv/micro-dados-enem-2020.csv from _datasets/csv...
Converting the 0 lines from csv to parquet...
Converting the 1000000 lines from csv to parquet...
Converting the 2000000 lines from csv to parquet...
Converting the 3000000 lines from csv to parquet...
Converting the 4000000 lines from csv to parquet...
Converting the 5000000 lines from csv to parquet...
Parquet file wrote in _datasets/parquet/micro-dados-enem-2020.parquet.

Reading _datasets/csv/micro-dados-enem-2021.csv from _datasets/csv...
Converting the 0 lines from csv to parquet...
Converting the 1000000 lines from csv to parquet...
Converting the 2000000 lines from csv to parquet...
Converting the 3000000 lines from csv to parquet...
Parquet file wrote in _datasets/parquet/micro-dados-enem-2021.parquet.

Reading _datasets/csv/micro-dados-enem-2022.csv from _datasets/csv...
Converting the 0 lines from csv to parquet...
Converting the 1000000 lines from csv to parquet...
Converting the 2000000 lines from csv t

[True, True, True, True]

#### Connect to AWS using boto3

In [10]:
# Credentials are read from the environment so they never appear in the notebook.
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
AWS_REGION = "sa-east-1"

# Configure boto3's default session; the S3 client created below uses it.
boto3.setup_default_session(
    region_name = AWS_REGION,
    aws_access_key_id = AWS_ACCESS_KEY,
    aws_secret_access_key = AWS_SECRET_KEY
)

s3 = boto3.client("s3")

In [11]:
# Upload every parquet file under _datasets/parquet to the "sor/" prefix of the
# bucket named by the AWS_BUCKET_NAME environment variable.
AWS_BUCKET_NAME = os.environ.get("AWS_BUCKET_NAME")
if not AWS_BUCKET_NAME:
    # Stop before the first upload with a message that names the real cause,
    # instead of passing None to boto3 as the bucket.
    raise RuntimeError("Environment variable AWS_BUCKET_NAME is not set.")

# os.listdir returns every entry in the folder; keep only the parquet outputs
# so stray files (e.g. OS metadata files) are not uploaded.
parquetFiles = sorted(
    fileName for fileName in os.listdir("_datasets/parquet") if fileName.endswith(".parquet")
)

for parquet in parquetFiles:
    print(f"\nUploading {parquet} from _datasets/parquet to AWS bucket...")
    s3.upload_file(
        Filename = f"_datasets/parquet/{parquet}",
        Bucket = AWS_BUCKET_NAME,
        Key = f"sor/{parquet}"
    )
    print(f"Parquet file {parquet} uploaded.")


Uploading micro-dados-enem-2020.parquet from _datasets/parquet to AWS bucket...
Parquet file micro-dados-enem-2020.parquet uploaded.

Uploading micro-dados-enem-2021.parquet from _datasets/parquet to AWS bucket...
Parquet file micro-dados-enem-2021.parquet uploaded.

Uploading micro-dados-enem-2022.parquet from _datasets/parquet to AWS bucket...
Parquet file micro-dados-enem-2022.parquet uploaded.

Uploading micro-dados-enem-2023.parquet from _datasets/parquet to AWS bucket...
Parquet file micro-dados-enem-2023.parquet uploaded.
