## Setup

Configuração do ambiente de desenvolvimento para realização do ETL (Extract Transform Load)

### Instalando bibliotecas

Instalando todas as bibliotecas necessárias e criação do ambiente spark

In [2]:
import os
import requests
import shutil
from zipfile import ZipFile

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
import numpy as np

# Create a Spark session
spark = SparkSession.builder.master("local").config("spark.executor.memory", "6g") \
    .config("spark.driver.memory", "6g") \
    .config("spark.driver.maxResultSize", "6g") \
    .appName("PySpark Tutorial").getOrCreate()

# Verify Spark version
print("Spark version: ", spark.version)

BASE_DOWNLOAD_URL_MICRO_DATA = 'https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_PNAD_COVID19/Microdados/Dados/PNAD_COVID_{month}{year}.zip'
BASE_DOWNLOAD_URL_GLOSSARY = 'https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_PNAD_COVID19/Microdados/Dados/PNAD_COVID_{month}{year}.zip'

INPUT_PATH = '/home/lucas-nunes/workspace/Postech/challenges/3_covid19/data/input/micro_data'

SILVER_PATH = '/home/lucas-nunes/workspace/Postech/challenges/3_covid19/data/silver'
BRONZE_PATH = '/home/lucas-nunes/workspace/Postech/challenges/3_covid19/data/bronze/data'

BASE_DOWNLOAD_PATH = '/home/lucas-nunes/workspace/Postech/challenges/3_covid19/data/input'

LIST_AVAILABLE_MONTHS = ['05', '06', '07', '08', '09', '10', '11']
LIST_AVAILABLE_YEARS = ['2020']

Spark version:  3.5.4


## Download

Download e organização das pastas em formato que possam ser utilizadas como partições spark


In [3]:
def download_file(url, path):
    local_filename = path
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                f.write(chunk)
    return local_filename


def extract_file(zip_path, path_extract):
    ZipFile(zip_path).extractall(path_extract)

def process_micro_data(prefix_year, prefix_month, extract_path_name):

    base_download_path_zip = os.path.join(BASE_DOWNLOAD_PATH, 'zip')

    for year in LIST_AVAILABLE_YEARS:
        for month in LIST_AVAILABLE_MONTHS:
            url_download = BASE_DOWNLOAD_URL_MICRO_DATA.format(year=year, month=month)
            path_download = os.path.join(base_download_path_zip, f'{prefix_year}={year}', f'{prefix_month}={month}')
            path_extract = os.path.join(BASE_DOWNLOAD_PATH, extract_path_name, f'{prefix_year}={year}', f'{prefix_month}={month}')
            file_path_download = os.path.join(path_download, f'{month}-{year}.zip')
            if not os.path.isdir(path_download): os.makedirs(path_download)
            if not os.path.isdir(path_extract): os.makedirs(path_extract)
            download_file(url_download, file_path_download)
            extract_file(file_path_download, path_extract)

    shutil.rmtree(base_download_path_zip)

def process_glossary_data():
    path_extract = os.path.join(BASE_DOWNLOAD_PATH, 'glossary.xlsx')
    url_download = BASE_DOWNLOAD_URL_MICRO_DATA.format(year='2020', month='11') # Usando ultimo mes, uma vez que é o dicionario mais completo
    download_file(url_download, path_extract)

prefix_year = 'ano_part'
prefix_month = 'mes'
extract_path_name = 'micro_data'

process_micro_data(prefix_year, prefix_month, extract_path_name)
process_glossary_data() # Site instável, aguardando retorno para pegar url correta

25/05/17 23:41:50 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


## Pre Processamento

Uma vez que os arquivos bases são grandes, para evitar esforço computacional e consumo de disco desnecessário, Estamos reescrevendo os dados Bronze para parquet, com somente os meses necessários (Necessário rodar somente uma vez, para gerar `bronze.parquet`)

In [4]:


df = spark.read.csv(INPUT_PATH, header=True)

df.write.format('parquet').mode("overwrite").partitionBy(['ano_part', 'mes']).save(BRONZE_PATH)


# df = spark.read.parquet(BRONZE_PATH, header=True)
# df.where(col('mes') >= 9).count()

25/05/17 23:42:02 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/05/17 23:42:20 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 148, schema size: 145
CSV file: file:///home/lucas-nunes/workspace/Postech/challenges/3_covid19/data/input/micro_data/ano_part=2020/mes=11/PNAD_COVID_112020.csv
25/05/17 23:42:44 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 114, schema size: 145
CSV file: file:///home/lucas-nunes/workspace/Postech/challenges/3_covid19/data/input/micro_data/ano_part=2020/mes=06/PNAD_COVID_062020.csv
25/05/17 23:43:05 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 114, schema size: 145
CSV file: file:///home/lucas-nunes/workspace/Postech/challe