In [1]:
# Necessário possuir a biblioteca DuckDB, holidays e babel instaladas.
!pip install duckdb
!pip install babel
!pip install holidays
!pip install gdown



In [2]:
import sys

# Getting Github files if this notebook is executed in a Google Colab environment.
if 'google.colab' in sys.modules:
  temp_folder = "etl_bank"

  !git clone -b 'dev' 'https://github.com/jpclarindo/etl_bank.git' $temp_folder
  !rsync -a $temp_folder/ .
  !rm -rf $temp_folder


Cloning into 'etl_bank'...
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 21 (delta 5), reused 12 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (21/21), 9.16 KiB | 9.16 MiB/s, done.
Resolving deltas: 100% (5/5), done.


Important! Google Drive Link for data source is required to execute this notebook. Please, put the link in gdrive_link.txt at the root.

In [3]:
import pandas as pd
import duckdb
import src.utils as utils

# Libraries to create date dimension
from datetime import datetime, timedelta
from babel.dates import format_date, format_datetime, format_time
import holidays

# Auxiliar libraries
import time
import json
import os, glob

In [12]:
# Init DuckDB connection.
con = duckdb.connect(database='database.duckdb', read_only=False)

# Creating cleaning function mapping for each attribute
cleaning_mapping = json.load(open('src/function_mapping.json','r',encoding='utf-8'))

# Creating date mapping
date_mapping = json.load(open('src/date_mapping.json','r',encoding='utf-8'))

Creating time and date dimension tables.

In [5]:
# @title
def create_time_dimension_table():
  time_list = []
  time_of_day = ['Early Morning', 'Morning', 'Afternoon', 'Night']
  time_of_day_pt = ['Madrugada', 'Manhã', 'Tarde', 'Noite']

  for i in range(86400):
    min_sec = i % 3600

    time_dict = {'tempo_id': i+1,
                'hora': i // 3600,
                'minuto': min_sec // 60,
                'segundo': min_sec % 60
                }

    time_dict['periodo_en'] = time_of_day[time_dict['hora'] // 6]
    time_dict['periodo_pt'] = time_of_day_pt[time_dict['hora'] // 6]
    time_dict['formatado'] = f"{time_dict['hora']:02d}:{time_dict['minuto']:02d}:{time_dict['segundo']:02d}"

    time_list.append(time_dict)


  time_df = pd.DataFrame(time_list)
  con.execute("""CREATE TABLE IF NOT EXISTS d_tempo
                 AS SELECT * FROM time_df""")

  print('Time dimension created and inserted in the database')
  return True

In [6]:
# @title
def create_date_dimension_table():
  date_list = []
  start_date =  datetime(1950, 1, 1)
  end_date = datetime(2050, 12, 31)
  delta = timedelta(days=1)
  count = 0

  while start_date <= end_date:
    count += 1

    date_dict = {'data_id': count,
                'formatado': start_date.strftime('%Y-%m-%d'),
                'ano': start_date.year,
                'trimestre': format_date(start_date,'Q'),
                'nome_trimestre_en': format_date(start_date, format='QQQQ', locale='en_US'),
                'nome_trimestre_pt': format_date(start_date, format='QQQQ', locale='pt_BR'),
                'mes': start_date.month,
                'nome_mes_en': format_date(start_date, format='MMMM', locale='en_US'),
                'nome_mes_pt': format_date(start_date, format='MMMM', locale='pt_BR'),
                'dia': start_date.day,
                'nome_dia_en': format_date(start_date, format='EEEE', locale='en_US'),
                'nome_dia_pt': format_date(start_date, format='EEEE', locale='pt_BR'),
                'fim_de_semana': 1 if start_date.weekday() in [5, 6] else 0,
                'feriado': 1 if start_date.strftime('%Y-%m-%d') in holidays.Brazil() else 0
    }

    print(date_dict)

    start_date += delta
    date_list.append(date_dict)

  date_df = pd.DataFrame(date_list)
  con.execute("""CREATE TABLE IF NOT EXISTS d_data
                 AS SELECT * FROM date_df""")


ETL Processing - Extraction

In [8]:
def get_raw_data():
  #
  utils.download_data()

  # Adding files through a dict
  dfs_raw = {}

  file_list = glob.glob('data/*.csv')

  for csv_file in file_list:
    table_name = os.path.basename(csv_file).replace('.csv','')
    date_columns = date_mapping.get(table_name,None)

    df = pd.read_csv(csv_file, sep=',', parse_dates=date_columns, encoding='utf-8')

    # Metadata for extraction
    df['_load_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df['_file_name'] = os.path.basename(csv_file)

    dfs_raw[table_name] = df

  return dfs_raw

dfs_raw = get_raw_data()

Downloading...
From: https://drive.google.com/uc?id=16jF2hJHVOwikmgoEKs9wARnrE_j6Y7e2
To: /content/data.zip
100%|██████████| 1.24M/1.24M [00:00<00:00, 154MB/s]


ETL - Cleaning

In [9]:
def cleaning_attributes(df):
    """
    Limpa um DataFrame dinamicamente baseando-se nas regras do JSON.
    """
    df_clean = df.copy()
    current_columns = set(df_clean.columns)

    for column, function_name in cleaning_mapping['multiple_attrs'].items():

        if column in current_columns:
            if hasattr(utils, function_name):
                func = getattr(utils, function_name)
                print(f"   -> Expanding column '{column}' usando '{function_name}'")

                # Aplica a função que retorna dict
                column_dicts = df_clean[column].apply(func)

                # Transforma a coluna de dicts em um novo DataFrame de colunas
                df_expanded = pd.json_normalize(column_dicts)

                # Concatena com o original (axis=1 coloca as colunas ao lado)
                df_clean = pd.concat([df_clean, df_expanded], axis=1)

                # (Opcional) Remove a coluna original suja para economizar espaço
                df_clean.drop(columns=[column], inplace=True)
            else:
                print(f"   ⚠️ Aviso: Função '' não encontrada")


    # print(f"⚙️ Processando tabela: {nome_tabela}")

    # # --- 1. Regras Simples (1 para 1) ---
    # # Ex: 'nome' -> clean_text(valor)
    # for coluna_alvo, nome_funcao in CONFIG_LIMPEZA['regras_simples'].items():

    #     if coluna_alvo in colunas_existentes:
    #         # A MÁGICA: getattr busca a função dentro de utils pelo nome (string)
    #         if hasattr(utils, nome_funcao):
    #             funcao_real = getattr(utils, nome_funcao)

    #             print(f"   -> Aplicando '{nome_funcao}' na coluna '{coluna_alvo}'")
    #             df_clean[coluna_alvo] = df_clean[coluna_alvo].apply(funcao_real)
    #         else:
    #             print(f"   ⚠️ Aviso: Função '{nome_funcao}' não encontrada em utils.py")

    # # --- 2. Regras de Expansão (1 para N) ---
    # # Ex: 'endereco' -> {rua: ..., cep: ...} -> Colunas novas


    return df_clean

In [14]:
cob = cleaning_attributes(dfs_raw['colaboradores'])
cob.head(5)

   -> Expanding column 'endereco' usando 'get_address_dict'
   ⚠️ Aviso: Função '' não encontrada


Unnamed: 0,cod_colaborador,primeiro_nome,ultimo_nome,email,cpf,data_nascimento,cep,_load_time,_file_name,numero,bairro,cep.1,cidade,estado
0,14,Paulo,Dias,melissalopes@example.net,847.210.695-02,1974-04-24,27275674,2025-12-22 22:32:01,colaboradores.csv,62.0,Zilah Sposito,37328273.0,Castro Paulista,PE
1,19,Luiz Fernando,Dias,pcunha@example.net,820.415.963-33,1994-07-10,50013962,2025-12-22 22:32:01,colaboradores.csv,82.0,Jardim Atlântico,27617762.0,Sales Verde,PB
2,32,Vitor Hugo,Dias,aragaonicolas@example.net,936.172.548-37,1986-12-18,08955-215,2025-12-22 22:32:01,colaboradores.csv,,,,N/D,NI
3,88,João Gabriel,Lima,luiz-otaviorezende@example.org,967.082.451-67,1994-10-19,24224-305,2025-12-22 22:32:01,colaboradores.csv,92.0,São Benedito,73615597.0,Moura da Mata,RR
4,39,Maria Eduarda,Melo,joao-pedro72@example.com,857.496.210-49,1996-03-01,96896390,2025-12-22 22:32:01,colaboradores.csv,65.0,Granja Werneck,67769059.0,da Rosa de da Cunha,RR
