In [None]:
# The DuckDB, babel, holidays and gdown libraries must be installed to run this notebook.
!pip install duckdb
!pip install babel
!pip install holidays
!pip install gdown



Important: a Google Drive link to the data source is required to run this notebook. Please set it in the `gdrive_link` variable below.

In [None]:
# Google Drive link to the zipped data source. get_raw_data() passes it to
# gdown.download(..., fuzzy=True) and saves the download as 'data.zip'.
gdrive_link = ''

In [None]:
import pandas as pd
import duckdb

# Libraries to create date dimension
from datetime import datetime, timedelta
from babel.dates import format_date, format_datetime, format_time
import holidays

# Auxiliary libraries
import re
import time
import gdown
import zipfile, os, glob

In [None]:
# Open the DuckDB connection (database file 'database.duckdb', opened read-write).
# `con` is the connection the dimension-table functions below execute their SQL on.

con = duckdb.connect(database='database.duckdb', read_only=False)

Auxiliary functions for cleaning data

In [None]:
# @title
# Collapse whitespace
def clean_text(text):
  """Return `text` with every run of whitespace collapsed to a single space.

  Leading and trailing whitespace is dropped as well, because the text is
  rebuilt from its whitespace-separated words.
  """
  words = text.split()
  return ' '.join(words)

# Bring a CEP to the digits-only NNNNNNNN form
def normalize_cep(cep):
    """Return `cep` as a digits-only string, left-padded with zeros to 8 characters.

    The value is converted with `str()` and every character other than 0-9 is
    removed. `None` is returned when `cep` is None, is an empty string, or
    contains no digits at all. Values with more than 8 digits are returned
    unpadded and untruncated.
    """
    is_missing = cep is None or cep == ''
    if is_missing:
        return None

    # Keep only the digits of the textual form of the value.
    digits_only = re.sub(r'[^0-9]', '', str(cep))

    return digits_only.zfill(8) if digits_only else None

# Split address to many fields
def get_address_dict(address):
  """Split a one-line address into its component fields.

  The address is expected to look like
  ``<street>, <number> <district> <CEP> <city> / <UF>``, where the CEP is
  written as ``NNNNN-NNN`` or ``NNNNNNNN`` and the UF is two uppercase letters.

  Args:
    address: Raw address text. `None`, an empty string, or any non-string
      value (for example the float NaN pandas uses for missing cells) is
      treated as a missing address.

  Returns:
    A new dict with the keys 'endereco', 'numero', 'bairro', 'cep', 'cidade'
    and 'estado'. When the address is missing or does not match the expected
    layout, the defaults are returned unchanged: None for the first four keys,
    'N/D' for 'cidade' and 'NI' for 'estado'.
  """
  # Defaults; a fresh dict is built on every call so callers may mutate it.
  address_dict = {
      'endereco': None,
      'numero': None,
      'bairro': None,
      'cep': None,
      'cidade': 'N/D',
      'estado': 'NI'
  }

  # re.search only accepts text, so anything that is not a non-empty string
  # is reported with the default values instead of raising.
  if not isinstance(address, str) or address == '':
    return address_dict

  address_match = re.search(r"(.+?), (\d+) (.+) (\d{5}-?\d{3}) (.+) / ([A-Z]{2})", address)

  if address_match:
    address_dict['endereco'] = clean_text(address_match.group(1))
    address_dict['numero'] = clean_text(address_match.group(2))
    address_dict['bairro'] = clean_text(address_match.group(3))
    address_dict['cep'] = normalize_cep(address_match.group(4))
    address_dict['cidade'] = clean_text(address_match.group(5))
    address_dict['estado'] = clean_text(address_match.group(6))

  return address_dict

Creating time and date dimension tables.

In [None]:
# @title
def create_time_dimension_table():
  """Build the time-of-day dimension and store it as the `d_tempo` table.

  One row is produced for each of the 86400 seconds of a day, with the columns
  tempo_id (1-based), hora, minuto, segundo, periodo_en, periodo_pt and
  formatado ('HH:MM:SS'). The period labels change every six hours. The table
  is created on the module-level connection `con` only if it does not exist yet.

  Returns:
    True after the statement has been executed.
  """
  period_labels_en = ['Early Morning', 'Morning', 'Afternoon', 'Night']
  period_labels_pt = ['Madrugada', 'Manhã', 'Tarde', 'Noite']

  rows = []
  for second_of_day in range(86400):
    hour, leftover = divmod(second_of_day, 3600)
    minute, second = divmod(leftover, 60)
    # Six-hour bucket: 0-5, 6-11, 12-17, 18-23.
    bucket = hour // 6

    rows.append({
        'tempo_id': second_of_day + 1,
        'hora': hour,
        'minuto': minute,
        'segundo': second,
        'periodo_en': period_labels_en[bucket],
        'periodo_pt': period_labels_pt[bucket],
        'formatado': f"{hour:02d}:{minute:02d}:{second:02d}",
    })

  # The SQL text below refers to this DataFrame by its variable name,
  # so the local must stay named `time_df`.
  time_df = pd.DataFrame(rows)
  con.execute("""CREATE TABLE IF NOT EXISTS d_tempo
                 AS SELECT * FROM time_df""")

  print('Time dimension created and inserted in the database')
  return True

In [None]:
# @title
def create_date_dimension_table():
  """Build the calendar dimension and store it as the `d_data` table.

  One row is produced per day from 1950-01-01 to 2050-12-31 (inclusive) with
  the columns data_id (1-based), formatado ('YYYY-MM-DD'), ano, trimestre,
  nome_trimestre_en/pt, mes, nome_mes_en/pt, dia, nome_dia_en/pt,
  fim_de_semana (1 when weekday() is 5 or 6, i.e. Saturday/Sunday) and
  feriado (1 when the day is in `holidays.Brazil()`). The table is created on
  the module-level connection `con` only if it does not exist yet.

  Returns:
    True after the statement has been executed.
  """
  first_day = datetime(1950, 1, 1)
  last_day = datetime(2050, 12, 31)
  total_days = (last_day - first_day).days + 1

  # Build the holiday calendar once: creating it inside the loop would
  # re-instantiate (and re-populate) it for every single day.
  br_holidays = holidays.Brazil()

  date_list = []
  for offset in range(total_days):
    current = first_day + timedelta(days=offset)
    iso_date = current.strftime('%Y-%m-%d')

    date_list.append({
        'data_id': offset + 1,
        'formatado': iso_date,
        'ano': current.year,
        # 'Q' renders the quarter number as text; the locale is given
        # explicitly so the call does not depend on the environment's default.
        'trimestre': format_date(current, format='Q', locale='en_US'),
        'nome_trimestre_en': format_date(current, format='QQQQ', locale='en_US'),
        'nome_trimestre_pt': format_date(current, format='QQQQ', locale='pt_BR'),
        'mes': current.month,
        'nome_mes_en': format_date(current, format='MMMM', locale='en_US'),
        'nome_mes_pt': format_date(current, format='MMMM', locale='pt_BR'),
        'dia': current.day,
        'nome_dia_en': format_date(current, format='EEEE', locale='en_US'),
        'nome_dia_pt': format_date(current, format='EEEE', locale='pt_BR'),
        'fim_de_semana': 1 if current.weekday() in (5, 6) else 0,
        'feriado': 1 if iso_date in br_holidays else 0,
    })

  # The SQL text below refers to this DataFrame by its variable name,
  # so the local must stay named `date_df`.
  date_df = pd.DataFrame(date_list)
  con.execute("""CREATE TABLE IF NOT EXISTS d_data
                 AS SELECT * FROM date_df""")

  print('Date dimension created and inserted in the database')
  return True


ETL Processing - Extraction

In [None]:
def get_raw_data():
  """Download the zipped data source and load every CSV file it contains.

  The module-level `gdrive_link` is downloaded with gdown to 'data.zip',
  extracted into the 'data' directory, and the archive is then deleted.
  Every 'data/*.csv' file is read with pandas (';' separator, latin-1
  encoding) and receives two metadata columns: `_load_time` (timestamp of the
  read, 'YYYY-MM-DD HH:MM:SS') and `_file_name` (base name of the file).

  Returns:
    dict mapping each file's base name without the '.csv' extension to its
    DataFrame, in alphabetical order of the file paths.

  Raises:
    ValueError: if `gdrive_link` has not been filled in.
    RuntimeError: if gdown returns no output path for the download.
  """
  if not gdrive_link:
    raise ValueError('gdrive_link is empty: set it to the Google Drive link of the data source')

  downloaded = gdown.download(gdrive_link, output='data.zip', fuzzy=True, quiet=False)
  if downloaded is None:
    # Fail here with a clear message instead of a FileNotFoundError from zipfile.
    raise RuntimeError('Could not download the data source from gdrive_link')

  os.makedirs('data', exist_ok=True)

  with zipfile.ZipFile('data.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

  os.remove('data.zip')

  # Adding files through a dict
  dfs_raw = {}

  # glob returns paths in arbitrary order; sort for a reproducible result.
  for csv_file in sorted(glob.glob('data/*.csv')):
    file_name = os.path.basename(csv_file)
    df = pd.read_csv(csv_file, sep=';', encoding='latin-1')

    # Metadata for extraction
    df['_load_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df['_file_name'] = file_name

    # splitext removes only the extension, even if '.csv' also occurs mid-name.
    dfs_raw[os.path.splitext(file_name)[0]] = df

  return dfs_raw

# Run the extraction and display the keys of the returned dict (one per CSV file).
get_raw_data().keys()





# Disabled alternative: read a file straight through the DuckDB connection.
# def extract_data(filename):
#   # Extracting data
#   df = con.execute(f"SELECT * FROM '{filename}'").fetchdf()
#   return df

# ext_df = extract_data('data/clientes.csv')
# ext_df.head(5)

Downloading...
From: https://drive.google.com/uc?id=16jF2hJHVOwikmgoEKs9wARnrE_j6Y7e2
To: /content/data.zip
100%|██████████| 1.24M/1.24M [00:00<00:00, 108MB/s]


dict_keys(['agencias', 'propostas_credito', 'clientes', 'transacoes', 'colaboradores', 'colaborador_agencia', 'contas'])