# Performance das Consultas

## Importando Bibliotecas

In [1]:
import duckdb
import time

## Conectando à base de dados

In [2]:
cursor = duckdb.connect()
DATABASE = '*.parquet'

In [14]:
def execute_query_and_calculate_time(cursor, query, return_df=False):
    
    if return_df:
        tic = time.time()
        cursor.execute(query)
        df = cursor.df()
        toc = time.time()
        return df, toc - tic
    else:
        tic = time.time()
        cursor.execute(query)
        toc = time.time()
        return toc - tic

## Consultas

### Quantidade de Registros

In [3]:
query = f"SELECT COUNT(*) FROM '{DATABASE}'"

In [4]:
tic = time.time()

cursor.execute(query)
n_rows = cursor.fetchone()

toc = time.time()

print(f"Number of rows: {n_rows[0]:,}")
print(f"Time: {toc - tic:.2f}s")

Number of rows: 4,283,329,488
Time: 1.47s


### Primeiros Registros

In [6]:
query = f"""
    SELECT 
        *
    FROM '{DATABASE}' LIMIT 5
"""

tic = time.time()
cursor.execute(query)
df_result = cursor.df()
toc = time.time()

print(f"Time: {toc - tic:.2f}s")
df_result

Time: 0.82s


Unnamed: 0,event_timestamp,event_type,some_id,event_system,event_description,event_id,filename
0,2022-10-26 10:39:36,INFO,67305985,LOGD,Início das operações do logd,E2C58C3021D6DB87,/data/logs/2_AC/o00407-0100700090001_new.csv
1,2022-10-26 10:39:36,INFO,67305985,LOGD,Urna ligada em 26/10/2022 às 10:38:20,DFBD462E26E8F1EA,/data/logs/2_AC/o00407-0100700090001_new.csv
2,2022-10-26 10:39:36,INFO,67305985,SCUE,Iniciando aplicação - Oficial - 1º turno,B8E2CBFADB3EF46B,/data/logs/2_AC/o00407-0100700090001_new.csv
3,2022-10-26 10:39:36,INFO,67305985,SCUE,Versão da aplicação: 8.26.0.0 - Onça-pintada,AC76A5B17419CB2E,/data/logs/2_AC/o00407-0100700090001_new.csv
4,2022-10-26 10:39:38,INFO,67305985,SCUE,Urna operando com rede elétrica,ED0703CBF6110D2C,/data/logs/2_AC/o00407-0100700090001_new.csv


### Primeiros registros + filtro RN

In [7]:
query = f"""
    SELECT 
        *
    FROM '{DATABASE}'
    WHERE filename ILIKE '%RN%'
    LIMIT 500
"""

tic = time.time()
cursor.execute(query)
df_result = cursor.df()
toc = time.time()

print(f"Time: {toc - tic:.2f}s")

Time: 69.65s


In [11]:
query = f"""
    SELECT 
        *
    FROM '{DATABASE}'
    WHERE filename ILIKE '%SP%'
    LIMIT 500
"""

tic = time.time()
cursor.execute(query)
df_result = cursor.df()
toc = time.time()

print(f"Time: {toc - tic:.2f}s")

Time: 91.12s


### Distinct

event_type

In [12]:
query = f"""
    SELECT DISTINCT
        event_type
    FROM '{DATABASE}'
"""

tic = time.time()
cursor.execute(query)
df_result = cursor.df()
toc = time.time()

print(f"Time: {toc - tic:.2f}s")

Time: 5.69s


event_description

In [14]:
query = f"""
    SELECT DISTINCT
        event_description
    FROM '{DATABASE}'
"""

tic = time.time()
cursor.execute(query)
df_result = cursor.df()
toc = time.time()

print(f"Time: {toc - tic:.2f}s")

Time: 29.33s


### Group By

In [3]:
query = f"""
    SELECT 
        event_system,
        COUNT(*) AS qtd_linhas
    FROM '{DATABASE}'
    GROUP BY event_system
"""

tic = time.time()
cursor.execute(query) 
df_result = cursor.df()
toc = time.time()

print(f"Time: {toc - tic:.2f}s")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Time: 6.77s


In [4]:
df_result

Unnamed: 0,event_system,qtd_linhas
0,INITJE,3044304
1,VERIFICADOR,37931
2,STE,394
3,LOGD,17978454
4,ADH,5188
5,SA,784
6,SCUE,39756883
7,VPP,223388
8,VOA,1
9,VOTA,3879701660


### Group By + Filtro

In [9]:
query = f"""
    SELECT 
        event_type,
        COUNT(*) AS qtd_linhas
    FROM '{DATABASE}'
    WHERE event_system='VOTA' OR event_system='RED'
    GROUP BY event_type
"""

tic = time.time()
cursor.execute(query) 
df_result = cursor.df()
toc = time.time()

print(f"Time: {toc - tic:.2f}s")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Time: 7.98s


In [10]:
df_result

Unnamed: 0,event_type,qtd_linhas
0,ALERTA,50460553
1,ERRO,1024682
2,INFO,3828293116


### Verificar se event_id é unico

[WIP] Descrição básica da razão de cada consulta, qual sua função e como ela é utilizada no dia a dia

1 - Usando GroupBy

In [4]:
query = f"""
    SELECT
        COUNT(*) 
    FROM (
        SELECT 
            event_id,
            COUNT(*)
        FROM '{DATABASE}'
        GROUP BY event_id
        HAVING COUNT(*) > 1
    )
"""

tic = time.time()
cursor.execute(query) 
df_result = cursor.df()
toc = time.time()

print(f"Time: {toc - tic:.2f}s")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

: 

2 - Usando Windows Function

In [3]:
query = f"""
    SELECT
        event_id, qtd_linhas
    FROM (
        SELECT 
            event_id,
            COUNT(*) OVER( PARTITION BY event_id ) AS qtd_linhas
        FROM '{DATABASE}'
    ) _
    WHERE qtd_linhas > 1
    LIMIT 1
"""

tic = time.time()
cursor.execute(query) 
df_result = cursor.df()
toc = time.time()

print(f"Time: {toc - tic:.2f}s")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

3 - Usando Count Distinct

In [5]:
query = f"""
    SELECT COUNT(*)
    FROM (
        SELECT DISTINCT event_id 
        FROM '{DATABASE}'
    ) _
"""

tic = time.time()
cursor.execute(query) 
df_result = cursor.df()
toc = time.time()

print(f"Time: {toc - tic:.2f}s")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

4 - Usando Distinct + write to disk

In [4]:
query = f"""
    COPY (
        SELECT DISTINCT event_id 
        FROM '{DATABASE}'
    ) TO 'event_id.parquet' 
    (FORMAT 'parquet')
"""

tic = time.time()
cursor.execute(query) 
df_result = cursor.df()
toc = time.time()

print(f"Time: {toc - tic:.2f}s")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

: 

### Distinct mensagens

Primeira aproximação

In [15]:
query = f"""
    SELECT DISTINCT
        regexp_replace(event_description, '[0-9]', 'X', 'g') AS event_description
    FROM '{DATABASE}'
"""

duration, df_result = execute_query_and_calculate_time(cursor, query, return_df=True)
print(f"Time: {toc - tic:.2f}s")
df_result

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Time: 159.81s


159.71511435508728

In [22]:
# Identificador da mídia de carga
# Serial da MI copiada da MV da urna original
# Serial de votação da MV
# Verificação de assinatura de dado por etapa

query = f"""
    SELECT DISTINCT
        CASE
            WHEN event_description ILIKE 'Identificador da mídia de carga%' 
            THEN 'Identificador da mídia de carga'

            WHEN event_description ILIKE 'Serial da MI copiada da MV da urna original%' 
            THEN 'Serial da MI copiada da MV da urna original'

            WHEN event_description ILIKE 'Serial de votação da MV%' 
            THEN 'Serial de votação da MV'

            WHEN event_description ILIKE 'Verificação de assinatura de dado por etapa%' 
            THEN 'Verificação de assinatura de dado por etapa'

            WHEN event_description ILIKE 'Número de série da MR%'
            THEN 'Número de série da MR'
            
            ELSE regexp_replace(event_description, '[0-9]', 'X', 'g') 
        END AS event_description
    FROM '{DATABASE}'
"""

df_result, duration = execute_query_and_calculate_time(cursor, query, return_df=True)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [23]:
print(f"Time: {duration:.2f}s")
print(f"Number of rows: {df_result.shape[0]:,}")
df_result.to_csv('event_description.csv', index=False)

Time: 478.24s
Number of rows: 1,391
