In [12]:
!pip install duckdb duckdb-engine s3fs




[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import duckdb
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
from faker import Faker
import random
import os
import pandas as pd

In [14]:
# Open an in-memory DuckDB database (adjust if a local database file is used).
con = duckdb.connect(database=':memory:')

# Location of the Parquet file on MinIO (used by the commented-out S3 query below):
# s3_path = "s3://bronze/transacoes_duck_bilhao.parquet"

# S3/MinIO settings for DuckDB. The endpoint and credentials are read from the
# environment so real secrets do not have to be stored in the notebook; the
# fallbacks are the local development values this notebook used before.
s3_settings = {
    "s3_endpoint": os.environ.get("MINIO_ENDPOINT", "localhost:9000"),
    "s3_region": os.environ.get("MINIO_REGION", "us-east-1"),
    "s3_access_key_id": os.environ.get("MINIO_ROOT_USER", "admin"),
    "s3_secret_access_key": os.environ.get("MINIO_ROOT_PASSWORD", "password123"),
    "s3_url_style": "path",
}
for setting_name, setting_value in s3_settings.items():
    # Double any embedded single quote so the value stays one SQL string literal.
    quoted_value = setting_value.replace("'", "''")
    con.execute(f"SET {setting_name}='{quoted_value}';")
# The SSL flag is a boolean setting, so it is written without quotes.
con.execute("SET s3_use_ssl=false;")

# Load the Parquet file into a pandas DataFrame.
# To read from MinIO instead of the working directory:
# df = con.execute(f"SELECT * FROM '{s3_path}'").fetchdf()
df = con.execute("SELECT * FROM 'transacoes_duck_bilhao.parquet'").fetchdf()

# Basic checks: row count, column dtypes and descriptive statistics.
print("✔️ Contagem de linhas:", len(df))
print("\n✔️ Tipos das colunas:\n", df.dtypes)
print("\n✔️ Estatísticas descritivas:\n", df.describe())


✔️ Contagem de linhas: 1000000

✔️ Tipos das colunas:
 transaction_id                 int64
customer_id                    int64
transaction_date      datetime64[us]
transaction_amount           float64
merchant_category             object
payment_method                object
dtype: object

✔️ Estatísticas descritivas:
        transaction_id   customer_id            transaction_date  \
count  1000000.000000  1.000000e+06                     1000000   
mean    500000.500000  4.999361e+08  2024-07-21 16:28:11.481601   
min          1.000000  2.302000e+03         2023-07-23 00:00:00   
25%     250000.750000  2.499371e+08         2024-01-20 00:00:00   
50%     500000.500000  4.997303e+08         2024-07-21 00:00:00   
75%     750000.250000  7.498587e+08         2025-01-21 00:00:00   
max    1000000.000000  9.999963e+08         2025-07-22 00:00:00   
std     288675.278932  2.885898e+08                         NaN   

       transaction_amount  
count      1000000.000000  
mean          2504

In [19]:
# Simulate the MinIO source locally: query the Parquet file in the working directory.
df = (
    con
    .execute("SELECT * FROM 'transacoes_duck_bilhao.parquet'")
    .fetchdf()
)

# Preview: the first five rows, followed by a divider line.
print(f"5 Primeiras linhas:\n\n {df.head()}\n{'-' * 60}")

# Number of rows in the DataFrame.
print(f"Total de linhas: {len(df)}\n")

# Data type of every column.
print(f"Tipos de dados: \n{df.dtypes}")

5 Primeiras linhas:

    transaction_id  customer_id transaction_date  transaction_amount  \
0               1    869709611       2025-07-17             3651.95   
1               2    187384068       2025-05-03             3404.76   
2               3    833050434       2023-12-05              452.18   
3               4    658914605       2024-12-16             3809.76   
4               5    597440453       2024-02-18              761.47   

       merchant_category     payment_method  
0          Casa e Jardim                PIX  
1  Serviços de Streaming           Dinheiro  
2                Viagens   Cartão de Débito  
3            Restaurante   Cartão de Débito  
4   Posto de Combustível  Cartão de Crédito  
------------------------------------------------------------
Total de linhas: 1000000

Tipos de dados: 
transaction_id                 int64
customer_id                    int64
transaction_date      datetime64[us]
transaction_amount           float64
merchant_category      

In [16]:
def calculate_stats(name: str, data: "pd.DataFrame | None" = None) -> None:
    """Print summary statistics for one column of a DataFrame.

    Args:
        name: Name of the column to summarize.
        data: DataFrame that holds the column. When omitted, the notebook-level
            ``df`` is used, so existing ``calculate_stats(name)`` calls keep
            working unchanged.

    Raises:
        KeyError: If ``name`` is not a column of the DataFrame.
        NameError: If ``data`` is omitted and no notebook-level ``df`` exists.
    """
    # Prefer the explicitly passed frame; fall back to the notebook global only
    # when the caller did not supply one.
    frame = df if data is None else data
    column = frame[name]

    stats = {
        "avg": column.mean(),
        "min": column.min(),
        "max": column.max(),
        # ddof=0 yields the population standard deviation (pandas defaults to ddof=1).
        "stddev_pop": column.std(ddof=0),
        # Total number of rows, missing values included (the statistics above skip NaN).
        "count": column.shape[0],
    }

    print(f"Estatísticas para {name}:\n{stats}\n{'-' * 60}")

In [18]:
# Operações básicas
column_names = df.columns.tolist()

print("Colunas disponíveis:", column_names)

calculate_stats('transaction_amount')

Colunas disponíveis: ['transaction_id', 'customer_id', 'transaction_date', 'transaction_amount', 'merchant_category', 'payment_method']
Estatísticas para transaction_amount:
{'avg': np.float64(2504.9075683200003), 'min': np.float64(5.01), 'max': np.float64(5000.0), 'stddev_pop': np.float64(1441.3662940983331), 'count': 1000000}
------------------------------------------------------------
