In [2]:
import pyarrow.parquet as pq
import pyarrow
import pyarrowfs_adlgen2
from dotenv import load_dotenv
import os

load_dotenv()


import json
import os
from tqdm import tqdm
from azure.storage.blob import BlobServiceClient

# Storage account credentials come from the environment (populated by load_dotenv()).
ADLS_NAME = os.getenv("ADLS_NAME")
ADLS_KEY =  os.getenv("ADLS_KEY")
# Fail fast: without this check a missing variable would silently produce the
# URL "https://None.blob.core.windows.net" and fail much later.
if not ADLS_NAME or not ADLS_KEY:
    raise RuntimeError("Defina ADLS_NAME e ADLS_KEY no ambiente ou no arquivo .env")
directory = "sources"  # or adjust as needed
# NOTE(review): `directory` is not referenced by any visible cell; the listing
# below uses the 'source/' prefix instead -- confirm whether it is still needed.

# Local JSON file used as a cache of {blob name: row count} between runs.
json_path = "parquet_row_counts.json"

# Load the existing cache, or start with an empty one.
if os.path.exists(json_path):
    with open(json_path, "r") as f:
        row_counts = json.load(f)
else:
    row_counts = {}

# Create the container client
account_url = f"https://{ADLS_NAME}.blob.core.windows.net"
credential = ADLS_KEY
container_name = "taxi"  # Change if your container name is different

# Arrow filesystem over the same storage account, used to open parquet files.
handler = pyarrowfs_adlgen2.AccountHandler.from_account_name(ADLS_NAME, ADLS_KEY)
fs = pyarrow.fs.PyFileSystem(handler)

blob_service_client = BlobServiceClient(account_url=account_url, credential=credential)
container_client = blob_service_client.get_container_client(container_name)

blobs = container_client.list_blobs(name_starts_with='source/')
try:
    for blob in tqdm(blobs):
        # Only parquet blobs that are not cached yet need to be opened.
        if blob.name.endswith('.parquet') and blob.name not in row_counts:
            # Build the path from container_name so that changing the
            # container above is enough (the path used to hard-code "taxi/").
            pf = pq.ParquetFile(f"{container_name}/{blob.name}", filesystem=fs)
            row_counts[blob.name] = pf.metadata.num_rows
finally:
    # Persist the cache even if the scan is interrupted, so the counts
    # gathered so far are not fetched again on the next run.
    with open(json_path, "w") as f:
        json.dump(row_counts, f, indent=2)

total_rows = sum(row_counts.values())
print(f"Total de linhas em todos os arquivos parquet: {total_rows}")


578it [00:01, 334.34it/s]

Total de linhas em todos os arquivos parquet: 3945621967





In [None]:
from collections import defaultdict
import json
import re

def extract_mm_yyyy(fname):
    """Return the first 'YYYY-MM' found in ``fname`` reformatted as 'MM-YYYY'.

    Example: 'source/yellow/2025/yellow_tripdata_2025-03.parquet' -> '03-2025'.
    When no four-digit/two-digit pair separated by '-' is present, ``fname``
    is returned unchanged.
    """
    found = re.search(r'(\d{4})-(\d{2})', fname)
    if found is None:
        return fname
    year, month = found.groups()
    return f"{month}-{year}"

# Arrow schema of every yellow-trip parquet file, keyed by blob name.
yellow_schemas = {}
# One entry per pair of consecutive files whose schemas differ.
yellow_schema_diffs = []

# Walk every yellow-trip file and record its schema.
for blob in container_client.list_blobs(name_starts_with='source/yellow/'):
    if blob.name.endswith('.parquet'):
        pf = pq.ParquetFile(f"taxi/{blob.name}", filesystem=fs)
        schema = pf.schema.to_arrow_schema()
        yellow_schemas[blob.name] = schema

# Sort the files by name (the name contains year/month).
sorted_files = sorted(yellow_schemas.keys())

prev_schema = None
prev_fname = None
for fname in sorted_files:
    schema = yellow_schemas[fname]
    if prev_schema is not None and schema != prev_schema:
        # Fetch the name lists once instead of on every membership test.
        prev_names = prev_schema.names
        curr_names = schema.names
        prev_name_set = set(prev_names)
        curr_name_set = set(curr_names)

        added = [name for name in curr_names if name not in prev_name_set]
        removed = [name for name in prev_names if name not in curr_name_set]

        type_changes = []
        # Iterate in the current schema's column order (de-duplicated) rather
        # than over a set, so the JSON output is identical from run to run.
        for col in dict.fromkeys(curr_names):
            if col not in prev_name_set:
                continue
            prev_type = str(prev_schema.field(col).type)
            curr_type = str(schema.field(col).type)
            if prev_type != curr_type:
                type_changes.append({
                    "column": col,
                    "from": prev_type,
                    "to": curr_type
                })
        yellow_schema_diffs.append({
            "from": extract_mm_yyyy(prev_fname),
            "to": extract_mm_yyyy(fname),
            "added_columns": added,
            "removed_columns": removed,
            "type_changes": type_changes
        })
    prev_schema = schema
    prev_fname = fname

# Save the differences to a simple JSON file. The target directory is created
# first: open(..., "w") does not create missing parent directories.
output_dir = "schema_evolution"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "yellow_schema_diffs_simple.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(yellow_schema_diffs, f, indent=2, ensure_ascii=False)


In [6]:
import json
from collections import Counter

# Load the cached {blob name: row count} mapping; the file only needs to stay
# open for the read itself.
with open("parquet_row_counts.json", "r") as f:
    data = json.load(f)

YELLOW_PREFIX = "source/yellow/"
FIRST_YEAR, LAST_YEAR = 2014, 2025
tracked_years = {str(year) for year in range(FIRST_YEAR, LAST_YEAR + 1)}

# Count the cached yellow-trip files per year. The year is the four characters
# that follow the prefix; keys outside FIRST_YEAR..LAST_YEAR are ignored.
yellow_counts_by_year = Counter()
for k in data:
    if not k.startswith(YELLOW_PREFIX):
        continue
    year = k[len(YELLOW_PREFIX):len(YELLOW_PREFIX) + 4]
    if year in tracked_years:
        yellow_counts_by_year[year] += 1

total_yellow = sum(yellow_counts_by_year.values())
print(f"Total source/yellow de {FIRST_YEAR} até {LAST_YEAR}: {total_yellow}")
print("Quantidade por ano:")
for year in sorted(yellow_counts_by_year):
    print(f"{year}: {yellow_counts_by_year[year]}")

Total source/yellow de 2014 até 2025: 131
Quantidade por ano:
2014: 8
2015: 12
2016: 12
2017: 12
2018: 12
2019: 12
2020: 12
2021: 12
2022: 12
2023: 12
2024: 12
2025: 3


In [9]:
def get_yellow_taxi_data(year_month, limit=None):
    """
    Fetch the yellow-taxi dataset for one year-month from ADLS.

    Args:
        year_month (str): Month to load, formatted 'YYYY-MM' (e.g. '2018-05').
            The first four characters select the year folder.
        limit (int, optional): Maximum number of rows to return. ``None``
            (default) returns every row; ``0`` returns an empty frame.

    Returns:
        pandas.DataFrame: Yellow-taxi trips for the requested month.

    Raises:
        ValueError: If ``limit`` is negative.
    """
    if limit is not None and limit < 0:
        raise ValueError(f"limit deve ser >= 0, recebido {limit}")

    # Build the blob path: source/yellow/<YYYY>/yellow_tripdata_<YYYY-MM>.parquet
    file_path = f"source/yellow/{year_month[:4]}/yellow_tripdata_{year_month}.parquet"

    # Open the parquet file on ADLS.
    pf = pq.ParquetFile(f"taxi/{file_path}", filesystem=fs)

    # `is None` rather than truthiness: limit=0 must not mean "no limit".
    if limit is None:
        return pf.read().to_pandas()

    # Select the shortest leading run of row groups that holds `limit` rows,
    # using only the row counts stored in the file metadata, so the whole
    # file is not loaded just to keep its first rows.
    needed_groups = []
    rows_covered = 0
    for group_idx in range(pf.metadata.num_row_groups):
        if rows_covered >= limit:
            break
        needed_groups.append(group_idx)
        rows_covered += pf.metadata.row_group(group_idx).num_rows

    if needed_groups:
        table = pf.read_row_groups(needed_groups, use_pandas_metadata=True)
    else:
        # limit == 0 or a file without row groups: use the plain read so the
        # resulting (empty) frame still carries the file's columns.
        table = pf.read(use_pandas_metadata=True)

    return table.slice(0, limit).to_pandas()

# Example usage: load the complete May 2018 file (no row limit is passed).
# NOTE: the variable is still called yellow_2013_05 because a later cell
# references that name, even though the data loaded here is 2018-05.
yellow_2013_05 = get_yellow_taxi_data('2018-05')
print(f"Shape do dataset: {yellow_2013_05.shape}")
print(f"Colunas: {list(yellow_2013_05.columns)}")
print("\nPrimeiras 5 linhas:")
yellow_2013_05.head()

Shape do dataset: (9224788, 19)
Colunas: ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee']

Primeiras 5 linhas:


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2018-05-01 00:13:56,2018-05-01 00:22:46,1,1.6,1,N,230,50,1,8.0,0.5,0.5,1.85,0.0,0.3,11.15,,
1,1,2018-05-01 00:23:26,2018-05-01 00:29:56,1,1.7,1,N,263,239,1,7.5,0.5,0.5,2.0,0.0,0.3,10.8,,
2,1,2018-05-01 00:36:23,2018-05-01 00:48:26,2,2.6,1,N,239,152,1,12.0,0.5,0.5,1.0,0.0,0.3,14.3,,
3,1,2018-05-01 00:26:12,2018-05-01 00:27:05,1,0.0,1,N,145,145,1,2.5,0.5,0.5,9.63,0.0,0.3,13.43,,
4,1,2018-05-01 00:29:51,2018-05-01 00:30:02,1,0.0,1,N,145,145,2,2.5,0.5,0.5,0.0,0.0,0.3,3.8,,


In [12]:
# Number of rows in the loaded frame (first element of its shape).
yellow_2013_05.shape[0]

9224788

In [57]:
# Recreate the blob iterator (the listing from the first cell was already
# iterated there).
blobs = container_client.list_blobs(name_starts_with='source/')

# Add up the stored size, in bytes, of the parquet blobs only, so the figure
# matches what the message below reports.
total_mem = sum(blob.size for blob in blobs if blob.name.endswith('.parquet'))

# `.2f` prints two decimals; the previous `:2f` only set a minimum width of 2.
# The divisor is 1024**3, i.e. the value is expressed in GiB.
print(f"Total de memória em todos os arquivos parquet: {total_mem/(1024)**3:.2f} GBs")


Total de memória em todos os arquivos parquet: 34.506272 GBs
