In [1]:
# ==========================================================
# 03_data_processing.ipynb
# Creates PyG objects and train/test datasets
# ==========================================================

# 0) Project setup (path fix for notebooks)
import sys
from pathlib import Path

# Choose where the search for the project root starts: this file's own folder
# when executed as a script, or the current working directory inside a
# notebook kernel (where `__file__` is not defined).
if "__file__" in globals():
    _start_dir = Path(__file__).resolve().parent
else:
    _start_dir = Path.cwd()

# The project root is the nearest directory (the starting directory included)
# that contains the `src` package imported by the later cells. Searching
# upward keeps the notebook working when the kernel is launched from the
# project root as well as from /notebooks. When no ancestor contains `src`,
# fall back to the previous behaviour: the parent of the starting directory,
# i.e. the assumption that this notebook lives in /notebooks.
ROOT = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / "src").is_dir()
    ),
    _start_dir.parent,
)

# Add project root to sys.path so `src` can be imported
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

print(f"üìÅ Project root detected as: {ROOT.resolve()}")


üìÅ Project root detected as: /Users/leonardoribeiro/Documents/DataScience/MBA_USP/TCC


In [2]:
# 1) Imports
from pathlib import Path
import pandas as pd

from src.utils import (
    ConfigLoader,
    EnvironmentSetup,
    GraphBuilder,
    DataSplitter,
)


In [3]:
# 2) Load configuration and set environment
cfg = ConfigLoader.load("base.yaml")

# The seed is read from the `general` section of the config; 42 is used when
# either the section or the `seed` key is absent.
general_cfg = cfg.get("general", {})
seed = general_cfg.get("seed", 42)

env = EnvironmentSetup(seed=seed)
print(f"‚úÖ Environment initialized with seed={seed}")


‚úÖ Active device: cpu
GPU detected: None
Torch version: 2.8.0
‚úÖ Environment initialized with seed=42


In [4]:
# 3) Load processed CSVs
# Processed-data directory: the configured path joined onto the project root.
data_proc = ROOT / cfg["paths"]["data_processed"]

# Input files expected inside the processed-data directory.
nodes_with_class_path = data_proc / "elliptic_nodes_with_class.csv"
edges_path = data_proc / "elliptic_edges.csv"
target_path = data_proc / "elliptic_target.csv"  # (optional here)

print(
    "üîç Looking for processed files in:",
    f"   - {data_proc.resolve()}",
    sep="\n",
)

# Read the three files in a fixed order: nodes, edges, target.
df_nodes_with_class = pd.read_csv(nodes_with_class_path)
df_edges = pd.read_csv(edges_path)
df_target = pd.read_csv(target_path)  # optional; only its shape is reported in this cell

# Report the (rows, columns) shape of each loaded frame.
print(
    "‚úÖ Loaded:",
    f"   - Nodes with class: {df_nodes_with_class.shape}",
    f"   - Edges:            {df_edges.shape}",
    f"   - Target:           {df_target.shape}",
    sep="\n",
)


üîç Looking for processed files in:
   - /Users/leonardoribeiro/Documents/DataScience/MBA_USP/TCC/data/processed
‚úÖ Loaded:
   - Nodes with class: (203769, 168)
   - Edges:            (234355, 2)
   - Target:           (203769, 2)


In [5]:
# 4) Build PyG graphs (GraphBuilder)
# Feature columns are the ones whose name begins with "feature_".
feature_cols = [
    col_name
    for col_name in df_nodes_with_class.columns
    if col_name.startswith("feature_")
]

# elliptic_graphs.pt is written into the processed-data directory.
output_dir = data_proc

# Optional exclusion list taken from `splits.exclude_steps` in the YAML config;
# an empty list is used when the section or the key is not present.
splits_cfg = cfg.get("splits", {})
exclude_steps = splits_cfg.get("exclude_steps", [])

builder = GraphBuilder(
    df_nodes=df_nodes_with_class,
    df_edges=df_edges,
    feature_cols=feature_cols,
    output_dir=output_dir,
    exclude_steps=exclude_steps,
)

# Build the PyG graphs and save them under the given file name, then keep a
# handle on the resulting collection for the summary line below.
builder.run(filename="elliptic_graphs.pt")
graphs = builder.graphs

print(f"‚úÖ {len(graphs)} graphs created and saved to {(output_dir / 'elliptic_graphs.pt').resolve()}")


‚úÖ Created 49 graphs (time_steps 1‚Äì49)
üíæ Saved 49 graphs to /Users/leonardoribeiro/Documents/DataScience/MBA_USP/TCC/data/processed/elliptic_graphs.pt
‚úÖ 49 graphs created and saved to /Users/leonardoribeiro/Documents/DataScience/MBA_USP/TCC/data/processed/elliptic_graphs.pt


In [6]:
# 5) Split data into train/test and save CSVs (DataSplitter)
splitter = DataSplitter(cfg, df_nodes_with_class)

# Point the splitter at the absolute processed-data path so that it does not
# depend on the working directory when this runs from /notebooks.
splitter.data_processed = data_proc.resolve()

# run() yields four frames, in this order: train, test1, test2, test.
split_frames = splitter.run(save_csv=True)
df_train, df_test1, df_test2, df_test = split_frames

print("\n‚úÖ Data splitting and CSV export complete!")


‚úÖ Loaded 49 PyG graphs from /Users/leonardoribeiro/Documents/DataScience/MBA_USP/TCC/data/processed/elliptic_graphs.pt
Train graphs: 34 (1‚Äì34)
Test1 graphs: 8 (35‚Äì42)
Test2 graphs: 7 (43‚Äì49)
Train nodes: 29894 | Test1: 9983 | Test2: 6687 | Total labeled: 46564
üíæ Saved CSV splits to /Users/leonardoribeiro/Documents/DataScience/MBA_USP/TCC/data/processed/splits

‚úÖ Data splitting and CSV export complete!


In [7]:
# 6) Quick verification / summary
splits_dir = (data_proc / "splits").resolve()

# One print call with a newline separator emits the same text as printing each
# line separately: node counts per split, their sum, then the split CSV names.
print(
    "\nüìä Summary:",
    f" - Train nodes: {len(df_train)}",
    f" - Test1 nodes: {len(df_test1)}",
    f" - Test2 nodes: {len(df_test2)}",
    f" - Total labeled nodes: {len(df_train) + len(df_test1) + len(df_test2)}",
    f"\nüìÇ Split CSVs saved under: {splits_dir}",
    "   - train_nodes.csv",
    "   - test1_nodes.csv",
    "   - test2_nodes.csv",
    "\nüöÄ Processing completed successfully!",
    sep="\n",
)



üìä Summary:
 - Train nodes: 29894
 - Test1 nodes: 9983
 - Test2 nodes: 6687
 - Total labeled nodes: 46564

üìÇ Split CSVs saved under: /Users/leonardoribeiro/Documents/DataScience/MBA_USP/TCC/data/processed/splits
   - train_nodes.csv
   - test1_nodes.csv
   - test2_nodes.csv

üöÄ Processing completed successfully!
