In [5]:
from glob import glob
import os
import json
from pathlib import Path
from collections import defaultdict

from EmbeddingsConstruction.Training.Node.TrainNodeEncoder import NodeEncoderTrainer
from EmbeddingsConstruction.Training.Conference.TrainConferenceEncoder import ConferenceEncoderTrainer

In [6]:
# Root of the conference dataset, given relative to the notebook's working directory.
base_data = Path("../conferences_data")

# Source tree that the audit cells walk looking for LEVEL_3.json / LEVEL_4.json.
base_companies = base_data / "companies"
# Tree holding the JSON files that are handed to both encoder trainers.
base_processed = base_data / "processed_companies"

# glob() returns matches in an arbitrary, filesystem-dependent order. Sorting
# pins the order of the file list so that both trainers receive the same
# sequence of paths on every machine and every run.
json_paths = sorted(glob(f"{base_processed}/**/*.json", recursive=True))

In [7]:
# Run settings shared by the node-encoder and conference-encoder trainers.
device = "cuda:0"
# Epoch budgets: the first is passed as `optuna_epochs`, the second as
# `final_epochs`, when the trainers are constructed further down.
optuna_epochs, training_epochs = 10, 100

# Checkpoint files. The node-encoder path is also passed to the conference
# trainer as `sentence_encoder_path`.
best_weights_conference_encoder_path = "./EmbeddingsConstruction/weights/conference_encoder_best.pt"
best_weights_node_encoder_path = "./EmbeddingsConstruction/weights/node_encoder_best.pt"

# Conferences data

In [8]:
# Coverage audit: count the "processable" conferences (a LEVEL_4.json with a
# non-empty "questions_and_answers" entry) and find those that have no
# transcript.json under base_processed/<part1>/<part2>/<part3>/.
faltantes = defaultdict(list)        # company -> [(year, quarter), ...] lacking transcript.json
total_conferencias_procesables = 0   # conferences whose LEVEL_4.json has Q&A content
mostrar_detalle = False              # set to True to list every missing conference per company

for root, _, files in os.walk(base_companies):
    if "LEVEL_4.json" not in files:
        continue
    json_path = os.path.join(root, "LEVEL_4.json")

    try:
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Evaluated inside the try so that a file whose top level is not a JSON
        # object is reported and skipped instead of aborting the whole scan.
        tiene_qa = bool(data.get("questions_and_answers"))
    except Exception as e:  # deliberate best effort: report the file, keep scanning
        print(f"❌ Error leyendo {json_path}: {e}")
        continue

    if not tiene_qa:
        continue
    total_conferencias_procesables += 1

    # The first three components of the path below base_companies are read as
    # company / year / quarter. Shallower directories are still counted above
    # but cannot be checked for a transcript.
    relative_path = os.path.relpath(root, base_companies)
    parts = relative_path.split(os.sep)
    if len(parts) < 3:
        continue

    empresa, año, quarter = parts[:3]
    transcript_path = os.path.join(base_processed, empresa, año, quarter, "transcript.json")
    if not os.path.exists(transcript_path):
        faltantes[empresa].append((año, quarter))

# Optional per-company listing of the missing conferences (off by default).
if mostrar_detalle:
    for empresa in sorted(faltantes):
        print(f"\n🔹 Empresa: {empresa}")
        for año, quarter in sorted(faltantes[empresa]):
            print(f"  - {año} {quarter}")

# Summary
total_faltantes = sum(len(v) for v in faltantes.values())
print(f"\n📊 Total de conferencias procesables encontradas: {total_conferencias_procesables}")
print(f"❗ Total de conferencias faltantes: {total_faltantes}")
if total_conferencias_procesables > 0:
    porcentaje = (total_faltantes / total_conferencias_procesables) * 100
    print(f"📉 Porcentaje de conferencias aún no procesadas: {porcentaje:.2f}%")
else:
    print("⚠️ No se encontraron conferencias con 'questions_and_answers'.")


📊 Total de conferencias procesables encontradas: 3689
❗ Total de conferencias faltantes: 282
📉 Porcentaje de conferencias aún no procesadas: 7.64%


In [9]:
# For every processable conference (a LEVEL_4.json with a non-empty
# "questions_and_answers" entry), read the sibling LEVEL_3.json and add up the
# length of its "speakers" list.
total_intervenciones = 0             # sum of len(speakers) over all LEVEL_3.json files read
conferencias_con_intervenciones = 0  # conferences whose LEVEL_3.json was read successfully

for root, _, files in os.walk(base_companies):
    if "LEVEL_4.json" not in files:
        continue
    path_l4 = os.path.join(root, "LEVEL_4.json")

    try:
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(path_l4, "r", encoding="utf-8") as f:
            data_l4 = json.load(f)
        # Evaluated inside the try so that a file whose top level is not a JSON
        # object is reported and skipped instead of aborting the whole scan.
        es_procesable = bool(data_l4.get("questions_and_answers"))
    except Exception as e:  # deliberate best effort: report the file, keep scanning
        print(f"❌ Error leyendo {path_l4}: {e}")
        continue

    if not es_procesable:
        continue

    path_l3 = os.path.join(root, "LEVEL_3.json")
    if not os.path.exists(path_l3):
        print(f"🟡 Falta LEVEL_3.json en: {root}")
        continue

    try:
        with open(path_l3, "r", encoding="utf-8") as f:
            data_l3 = json.load(f)
        speakers = data_l3.get("speakers", [])
        num_intervenciones = len(speakers)
    except Exception as e:  # covers unreadable files and unexpected JSON structure alike
        print(f"⚠️ Error leyendo/parsing {path_l3}: {e}")
        continue

    total_intervenciones += num_intervenciones
    conferencias_con_intervenciones += 1

print("\n📊 Resumen:")
print(f"✅ Conferencias con LEVEL_3.json: {conferencias_con_intervenciones}")
print(f"🧠 Intervenciones totales (len(speakers)): {total_intervenciones}")


📊 Resumen:
✅ Conferencias con LEVEL_3.json: 3689
🧠 Intervenciones totales (len(speakers)): 260512


# Node Encoder Training

In [10]:
# Build the node-encoder trainer.
# NOTE(review): the name `trainer` is rebound to the conference-encoder trainer
# further down, so the optimize()/train() cells act on whichever trainer was
# constructed last. Run the cells strictly in order.
node_encoder_options = {
    "input_dim": 21,                              # 21 when text+audio+video are concatenated (the default)
    "save_path": best_weights_node_encoder_path,  # where the trained weights are saved
    "device": device,                             # or "cpu"
    "optuna_epochs": optuna_epochs,               # epochs used during each Optuna trial
    "final_epochs": training_epochs,              # epochs used for the final training run
}
# First positional argument: list of JSON paths holding the conference trees.
trainer = NodeEncoderTrainer(json_paths, **node_encoder_options)

In [11]:
# Hyper-parameter search for the node encoder. The recorded output is an Optuna
# study log over hidden_dim, n_heads, dropout and lr; the dict displayed as the
# cell result is presumably the best-parameter set returned by optimize() —
# confirm in NodeEncoderTrainer.
trainer.optimize()

[I 2025-06-04 09:32:03,448] A new study created in memory with name: no-name-9a6babee-e024-4a96-a29d-328087ce491b
[I 2025-06-04 09:37:27,851] Trial 0 finished with value: 2.442218182302554 and parameters: {'hidden_dim': 64, 'n_heads': 4, 'dropout': 0.19047260530306614, 'lr': 1.3892039266445554e-05}. Best is trial 0 with value: 2.442218182302554.
[I 2025-06-04 09:43:05,688] Trial 1 finished with value: 2.3455738063695435 and parameters: {'hidden_dim': 64, 'n_heads': 8, 'dropout': 0.20829058600404826, 'lr': 0.00016816772287060137}. Best is trial 1 with value: 2.3455738063695435.
[I 2025-06-04 09:48:43,992] Trial 2 finished with value: 2.4679312715042387 and parameters: {'hidden_dim': 128, 'n_heads': 8, 'dropout': 0.28728215819278563, 'lr': 5.076236395489608e-05}. Best is trial 1 with value: 2.3455738063695435.
[I 2025-06-04 09:54:18,880] Trial 3 finished with value: 2.338072753142911 and parameters: {'hidden_dim': 64, 'n_heads': 4, 'dropout': 0.1186852455355762, 'lr': 2.4088979920288448e

🏆 Best params: {'hidden_dim': 128, 'n_heads': 8, 'dropout': 0.11584743008890555, 'lr': 0.000404801732153434}


{'hidden_dim': 128,
 'n_heads': 8,
 'dropout': 0.11584743008890555,
 'lr': 0.000404801732153434}

In [12]:
# Final training run of the node encoder (the recorded log counts "Epoch k/100").
# The module repr displayed as the cell result suggests train() returns the
# trained model.
# NOTE(review): presumably this writes the weights to
# best_weights_node_encoder_path, which the conference-encoder trainer later
# receives as `sentence_encoder_path` — confirm in NodeEncoderTrainer.
trainer.train()

Epoch 1/100 - Loss: 2.3412
Epoch 2/100 - Loss: 2.2794
Epoch 3/100 - Loss: 2.2707
Epoch 4/100 - Loss: 2.2648
Epoch 5/100 - Loss: 2.2554
Epoch 6/100 - Loss: 2.2506
Epoch 7/100 - Loss: 2.2494
Epoch 8/100 - Loss: 2.2489
Epoch 9/100 - Loss: 2.2478
Epoch 10/100 - Loss: 2.2466
Epoch 11/100 - Loss: 2.2449
Epoch 12/100 - Loss: 2.2436
Epoch 13/100 - Loss: 2.2441
Epoch 14/100 - Loss: 2.2421
Epoch 15/100 - Loss: 2.2401
Epoch 16/100 - Loss: 2.2404
Epoch 17/100 - Loss: 2.2399
Epoch 18/100 - Loss: 2.2397
Epoch 19/100 - Loss: 2.2406
Epoch 20/100 - Loss: 2.2396
Epoch 21/100 - Loss: 2.2396
Epoch 22/100 - Loss: 2.2408
Epoch 23/100 - Loss: 2.2393
Epoch 24/100 - Loss: 2.2385
Epoch 25/100 - Loss: 2.2392
Epoch 26/100 - Loss: 2.2380
Epoch 27/100 - Loss: 2.2384
Epoch 28/100 - Loss: 2.2380
Epoch 29/100 - Loss: 2.2377
Epoch 30/100 - Loss: 2.2387
Epoch 31/100 - Loss: 2.2384
Epoch 32/100 - Loss: 2.2368
Epoch 33/100 - Loss: 2.2362
Epoch 34/100 - Loss: 2.2380
Epoch 35/100 - Loss: 2.2366
Epoch 36/100 - Loss: 2.2362
E

SentenceAttentionEncoder(
  (input_proj): Linear(in_features=21, out_features=128, bias=True)
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
  )
  (dropout): Dropout(p=0.11584743008890555, inplace=False)
  (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
)

# Conference Encoder Training

In [13]:
# Build the conference-encoder trainer. It is given the node-encoder checkpoint
# path as `sentence_encoder_path`, so that file must exist before this trainer
# is optimised or trained.
# NOTE(review): this rebinds `trainer`, which earlier cells used for the
# node-encoder trainer. Re-running those cells afterwards acts on this object.
conference_encoder_options = {
    "json_paths": json_paths,
    "sentence_encoder_path": best_weights_node_encoder_path,
    "save_path": best_weights_conference_encoder_path,
    "device": device,
    "optuna_epochs": optuna_epochs,
    "final_epochs": training_epochs,
}
trainer = ConferenceEncoderTrainer(**conference_encoder_options)

In [14]:
# Hyper-parameter search for the conference encoder. The recorded output is an
# Optuna study log over hidden_dim_conf, n_heads and lr, with trials numbered
# 0-29 (many of them pruned); the node-encoder weights are reported as reloaded
# before each trial. Completed trials took roughly 27 minutes each in the
# recorded run, so this cell is expensive to re-execute.
trainer.optimize()

[I 2025-06-04 12:06:36,410] A new study created in memory with name: no-name-24f0718a-d986-442f-8b46-4272e8127003


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 12:34:13,108] Trial 0 finished with value: 2.079107894164296 and parameters: {'hidden_dim_conf': 256, 'n_heads': 2, 'lr': 0.00036248638953566265}. Best is trial 0 with value: 2.079107894164296.


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 13:01:48,557] Trial 1 finished with value: 1.1010980451470809 and parameters: {'hidden_dim_conf': 128, 'n_heads': 4, 'lr': 9.320512644032831e-05}. Best is trial 1 with value: 1.1010980451470809.


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 13:29:30,985] Trial 2 finished with value: 1.1166038993098926 and parameters: {'hidden_dim_conf': 1024, 'n_heads': 8, 'lr': 2.0657084453575286e-05}. Best is trial 1 with value: 1.1010980451470809.


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 13:57:04,667] Trial 3 finished with value: 1.128209444068967 and parameters: {'hidden_dim_conf': 256, 'n_heads': 2, 'lr': 6.272392073904032e-05}. Best is trial 1 with value: 1.1010980451470809.


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 14:24:39,274] Trial 4 finished with value: 1.1356513275646827 and parameters: {'hidden_dim_conf': 256, 'n_heads': 8, 'lr': 1.4418734948489936e-05}. Best is trial 1 with value: 1.1010980451470809.


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 14:30:12,519] Trial 5 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 14:35:44,976] Trial 6 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 14:38:30,894] Trial 7 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 14:41:16,005] Trial 8 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 14:44:02,284] Trial 9 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 15:11:38,990] Trial 10 finished with value: 1.1071252286993842 and parameters: {'hidden_dim_conf': 512, 'n_heads': 4, 'lr': 4.2999781090211844e-05}. Best is trial 1 with value: 1.1010980451470809.


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 15:17:09,680] Trial 11 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 15:22:41,193] Trial 12 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 15:25:27,699] Trial 13 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 15:52:58,193] Trial 14 finished with value: 1.1092923156952075 and parameters: {'hidden_dim_conf': 128, 'n_heads': 4, 'lr': 4.397306443080333e-05}. Best is trial 1 with value: 1.1010980451470809.


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 15:55:42,190] Trial 15 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 15:58:26,359] Trial 16 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 16:06:38,239] Trial 17 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 16:33:59,364] Trial 18 finished with value: 1.112582817253932 and parameters: {'hidden_dim_conf': 128, 'n_heads': 4, 'lr': 0.0001775760546119867}. Best is trial 1 with value: 1.1010980451470809.


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 16:36:43,595] Trial 19 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 16:39:27,298] Trial 20 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 16:42:13,047] Trial 21 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 16:44:58,318] Trial 22 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 16:47:44,478] Trial 23 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 17:15:25,229] Trial 24 finished with value: 1.1071515415615878 and parameters: {'hidden_dim_conf': 128, 'n_heads': 4, 'lr': 0.00013594146382617687}. Best is trial 1 with value: 1.1010980451470809.


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 17:18:11,132] Trial 25 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 17:45:46,442] Trial 26 finished with value: 1.0975403647887314 and parameters: {'hidden_dim_conf': 256, 'n_heads': 4, 'lr': 8.055227369493488e-05}. Best is trial 26 with value: 1.0975403647887314.


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 17:48:31,093] Trial 27 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 17:51:15,804] Trial 28 pruned. 


✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt


[I 2025-06-04 17:54:00,319] Trial 29 pruned. 


🏆 Mejores hiperparámetros: {'hidden_dim_conf': 256, 'n_heads': 4, 'lr': 8.055227369493488e-05}


{'hidden_dim_conf': 256, 'n_heads': 4, 'lr': 8.055227369493488e-05}

In [15]:
# Final training run of the conference encoder (the recorded log counts
# "Epoch k/100"); the module repr displayed as the cell result suggests train()
# returns the trained model.
# NOTE(review): the displayed ConferenceEncoder uses 512-wide layers although
# the best trial reported hidden_dim_conf=256 — confirm in
# ConferenceEncoderTrainer how hidden_dim_conf maps to the model width and that
# the best hyper-parameters are really applied here.
trainer.train()

✅ Pesos cargados desde: ./EmbeddingsConstruction/weights/node_encoder_best.pt
Epoch 1/100 - Loss: 1.2894
Epoch 2/100 - Loss: 1.1715
Epoch 3/100 - Loss: 1.1463
Epoch 4/100 - Loss: 1.1324
Epoch 5/100 - Loss: 1.1238
Epoch 6/100 - Loss: 1.1164
Epoch 7/100 - Loss: 1.1128
Epoch 8/100 - Loss: 1.1108
Epoch 9/100 - Loss: 1.1051
Epoch 10/100 - Loss: 1.0998
Epoch 11/100 - Loss: 1.0955
Epoch 12/100 - Loss: 1.1024
Epoch 13/100 - Loss: 1.0923
Epoch 14/100 - Loss: 1.0943
Epoch 15/100 - Loss: 1.0900
Epoch 16/100 - Loss: 1.0866
Epoch 17/100 - Loss: 1.0867
Epoch 18/100 - Loss: 1.0929
Epoch 19/100 - Loss: 1.0851
Epoch 20/100 - Loss: 1.0920
Epoch 21/100 - Loss: 1.0846
Epoch 22/100 - Loss: 1.0869
Epoch 23/100 - Loss: 1.0866
Epoch 24/100 - Loss: 1.0842
Epoch 25/100 - Loss: 1.0832
Epoch 26/100 - Loss: 1.0824
Epoch 27/100 - Loss: 1.0801
Epoch 28/100 - Loss: 1.0812
Epoch 29/100 - Loss: 1.0811
Epoch 30/100 - Loss: 1.0801
Epoch 31/100 - Loss: 1.0830
Epoch 32/100 - Loss: 1.0836
Epoch 33/100 - Loss: 1.0801
Epoch 3

ConferenceEncoder(
  (pos_embedding): Embedding(101, 512)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (linear1): Linear(in_features=512, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=512, out_features=512, bias=True)
    (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (proj): Linear(in_features=512, out_features=512, bias=True)
)