<a href="https://colab.research.google.com/github/jcmachicao/cisti2025_earlysymbolicknowledge/blob/main/general_simbolico.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Early Injection of Symbolic Concepts (Experiment 5)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import Word2Vec
import json
import numpy as np
from scipy.spatial.distance import cosine

In [None]:
# Load the phrase corpora from their JSON files
path = "D://2025 hot writing/pp_symbolic_knowledge_LLM/"

# General corpus: the phrases live under the "phrases" key.
with open(path + "general_phrases.json", "r", encoding="utf-8") as f:
    general_data = json.load(f)
# Whitespace-tokenise each phrase into a list of words.
general_sentences = [phrase.split() for phrase in general_data["phrases"]]

# Spatial corpus: the phrases live under the "spatial_data" key.
with open(path + "spatial_phrases.json", "r", encoding="utf-8") as f:
    spatial_data = json.load(f)
spatial_sentences = [phrase.split() for phrase in spatial_data["spatial_data"]]

In [None]:
# Train Word2Vec embeddings: one model on the general corpus only (baseline)
# and one on the general + spatial corpora (symbolic).
word2vec_general = Word2Vec(sentences=general_sentences, vector_size=50, window=5, min_count=1, workers=4)
combined_sentences = general_sentences + spatial_sentences
word2vec_symbolic = Word2Vec(sentences=combined_sentences, vector_size=50, window=5, min_count=1, workers=4)

# Collect the embeddings of the spatial keywords that are in the vocabulary
spatial_words = ["movimiento", "rápido", "borde", "encima", "debajo", "cerca", "lejos"]
spatial_vectors = [word2vec_symbolic.wv[word] for word in spatial_words if word in word2vec_symbolic.wv]
if not spatial_vectors:
    # Without any keyword vector the centroid computed downstream would be NaN.
    raise ValueError("None of the spatial keywords are present in the symbolic Word2Vec vocabulary")
# Stack into a single ndarray first: calling torch.tensor() directly on a list
# of ndarrays is extremely slow and emits a UserWarning.
spatial_embeddings = torch.tensor(np.array(spatial_vectors))

  spatial_embeddings = torch.tensor([word2vec_symbolic.wv[word] for word in spatial_words if word in word2vec_symbolic.wv])


In [None]:
# Symbolic (spatial) embedding layer
class SpatialEmbedding(nn.Module):
    """Shift the input by a learnable vector initialised to an embedding centroid.

    The learnable vector starts as the mean of ``spatial_embeddings`` along
    dimension 0 and is registered as a trainable parameter.
    """

    def __init__(self, embed_size, spatial_embeddings):
        # ``embed_size`` is part of the constructor signature but is not used
        # here; the parameter's size comes from ``spatial_embeddings``.
        super().__init__()
        centroid = spatial_embeddings.mean(dim=0)
        self.embedding = nn.Parameter(centroid)

    def forward(self, x):
        """Return ``x`` with the learnable vector added (via broadcasting)."""
        shifted = x + self.embedding
        return shifted

# Minimal classifier built on top of the symbolic embedding
class SymbolicModel(nn.Module):
    """Binary classifier: spatial shift, mean over dim 1, linear layer, sigmoid.

    The spatial shift is initialised from the module-level
    ``spatial_embeddings`` tensor, which must exist before instantiation.
    """

    def __init__(self, embed_size=50):
        super().__init__()
        self.spatial_embedding = SpatialEmbedding(embed_size, spatial_embeddings)
        self.fc = nn.Linear(embed_size, 1)

    def forward(self, x):
        """Return the sigmoid score for each item in ``x``.

        Assumes ``x`` is (batch, tokens, embed_size), as produced by
        ``encode_sentences`` - TODO confirm for other callers.
        """
        shifted = self.spatial_embedding(x)
        pooled = shifted.mean(dim=1)  # average the embeddings along dim 1
        score = self.fc(pooled)
        return torch.sigmoid(score)

In [None]:
# Encode tokenised sentences as zero-padded embedding sequences
def encode_sentences(sentences, model, max_len):
    """Encode tokenised sentences into a fixed-size float32 tensor.

    Each sentence is mapped to the vectors of its in-vocabulary words (words
    missing from ``model.wv`` are dropped), truncated to ``max_len`` vectors
    and right-padded with zero vectors.

    Args:
        sentences: Iterable of token lists.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            word) and ``vector_size``, e.g. a gensim ``Word2Vec`` model.
        max_len: Number of token positions per encoded sentence.

    Returns:
        A ``torch.float32`` tensor of shape
        ``(len(sentences), max_len, model.vector_size)``; this shape is kept
        even when ``sentences`` is empty.
    """
    sentences = list(sentences)
    # Preallocate one float32 buffer: the padding is already in place, and
    # this avoids the slow torch.tensor() path over nested lists of ndarrays.
    batch = np.zeros((len(sentences), max_len, model.vector_size), dtype=np.float32)
    for row, sentence in enumerate(sentences):
        vecs = [model.wv[word] for word in sentence if word in model.wv][:max_len]
        if vecs:
            batch[row, :len(vecs)] = np.stack(vecs)
    return torch.from_numpy(batch)

def find_closest_sentences(test_embedding, sentences, model, top_n=3):
    """Return the ``top_n`` sentences most similar to ``test_embedding``.

    A sentence is represented by the mean of its in-vocabulary word vectors
    and compared to ``test_embedding`` by cosine similarity.

    Args:
        test_embedding: 1-D vector to compare against.
        sentences: Sequence of token lists.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            word), e.g. a gensim ``Word2Vec`` model.
        top_n: Maximum number of sentences to return.

    Returns:
        A list of at most ``top_n`` sentences (tokens joined by spaces),
        ordered from most to least similar. Sentences with no in-vocabulary
        word, or whose similarity is undefined (NaN), are skipped.
    """
    similarities = []
    for i, sentence in enumerate(sentences):
        vecs = [model.wv[word] for word in sentence if word in model.wv]
        if not vecs:
            # np.mean([]) would yield NaN (never None), so skip explicitly.
            continue
        similarity = 1 - cosine(test_embedding, np.mean(vecs, axis=0))
        if np.isnan(similarity):
            # NaN breaks the ordering used by sort(); leave such entries out.
            continue
        similarities.append((i, similarity))
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    return [" ".join(sentences[i]) for i, _ in similarities[:top_n]]

# The longest sentence (in tokens) across both corpora sets the padded length
max_length = max(map(len, general_sentences + spatial_sentences))

# Encode each corpus with its own Word2Vec model
general_data = encode_sentences(general_sentences, word2vec_general, max_length)
spatial_data = encode_sentences(spatial_sentences, word2vec_symbolic, max_length)

# One label per encoded sentence: 0 for general phrases, 1 for spatial phrases
labels_general = torch.zeros(general_data.size(0), dtype=torch.float32)
labels_spatial = torch.ones(spatial_data.size(0), dtype=torch.float32)

In [None]:
# Train the baseline and the symbolic models
EPOCHS = 30


def _train(model, optimizer, loss_fn, data, labels, epochs, tag):
    """Run ``epochs`` full-batch training steps, printing the loss after each."""
    for epoch in range(epochs):
        optimizer.zero_grad()
        # squeeze(-1) removes only the trailing output dimension, so a batch
        # holding a single sample keeps its batch dimension and still matches
        # the shape of ``labels`` (a bare squeeze() would collapse it).
        outputs = model(data).squeeze(-1)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        print(f"[{tag}] Epoch {epoch+1}, Loss: {loss.item()}")


loss_fn = nn.BCELoss()

# Both models are trained on the concatenation of the general and spatial sets.
data_combined = torch.cat([general_data, spatial_data], dim=0)
labels_combined = torch.cat([labels_general, labels_spatial], dim=0)

# NOTE(review): both models are instances of the same SymbolicModel class and
# are fitted on the same tensors, so they differ only by random initialisation
# - confirm this is the intended comparison.

# Baseline model
model_base = SymbolicModel()
optimizer = optim.Adam(model_base.parameters(), lr=0.01)
_train(model_base, optimizer, loss_fn, data_combined, labels_combined, EPOCHS, "Modelo Base")

# Symbolic model
model_symbolic = SymbolicModel()
optimizer = optim.Adam(model_symbolic.parameters(), lr=0.01)
_train(model_symbolic, optimizer, loss_fn, data_combined, labels_combined, EPOCHS, "Modelo Simbólico")

# Evaluate both models on a new sentence
test_sentence_orig = "El plato podría caerse de la mesa por un descuido"
# Tokenise with split() (no argument), the same way the corpora are tokenised,
# so repeated or leading/trailing whitespace cannot produce empty tokens.
test_sentence = test_sentence_orig.split()
print(test_sentence)

# Padded per-token encodings fed to the classifiers
test_data_general = encode_sentences([test_sentence], word2vec_general, max_length)
test_data_symbolic = encode_sentences([test_sentence], word2vec_symbolic, max_length)

# Mean word vector of the sentence in each embedding space (in-vocabulary words only)
test_embed_general = np.mean([word2vec_general.wv[word] for word in test_sentence if word in word2vec_general.wv], axis=0)
test_embed_symbolic = np.mean([word2vec_symbolic.wv[word] for word in test_sentence if word in word2vec_symbolic.wv], axis=0)

# Inference only: no gradients are needed for the forward passes.
with torch.no_grad():
    prediction_base = model_base(test_data_general).item()
    prediction_symbolic = model_symbolic(test_data_symbolic).item()

# The nearest-sentence lookup is NumPy/SciPy only, so it runs outside no_grad.
closest_general = find_closest_sentences(test_embed_general, general_sentences, word2vec_general)
closest_symbolic = find_closest_sentences(test_embed_symbolic, spatial_sentences, word2vec_symbolic)


print("Predicción del Modelo Base:", prediction_base)
print("Predicción del Modelo Simbólico:", prediction_symbolic)
print("Frases más cercanas en general:", closest_general)
print("Frases más cercanas en simbólico:", closest_symbolic)

[Modelo Base] Epoch 1, Loss: 0.6714127659797668
[Modelo Base] Epoch 2, Loss: 0.6620928645133972
[Modelo Base] Epoch 3, Loss: 0.6526336073875427
[Modelo Base] Epoch 4, Loss: 0.6427007913589478
[Modelo Base] Epoch 5, Loss: 0.6325231194496155
[Modelo Base] Epoch 6, Loss: 0.6224822402000427
[Modelo Base] Epoch 7, Loss: 0.6130567193031311
[Modelo Base] Epoch 8, Loss: 0.6048005223274231
[Modelo Base] Epoch 9, Loss: 0.5983045697212219
[Modelo Base] Epoch 10, Loss: 0.594116747379303
[Modelo Base] Epoch 11, Loss: 0.592588484287262
[Modelo Base] Epoch 12, Loss: 0.5936216115951538
[Modelo Base] Epoch 13, Loss: 0.5964096188545227
[Modelo Base] Epoch 14, Loss: 0.5995622277259827
[Modelo Base] Epoch 15, Loss: 0.6018022894859314
[Modelo Base] Epoch 16, Loss: 0.6025369167327881
[Modelo Base] Epoch 17, Loss: 0.6018377542495728
[Modelo Base] Epoch 18, Loss: 0.6001458764076233
[Modelo Base] Epoch 19, Loss: 0.598008394241333
[Modelo Base] Epoch 20, Loss: 0.5959154367446899
[Modelo Base] Epoch 21, Loss: 0.