In [3]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np
import zipfile

def load_embeddings(embeddings_zip, embeddings_file):
  """Loads embeddings from a Numberbatch file inside a zip archive.

  Args:
      embeddings_zip: Path to the 'numberbatch-en.zip' file.
      embeddings_file: Name of the 'numberbatch-en.txt' file within the zip archive.

  Returns:
      A tuple of (embeddings, vocabulary)
          embeddings: A NumPy array of shape (num_concepts, embedding_dim)
          vocabulary: A list of concept labels in the order they appear in the embeddings array
  """

  with zipfile.ZipFile(embeddings_zip, 'r') as zip_ref:
    with zip_ref.open(embeddings_file) as f:
      header = f.readline().decode('utf-8').split()  # Read dimensions
      num_concepts, embedding_dim = int(header[0]), int(header[1])
      print("num_concepts: ", num_concepts, " embedding dim: ", embedding_dim)

      embeddings = np.zeros((num_concepts, embedding_dim))
      vocabulary = []

      for i, line in enumerate(f):
        parts = line.decode('utf-8').strip().split()  # Decode bytes
        vocabulary.append(parts[0])

        try:
          embeddings[i] = np.array(parts[1:], dtype=np.float32)
        except ValueError as e:
          print(f"Error on line {i+2}: {line}")
          print(f"Error message: {e}")

  return embeddings, vocabulary

# Load the embeddings and vocabulary
embeddings, vocabulary = load_embeddings("/content/drive/My Drive/WikiData/numberbatch-en.zip", "numberbatch-en.txt")
print(embeddings.shape)
print(len(vocabulary))


num_concepts:  516782  embedding dim:  300
(516782, 300)
516782


In [5]:
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModel

# Load pre-trained BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # Or another suitable BERT model
model1 = AutoModel.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [7]:
import gzip
conceptnet_data = {}
with gzip.open("/content/drive/My Drive/WikiData/conceptnet-assertions-5.7.0.csv.gz", 'rt') as f:  # 'rt' for reading text
    for line in f:
        _, relation, start_concept, end_concept, extra_info = line.split('\t')
        extra_info_dict = json.loads(extra_info)  # Load JSON from extra_info

        if 'surfaceText' in extra_info_dict:
            surface_text = extra_info_dict['surfaceText']

            if start_concept not in conceptnet_data:
                 conceptnet_data[start_concept] = []
            if surface_text != None and surface_text!="null":
              conceptnet_data[start_concept].append(surface_text)  # Store the surface text
            # Include sense_label
            if 'sense_label' in extra_info_dict:
                sense_label = extra_info_dict['sense_label']
            else:
                sense_label = None  # Handle cases where it might be absent
            if 'label' in extra_info_dict:
                label = extra_info_dict['label']
            else:
                label = None
            if label != None and label!="null":
                conceptnet_data[start_concept].append(label)
            if sense_label != None and sense_label!="null":
                conceptnet_data[start_concept].append(sense_label)





In [8]:
test = list(conceptnet_data.keys())[50:60]
print(len(list(conceptnet_data.keys())))

for t in test:
  print(t,": ",conceptnet_data[t])

1183400
/c/en/animal :  ['[[animal]] is the opposite of [[bird]]', '[[animal]] is the opposite of [[human]]', '[[animal]] is the opposite of [[human plants]]', '[[animal]] is the opposite of [[man]]', '[[animal]] is the opposite of [[mineral]]', '[[animal]] is the opposite of [[mineral vegetable]]', '[[animal]] is the opposite of [[person]]', '[[animal]] is the opposite of [[plant]]', '[[animal]] is the opposite of [[plants]]', '[[animal]] is the opposite of [[vegetable]]', '[[animal]] is the opposite of [[vegetable mineral]]', 'You are likely to find [[an animal]] in [[a cafe]]', 'You are likely to find [[an animal]] in [[the Detroit zoo]]', 'You are likely to find [[an animal]] in [[every breathing thing]]', 'You are likely to find [[an animal]] in [[North America]]', 'You are likely to find [[an animal]] in [[outside]]', '*Something you find at [[the park]] is [[animal]]', 'You are likely to find [[an animal]] in [[a pet shop]]', 'You are likely to find [[an animal]] in [[a pet stor

In [9]:
print(vocabulary.index("able"))

s= "/c/en/able"

print(s[6:])
print(s[6])
print(s[:6])

1479
able
a
/c/en/


In [10]:
import requests
import torch

import gzip
import json
import requests
import torch
import pandas as pd
def get_concept_description(concept_label):
    if concept_label in conceptnet_data:
        return conceptnet_data[concept_label]
    else:
        print("No descriptions found for:", concept_label)
        return []

# Create an empty list to store the data for the dataframe
data = []


i=0
# Iterate directly through conceptnet_data
for concept_label, descriptions in conceptnet_data.items():
    if concept_label[6] != 'a' or concept_label[:6]!="/c/en/":
      continue
    i=i+1
    if descriptions:
        if i%1000==0:
          print(i,": ",concept_label)
        try:
            embedding = embeddings[vocabulary.index(concept_label[6:])]
        except ValueError:
            print("ERROR: ",concept_label)
            embedding = None  # Handle cases where concept_label might not be in 'vocabulary'
            continue
        text_embeddings = []
        for description in descriptions:
            inputs = tokenizer(description, padding=True, truncation=True, return_tensors="pt")
            with torch.no_grad():
                outputs = model1(**inputs)
                token_embeddings = outputs.last_hidden_state
            text_embeddings.append(token_embeddings)
        data.append({'text': text_embeddings, 'embedding': embedding})
# Create the pandas dataframe
df = pd.DataFrame(data)

# Display the dataframe
print(df)
print(len(df))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ERROR:  /c/en/advertent/a/wn
ERROR:  /c/en/advertised/a/wn
ERROR:  /c/en/advised/a/wn
ERROR:  /c/en/aerated/a/wn
ERROR:  /c/en/aeriform/a/wn
ERROR:  /c/en/aerobic/a/wn
ERROR:  /c/en/aerobiotic/a/wn
ERROR:  /c/en/aerosolised/a/wn
ERROR:  /c/en/aesthetic/a/wn
ERROR:  /c/en/aestival/a/wn
ERROR:  /c/en/afeared/a/wn
ERROR:  /c/en/affable/a/wn
ERROR:  /c/en/affecting/a/wn
ERROR:  /c/en/affectional/a/wn
ERROR:  /c/en/affectionate/a/wn
ERROR:  /c/en/affiliated/a/wn
ERROR:  /c/en/affined/a/wn
ERROR:  /c/en/affirmable/a/wn
ERROR:  /c/en/affixed/a/wn
ERROR:  /c/en/afflicted/a/wn
ERROR:  /c/en/afflictive/a/wn
ERROR:  /c/en/affordable/a/wn
ERROR:  /c/en/afloat/a/wn
ERROR:  /c/en/aflutter/a/wn
ERROR:  /c/en/afoot/a/wn
ERROR:  /c/en/aforementioned/a/wn
ERROR:  /c/en/aforethought/a/wn
ERROR:  /c/en/afro_american/a/wn
ERROR:  /c/en/after/a/wn
ERROR:  /c/en/after_hours/a/wn
ERROR:  /c/en/after_school/a/wn
ERROR:  /c/en/aftermost/a/wn
ERROR

In [11]:
# Save the dataframe as a CSV file
df.to_csv("concept_embeddings.csv")

In [28]:
import torch.utils.data.dataset as dataset

class WikiDataset(dataset.Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Flatten text embeddings into a single tensor
        text_embeddings = torch.cat([token_embedding
                                     for description in self.df.iloc[index]['text']
                                     for token_embedding in description])
        # Convert kg_embedding to a PyTorch tensor
        kg_embedding = torch.tensor(self.df.iloc[index]['embedding']).float()
        return text_embeddings, kg_embedding


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

# **1. Text Encoder**
class TextEncoder(nn.Module):
    def __init__(self, bert_embedding_size, lstm_hidden_size, encoder_hidden_size, latent_size):
        super().__init__()
        self.lstm1 = nn.LSTM(bert_embedding_size, lstm_hidden_size)
        self.lstm2 = nn.LSTM(lstm_hidden_size, lstm_hidden_size)

        self.linear1 = nn.Linear(lstm_hidden_size, encoder_hidden_size)
        self.linear2 = nn.Linear(encoder_hidden_size, encoder_hidden_size)
        self.linear3 = nn.Linear(encoder_hidden_size, encoder_hidden_size)
        self.linear4 = nn.Linear(encoder_hidden_size, latent_size)
        self.linear_mu = nn.Linear(latent_size, latent_size)
        self.linear_logvar = nn.Linear(latent_size, latent_size)

    def forward(self, text_embeddings):
        out, hidden = self.lstm1(text_embeddings)
        out, hidden = self.lstm2(out, hidden)    # Pass to second lstm
        out = hidden[0][-1]

        out = torch.relu(self.linear1(out))  # Encoder layers
        out = torch.relu(self.linear2(out))
        out = torch.relu(self.linear3(out))
        out = torch.relu(self.linear4(out))


        mu = self.linear_mu(out)
        logvar = self.linear_logvar(out)

        # Reparameterization trick for backpropagation through sampling
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        z = eps.mul(std).add_(mu)  # Sample from the latent distribution
        return z, mu, logvar

# **2. Knowledge Graph Embedding Decoder**
class KGDecoder(nn.Module):
    def __init__(self, latent_size, decoder_hidden_size, kg_embedding_size):
        super().__init__()
        self.linear1 = nn.Linear(latent_size, decoder_hidden_size)
        self.linear2 = nn.Linear(decoder_hidden_size, decoder_hidden_size)  # Additional layers
        self.linear3 = nn.Linear(decoder_hidden_size, decoder_hidden_size)
        self.linear4 = nn.Linear(decoder_hidden_size, decoder_hidden_size)
        self.output_layer = nn.Linear(decoder_hidden_size, kg_embedding_size)

    def forward(self, z):
        out = torch.relu(self.linear1(z))
        out = torch.relu(self.linear2(out))
        out = torch.relu(self.linear3(out))
        out = torch.relu(self.linear4(out))
        out = self.output_layer(out)
        return out


# **3. Model, Loss, and Training**
class HybridVAE(nn.Module):
    def __init__(self, text_encoder, kg_decoder):
        super().__init__()
        self.text_encoder = text_encoder
        self.kg_decoder = kg_decoder

    def forward(self, text_embeddings):
        z, mu, logvar = self.text_encoder(text_embeddings)
        kg_embeddings_pred = self.kg_decoder(z)
        return kg_embeddings_pred, mu, logvar




# **Modified Loss Function**
def loss_function(kg_embeddings_pred, kg_embeddings, mu, logvar, lengths):
    kg_embeddings_pred = kg_embeddings_pred[0]
    reconstruction_losses = F.mse_loss(kg_embeddings_pred, kg_embeddings, reduction='none')
    reconstruction_loss = torch.sum(reconstruction_losses * lengths.float().unsqueeze(1)) / torch.sum(lengths)  # Weighted average
    kl_divergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return reconstruction_loss + kl_divergence




import torch
from torch.nn.utils.rnn import pad_sequence

def custom_batch_generator(dataset, batch_size):
    batch = []
    for i in range(len(dataset)):
        sample = dataset[i]
        text_embeddings, kg_embedding = sample[0], sample[1]
        batch.append((text_embeddings.clone().detach(), kg_embedding))
        if len(batch) == batch_size:
            lengths = torch.tensor([len(x) for x, _ in batch])
            # Packing logic
            text_embeddings_packed = pad_sequence(
                [x[0] for x in batch],  # Extract text embeddings
                batch_first=False
            )
            kg_embeddings = torch.stack([y for _, y in batch])
            yield text_embeddings_packed, kg_embeddings, lengths
            batch = []

    # Yield the last batch if it has less than batch_size elements
    if batch:
        lengths = torch.tensor([len(x) for x, _ in batch])
        text_embeddings_packed = pad_sequence(
                [x[0] for x in batch],  # Extract text embeddings
                batch_first=True
            )
        kg_embeddings = torch.stack([y for _, y in batch])
        yield text_embeddings_packed, kg_embeddings, lengths






# Instantiate models (adjust hyperparameters)
text_encoder = TextEncoder(bert_embedding_size=768,  # If using BERT-base
                           lstm_hidden_size=256,
                           encoder_hidden_size=128,
                           latent_size=32)

kg_decoder = KGDecoder(latent_size=32,
                       decoder_hidden_size=128,
                       kg_embedding_size=300)

model = HybridVAE(text_encoder, kg_decoder)  # Clean instantiation


# **Training Loop**
optimizer = optim.Adam(model.parameters())

for epoch in range(500):
    total_loss=0
    for text_embeddings_packed, kg_embeddings, lengths in custom_batch_generator(WikiDataset(df), batch_size=8):
        optimizer.zero_grad()
        # Encoding
        z, mu, logvar = model.text_encoder(text_embeddings_packed)

        # Decoding
        kg_embeddings_pred = model.kg_decoder(z)

        # Loss Calculation
        loss = loss_function(kg_embeddings_pred, kg_embeddings, z, mu, lengths)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()  # Add to epoch loss
    # Print epoch information
    epoch_loss = total_loss / 1000
    print(f'Epoch {epoch+1}: Loss - {epoch_loss:.4f}')



  reconstruction_losses = F.mse_loss(kg_embeddings_pred, kg_embeddings, reduction='none')
  reconstruction_losses = F.mse_loss(kg_embeddings_pred, kg_embeddings, reduction='none')


Epoch 1: Loss - 4.0131
Epoch 2: Loss - 0.1480
Epoch 3: Loss - 0.1470
Epoch 4: Loss - 0.1465
Epoch 5: Loss - 0.1467
Epoch 6: Loss - 0.1671
Epoch 7: Loss - 0.1419
Epoch 8: Loss - 0.1417
Epoch 9: Loss - 0.1452
Epoch 10: Loss - 0.1418
Epoch 11: Loss - 0.1527
Epoch 12: Loss - 0.1408
Epoch 13: Loss - 0.1413
Epoch 14: Loss - 0.1431
Epoch 15: Loss - 0.1418
Epoch 16: Loss - 0.1415
Epoch 17: Loss - 0.1434
Epoch 18: Loss - 0.1427
Epoch 19: Loss - 0.1420
Epoch 20: Loss - 0.1446
Epoch 21: Loss - 0.1416
Epoch 22: Loss - 0.1412
Epoch 23: Loss - 0.1494
Epoch 24: Loss - 0.1402
Epoch 25: Loss - 0.1416
Epoch 26: Loss - 0.1405
Epoch 27: Loss - 0.1461
Epoch 28: Loss - 0.1407
Epoch 29: Loss - 0.1406


In [42]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

# **1. Text Encoder**
class TextEncoder(nn.Module):
    def __init__(self, bert_embedding_size, lstm_hidden_size, encoder_hidden_size, latent_size):
        super().__init__()
        self.lstm1 = nn.LSTM(bert_embedding_size, lstm_hidden_size)
        self.lstm2 = nn.LSTM(lstm_hidden_size, lstm_hidden_size)
        self.lstm3 = nn.LSTM(lstm_hidden_size, lstm_hidden_size)
        self.lstm4 = nn.LSTM(lstm_hidden_size, lstm_hidden_size)
        self.lstm5 = nn.LSTM(lstm_hidden_size, lstm_hidden_size)

        self.linear1 = nn.Linear(lstm_hidden_size, encoder_hidden_size)
        self.linear2 = nn.Linear(encoder_hidden_size, encoder_hidden_size)
        self.linear3 = nn.Linear(encoder_hidden_size, encoder_hidden_size)
        self.linear4 = nn.Linear(encoder_hidden_size, encoder_hidden_size)
        self.linear5 = nn.Linear(encoder_hidden_size, encoder_hidden_size)
        self.linear6 = nn.Linear(encoder_hidden_size, encoder_hidden_size)
        self.linear7 = nn.Linear(encoder_hidden_size, encoder_hidden_size)
        self.linear8 = nn.Linear(encoder_hidden_size, latent_size)
        self.linear_mu = nn.Linear(latent_size, latent_size)
        self.linear_logvar = nn.Linear(latent_size, latent_size)

    def forward(self, text_embeddings):
        out, hidden = self.lstm1(text_embeddings)
        out, hidden = self.lstm2(out, hidden)    # Pass to second lstm
        out, hidden = self.lstm3(out, hidden)
        out, hidden = self.lstm4(out, hidden)
        out, hidden = self.lstm5(out, hidden)
        out = hidden[0][-1]

        out = torch.relu(self.linear1(out))  # Encoder layers
        out = torch.relu(self.linear2(out))
        out = torch.relu(self.linear3(out))
        out = torch.relu(self.linear4(out))
        out = torch.relu(self.linear5(out))
        out = torch.relu(self.linear6(out))
        out = torch.relu(self.linear7(out))
        out = torch.relu(self.linear8(out))


        mu = self.linear_mu(out)
        logvar = self.linear_logvar(out)

        # Reparameterization trick for backpropagation through sampling
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        z = eps.mul(std).add_(mu)  # Sample from the latent distribution
        return z, mu, logvar

# **2. Knowledge Graph Embedding Decoder**
class KGDecoder(nn.Module):
    def __init__(self, latent_size, decoder_hidden_size, kg_embedding_size):
        super().__init__()
        self.linear1 = nn.Linear(latent_size, decoder_hidden_size)
        self.linear2 = nn.Linear(decoder_hidden_size, decoder_hidden_size)  # Additional layers
        self.linear3 = nn.Linear(decoder_hidden_size, decoder_hidden_size)
        self.linear4 = nn.Linear(decoder_hidden_size, decoder_hidden_size)
        self.linear5 = nn.Linear(decoder_hidden_size, decoder_hidden_size)
        self.linear6 = nn.Linear(decoder_hidden_size, decoder_hidden_size)
        self.linear7 = nn.Linear(decoder_hidden_size, decoder_hidden_size)  # Additional layers
        self.linear8 = nn.Linear(decoder_hidden_size, decoder_hidden_size)
        self.linear9 = nn.Linear(decoder_hidden_size, decoder_hidden_size)
        self.linear10 = nn.Linear(decoder_hidden_size, decoder_hidden_size)
        self.linear11 = nn.Linear(decoder_hidden_size, decoder_hidden_size)
        self.output_layer = nn.Linear(decoder_hidden_size, kg_embedding_size)

    def forward(self, z):
        out = torch.relu(self.linear1(z))
        out = torch.relu(self.linear2(out))
        out = torch.relu(self.linear3(out))
        out = torch.relu(self.linear4(out))
        out = torch.relu(self.linear5(out))
        out = torch.relu(self.linear6(out))
        out = torch.relu(self.linear7(out))
        out = torch.relu(self.linear8(out))
        out = torch.relu(self.linear9(out))
        out = torch.relu(self.linear10(out))
        out = torch.relu(self.linear11(out))
        out = self.output_layer(out)
        return out


# **3. Model, Loss, and Training**
class HybridVAE(nn.Module):
    def __init__(self, text_encoder, kg_decoder):
        super().__init__()
        self.text_encoder = text_encoder
        self.kg_decoder = kg_decoder

    def forward(self, text_embeddings):
        z, mu, logvar = self.text_encoder(text_embeddings)
        kg_embeddings_pred = self.kg_decoder(z)
        return kg_embeddings_pred, mu, logvar




# **Modified Loss Function**
def loss_function(kg_embeddings_pred, kg_embeddings, mu, logvar, lengths):
    kg_embeddings_pred = kg_embeddings_pred[0]
    reconstruction_losses = F.mse_loss(kg_embeddings_pred, kg_embeddings, reduction='none')
    reconstruction_loss = torch.sum(reconstruction_losses * lengths.float().unsqueeze(1)) / torch.sum(lengths)  # Weighted average
    kl_divergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return reconstruction_loss + kl_divergence




import torch
from torch.nn.utils.rnn import pad_sequence

def custom_batch_generator(dataset, batch_size):
    batch = []
    for i in range(len(dataset)):
        sample = dataset[i]
        text_embeddings, kg_embedding = sample[0], sample[1]
        batch.append((text_embeddings.clone().detach(), kg_embedding))
        if len(batch) == batch_size:
            lengths = torch.tensor([len(x) for x, _ in batch])
            # Packing logic
            text_embeddings_packed = pad_sequence(
                [x[0] for x in batch],  # Extract text embeddings
                batch_first=False
            )
            kg_embeddings = torch.stack([y for _, y in batch])
            yield text_embeddings_packed, kg_embeddings, lengths
            batch = []

    # Yield the last batch if it has less than batch_size elements
    if batch:
        lengths = torch.tensor([len(x) for x, _ in batch])
        text_embeddings_packed = pad_sequence(
                [x[0] for x in batch],  # Extract text embeddings
                batch_first=True
            )
        kg_embeddings = torch.stack([y for _, y in batch])
        yield text_embeddings_packed, kg_embeddings, lengths






# Instantiate models (adjust hyperparameters)
text_encoder = TextEncoder(bert_embedding_size=768,  # If using BERT-base
                           lstm_hidden_size=256,
                           encoder_hidden_size=128,
                           latent_size=32)

kg_decoder = KGDecoder(latent_size=32,
                       decoder_hidden_size=128,
                       kg_embedding_size=300)

model = HybridVAE(text_encoder, kg_decoder)  # Clean instantiation


# **Training Loop**
optimizer = optim.Adam(model.parameters())

for epoch in range(500):
    total_loss=0
    for text_embeddings_packed, kg_embeddings, lengths in custom_batch_generator(WikiDataset(df), batch_size=8):
        optimizer.zero_grad()
        # Encoding
        z, mu, logvar = model.text_encoder(text_embeddings_packed)

        # Decoding
        kg_embeddings_pred = model.kg_decoder(z)

        # Loss Calculation
        loss = loss_function(kg_embeddings_pred, kg_embeddings, z, mu, lengths)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()  # Add to epoch loss
    # Print epoch information
    epoch_loss = total_loss / 1000
    print(f'Epoch {epoch+1}: Loss - {epoch_loss:.4f}')



  reconstruction_losses = F.mse_loss(kg_embeddings_pred, kg_embeddings, reduction='none')
  reconstruction_losses = F.mse_loss(kg_embeddings_pred, kg_embeddings, reduction='none')


Epoch 1: Loss - 4.7251


KeyboardInterrupt: 



"What is the capital of France?",
    "Who was the 22nd president?",
    "Who fought the bravest in the eastern front of WW2?",
    "Who was the most sadistic figure in history?",
    "Which straight public figure was actually homosexual?",
    "Flying high in april and shot down in may",
    "Gubernatorial woman",
    "all your friends are being outcompeted in Piston Cup"

In [41]:
from torch.nn.utils.rnn import pad_sequence
# *** Placeholders for your 8 input queries ***
input_queries = [
    "Name a large [[country]] in [[America]].",
    "What is the name of a [[tall mountain range]] in [[Europe]]?",
    "What is a soft-fleeced [[animal]] related to the camel?",
    "What is the [[chemical symbol]] for [[gold]]?",
    "What is the largest [[organ]] in the [[human body]]? ",
    "Which ancient [[civilization]] built the [[pyramids]]?",
    "Which famous [[Greek philosopher]] was the [[teacher]] of [[Alexander the Great]]?",
    "What is the [[capital city]] of [[France]]?"
]


# BERT encoder, tokenizer
model1 = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



# Encode input queries using BERT and HybridVAE

with torch.no_grad():
  encoded_queries = []
  max_length = 0  # Initialize max_length
  for query in input_queries:
    inputs = tokenizer(query, return_tensors="pt")
    attention_mask = inputs.pop('attention_mask')
    max_length = max(max_length, attention_mask.shape[-1])

    outputs = model1(**inputs)
    token_embeddings = outputs.last_hidden_state
    #print(token_embeddings.shape)
    #print(token_embeddings.squeeze(0).shape)
    encoded_queries.append(token_embeddings.squeeze(0))
  padded_queries = pad_sequence(encoded_queries, batch_first=False, padding_value=0)
  text_embeddings = padded_queries
  print(text_embeddings.shape)


  # VAE Encoding
  latent_embeddings, _, _  = model.text_encoder(text_embeddings)
  print(latent_embeddings.shape)

  # VAE Decoding
  kg_embeddings = model.kg_decoder(latent_embeddings)
  kg_embeddings_np = kg_embeddings.numpy()

print(kg_embeddings_np.shape)
def calculate_cosine_similarity(emb, concept_embedding):
    dot_product = np.dot(emb, concept_embedding)
    magnitude_emb = np.linalg.norm(emb)
    magnitude_concept_embedding = np.linalg.norm(concept_embedding)
    cos_sim = dot_product / (magnitude_emb * magnitude_concept_embedding)
    return cos_sim

# Find nearest neighbors in ConceptNet embeddings
for query_idx, emb in enumerate(kg_embeddings_np):
    # Calculate cosine similarities against each embedding individually
    distances = []
    for idx, concept_embedding in enumerate(embeddings):
        cos_sim = calculate_cosine_similarity(emb, concept_embedding)
        distances.append(cos_sim)

    top_5_indices = np.argsort(distances)[-5:]

    print(f"Query: {input_queries[query_idx]}")
    print("Closest Entities:")

    for concept_idx in top_5_indices:
        concept_label = vocabulary[concept_idx]
        distance = distances[concept_idx]  # Use the index for distances
        print(f"  - Concept Label: {concept_label} (Distance: {distance:.4f})")


torch.Size([26, 8, 768])
torch.Size([8, 32])
(8, 300)
Query: Name a large [[continent]] in the [[Western Hemisphere]].
Closest Entities:
  - Concept Label: oxyarc (Distance: 0.3522)
  - Concept Label: iva (Distance: 0.3551)
  - Concept Label: almucantar_staff (Distance: 0.3556)
  - Concept Label: lawn_dart_effect (Distance: 0.3567)
  - Concept Label: aca (Distance: 0.3618)


KeyboardInterrupt: 