# Script de Geração de Perguntas
Gerar perguntas a partir do Grafo de Conhecimento (PetroKGraph) e criar o contexto relevante para cada pergunta.

To begin, we need to import the necessary modules and set up the environment variables.

#### Install LangChain and Neo4j

```bash
!pip install networkx 
!pip install node2vec 
!pip install rdflib

!pip install openai
!pip install langchain
!pip install neo4j
!pip install transformers
```

In [49]:
import os
import json
import torch
import sentence_transformers
import rdflib
import networkx as nx
from node2vec import Node2Vec

### **✅ Carregar o PetroKGraph**

In [386]:
from rdflib import Graph, URIRef, Namespace, Literal

g = rdflib.Graph()
g.parse("C:\Projetos GIT\LangChain\PetroKGraph_update.rdf", format="xml") 

G = nx.Graph()

# Añadir nodos y aristas del grafo RDF al grafo de NetworkX
for subj, pred, obj in g:
   
    G.add_edge(str(subj), str(obj), label=str(pred))

In [None]:
# Imprimir nodos
print("Nodos del grafo:")
print(G.edges())

In [184]:
print(f"* * * * * * * * *")
# Numero de nodos, ejes del grafo y grado medio
nodos = G.number_of_nodes()
ejes = G.number_of_edges()
k = ejes*2/nodos
densidad = nx.density(G)
print(f"Grafo con {nodos} nodos, {ejes} ejes, densidad {densidad} y grado medio {k}")

# Grado de los nodos: el número de conexiones de cada nodo
nx.degree(G)

* * * * * * * * *
Grafo con 2441 nodos, 4389 ejes, densidad 0.0014737980268769181 y grado medio 3.5960671855796806


In [None]:
# Imprimir nodos
print("Nodos del grafo:")
print(G.nodes())

In [None]:
print("\nAristas del grafo con etiquetas:")
for edge in G.edges(data=True):
    print(edge)

### ✅ Verificar etiquetas del Grafo

In [None]:
def verificar_etiquetas(grafo):
  """
  Recorre el grafo y verifica si cada nodo tiene una etiqueta asociada.
  """
  nodos_sin_etiqueta = []
  for nodo in grafo.nodes:
    if not grafo.has_node(nodo):
      continue

    etiqueta = grafo.nodes[nodo].get("label", None)
    if not etiqueta:
      nodos_sin_etiqueta.append(nodo)

  return nodos_sin_etiqueta

grafo = G

nodos_sin_etiqueta = verificar_etiquetas(grafo)
if nodos_sin_etiqueta:
  print(f"Se encontraron {len(nodos_sin_etiqueta)} nodos sin etiqueta:")
  print(nodos_sin_etiqueta)
else:
  print("Todos los nodos tienen etiqueta.")

In [81]:
def verificar_etiquetas_con_etiqueta(grafo):
  """
  Recorre el grafo y verifica si cada nodo tiene una etiqueta asociada.
  Si un nodo tiene etiqueta, imprime el nodo y su etiqueta.
  """
  nodos_sin_etiqueta = verificar_etiquetas(grafo)
  if nodos_sin_etiqueta:
    print(f"Se encontraron {len(nodos_sin_etiqueta)} nodos sin etiqueta:")
    for nodo in nodos_sin_etiqueta:
      etiqueta = grafo.nodes[nodo].get("label", None)
      if etiqueta:
        print(f"- {nodo}: {etiqueta}")
  else:
    print("Todos los nodos tienen etiqueta.")

# Ejemplo de uso
grafo = G
# Agregue nodos al grafo...

verificar_etiquetas_con_etiqueta(grafo)


Se encontraron 1240 nodos sin etiqueta:


In [18]:
def verificar_etiquetas_manual(grafo):
  """
  Recorre el grafo manualmente y verifica si cada nodo tiene una etiqueta asociada.
  Si un nodo tiene etiqueta, imprime el nodo y su etiqueta.
  """
  for nodo in grafo.nodes:
    if not grafo.has_node(nodo):
      continue

    etiqueta = grafo.nodes[nodo].get("label", None)
    if etiqueta:
      print(f"- {nodo}: {etiqueta}")
    #else: print(f"No hay nodos con atributos tipo -label-")

grafo = G
verificar_etiquetas_manual(grafo)


In [23]:
def verificar_etiquetas_con_nx_get_node_attributes(grafo):
  """
  Recorre el grafo y verifica si cada nodo tiene una etiqueta asociada utilizando nx.get_node_attributes.
  Si un nodo tiene etiqueta, imprime el nodo y su etiqueta.
  """
  for nodo, atributos in nx.get_node_attributes(grafo, name='label').items():
    if "label" in atributos:
      print(f"- {nodo}: {atributos['label']}")
    else: 
      print(f"No hay nodos con atributos tipo -label-")

grafo = nx.Graph()
verificar_etiquetas_con_nx_get_node_attributes(grafo)


### ✅ **Inspección de las Triplas**


Primero, inspeccionar las triplas para ver cuántos elementos contienen. al parecer las "triples" encontrados tienen una cantidad significativamente mayor de elementos de lo esperado, esto va en concordancia a lo estudiado del grafo, lo que indica que están representando algo más complejo, siendo que son descripciones más detalladas de entidades.

In [12]:
# Inspeccionar las triplas
for triple in G:
    print(triple)
    print(f"Number of elements in triple: {len(triple)}")
    if len(triple) <= 3:
        print("Found a non-triple element")


http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#EXTERNO_EST_FISICA_ROCHA_006
Number of elements in triple: 98
http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#geological_structure
Number of elements in triple: 90
http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#BASE_CD_BACIA_300
Number of elements in triple: 87
http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#basin
Number of elements in triple: 75
http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#CAMP_CD_CAMPO_0279
Number of elements in triple: 88
http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#BASE_CD_BACIA_281
Number of elements in triple: 87
http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#membro_000
Number of elements in triple: 80
http://www.w3.org/2002/07/owl#NamedIndividual
Number of elements in triple: 45
http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#POCO_CD_

Las "triplas" encontradas contienen descripciones más detalladas de entidades, descartar estos elementos resultaría en la pérdida de datos, entonces se adaptará el código para manejar este caso adecuadamente. Una forma de abordar esto es manejar triplas con un número variable de elementos. En lugar de asumir que cada triple tiene exactamente tres elementos.

### ✅ **Consultar o PetroKGraph atualizado para obter informação relevante**

In [402]:
from rdflib import Graph, URIRef, Namespace, Literal

fields = {}
basins = {}
wells  = {}
formations = {}
for s, p, o in g:
    if isinstance(s, URIRef) and "CAMP_CD_CAMPO" in s:
        campo_id = s.split("#")[1]
        if campo_id not in fields:
            fields[campo_id] = {"types": [], "located_in": [], "labels": [], "related": []}
        if "type" in p:
            fields[campo_id]["types"].append(o)
        elif "located_in" in p:
            fields[campo_id]["located_in"].append(o)
        elif "label" in p:
            fields[campo_id]["labels"].append(str(o))
        else:
            fields[campo_id]["related"].append((p, o))

    if isinstance(s, URIRef) and "BASE_CD_BACIA" in s:
        bacia_id = s.split("#")[1]
        if bacia_id not in basins:
            basins[bacia_id] = {"types": [], "located_in": [], "labels": []}
        if "type" in p:
            basins[bacia_id]["types"].append(o)
        elif "located_in" in p:
            basins[bacia_id]["located_in"].append(o)
        elif "label" in p:
            basins[bacia_id]["labels"].append(str(o))
            
    if isinstance(s, URIRef) and "POCO_CD_POCO" in s:
        poco_id = s.split("#")[1]
        if poco_id not in wells:
            wells[poco_id] = {"types": [], "located_in": [], "labels": [], "crosses": []}
        if "type" in p:
            wells[poco_id]["types"].append(o)
        elif "located_in" in p:
            wells[poco_id]["located_in"].append(o)
        elif "crosses" in p:
            wells[poco_id]["crosses"].append(str(o))
        elif "label" in p:
            wells[poco_id]["labels"].append(str(o))

    if isinstance(s, URIRef) and "formacao" in s:
        unidade_lito_id = s.split("#")[1]
        if unidade_lito_id not in formations:
            formations[unidade_lito_id] = {"types": [], "located_in": [], "has_age": [], "part_of": [], "constituted_by": [], "crosses": [],  "labels": []}
        if "type" in p:
            formations[unidade_lito_id]["types"].append(o)
        elif "located_in" in p:
            formations[unidade_lito_id]["located_in"].append(str(o))  
        elif "constituted_by" in p:
            formations[unidade_lito_id]["constituted_by"].append(o) 
        elif "has_age" in p:
            formations[unidade_lito_id]["has_age"].append(str(o))  
        elif "part_of" in p:
            formations[unidade_lito_id]["part_of"].append(o)   
        
        elif "crosses" in p:
            formations[unidade_lito_id]["crosses"].append(str(o))
        elif "label" in p:
            formations[unidade_lito_id]["labels"].append(str(o))
          

    if isinstance(s, URIRef) and "grupo" in s:
        unidade_lito_id = s.split("#")[1]
        if unidade_lito_id not in formations:
            formations[unidade_lito_id] = {"types": [], "located_in": [], "has_age": [], "part_of": [], "constituted_by": [], "crosses": [],  "labels": []}
        if "type" in p:
            formations[unidade_lito_id]["types"].append(o)
        elif "located_in" in p:
            formations[unidade_lito_id]["located_in"].append(str(o))  
        elif "constituted_by" in p:
            formations[unidade_lito_id]["constituted_by"].append(o) 
        elif "has_age" in p:
            formations[unidade_lito_id]["has_age"].append(str(o))  
        elif "part_of" in p:
            formations[unidade_lito_id]["part_of"].append(o)   
        elif "crosses" in p:
            formations[unidade_lito_id]["crosses"].append(str(o))
        elif "label" in p:
            formations[unidade_lito_id]["labels"].append(str(o))
          

    if isinstance(s, URIRef) and "membro" in s:
        unidade_lito_id = s.split("#")[1]
        if unidade_lito_id not in formations:
            formations[unidade_lito_id] = {"types": [], "located_in": [], "has_age": [], "part_of": [], "constituted_by": [], "crosses": [],  "labels": [] }
        if "type" in p:
            formations[unidade_lito_id]["types"].append(o)
        elif "located_in" in p:
            formations[unidade_lito_id]["located_in"].append(str(o))  
        elif "constituted_by" in p:
            formations[unidade_lito_id]["constituted_by"].append(o) 
        elif "has_age" in p:
            formations[unidade_lito_id]["has_age"].append(str(o))  
        elif "part_of" in p:
            formations[unidade_lito_id]["part_of"].append(o)   
        elif "crosses" in p:
            formations[unidade_lito_id]["crosses"].append(str(o))
        elif "label" in p:
            formations[unidade_lito_id]["labels"].append(str(o))
          

####  ✅ Corroborar informação extraida

In [None]:
# Formações extraídos
for forma, info in formations.items():
        print(f"** Unidade Litoestretigrafica: {forma}")
        print(f"  Parte de: {info['part_of']}")
        print(f"  Constituted  by: {info['constituted_by']}")
        print(f"  Crosses  by: {info['crosses']}")
        print(f"  Label: {info['labels']}")
        print(f"  Tem idade: {info['has_age']}")

In [None]:
# Campos extraídos
for campo, info in fields.items():
    print(f"Campo: {campo}")
    #print(f"  Tipos: {info['types']}")
    print(f"  Located in: {info['located_in']}")
    print(f"  Label: {info['labels']}")

In [None]:
# Bacias extraídos
for well, info in wells.items():
    print(f"Bacia: {well}")
    #print(f"  Tipos: {info['types']}")
    print(f"  Nombre del poço: {info['labels']}")
    print(f"  El poço atravessa: {info['labels']}")

### 🔎 **Gerar as perguntas baseado no PetroKGraph** ✅ OK

In [439]:
namespace_base = rdflib.Namespace("http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
rdf= Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

def gerar_perguntas(g):
    questions = []
    for campo, info in fields.items():
        if info["located_in"]:
            location = info["located_in"][0].split("#")[1]
            campo_name = info["labels"][0] if info["labels"] else campo
            location_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{location}"), rdfs.label)
            questions.append({
                "question": f"Onde está localizado o campo {campo_name}?",
                "answer": str(location_name),
                "context": f"O campo {campo_name} está localizado na bacia {location_name}."
            })

    for well, info in wells.items():
        if info["located_in"]:
            location = info["located_in"][0].split("#")[1]
            well_name = info["labels"][0] if info["labels"] else well
            location_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{location}"), rdfs.label)
            questions.append({
                "question": f"Onde está localizado o poço {well_name}?",
                "answer": str(location_name),
                "context": f"O poço {well_name} está localizado na bacia {location_name}."
            })

    for well, info in wells.items():
        if info["crosses"]:
            cross_by = info["crosses"][0].split("#")[1]
            well_name = info["labels"][0] if info["labels"] else well
            cross_by_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{cross_by}"), rdfs.label)
            questions.append({
                "question": f"Onde atraavessa o poço {well_name}?",
                "answer": str(cross_by_name),
                "context": f"O poço {well_name} atravessa {cross_by_name}."
            })

    for formacoes, info in formations.items():
        if info["constituted_by"]:
            constituted_by_names = []
            formation_name = info["labels"][0] if info["labels"] else formacoes

            # constituted_by_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{constituted_by}"), rdfs.label)
            
            for material in info["constituted_by"]:
                material_id = material.split("#")[1]
                material_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{material_id}"), rdfs.label)
                if material_name:
                    constituted_by_names.append(str(material_name))
        
            if constituted_by_names:
                constituted_by_materials = ", ".join(constituted_by_names)
                questions.append({
                    "question": f"Qual é o material da terra que está constituida a formação/grupo/membro {formation_name}?",
                    "answer": str(constituted_by_materials),
                    "context": f"A formação/grupo/membro {formation_name} está constituida por {constituted_by_materials}."
                })

    for formacoes, info in formations.items():
        if info["part_of"]:
            part_of = info["part_of"][0].split("#")[1]
            formation_name = info["labels"][0] if info["labels"] else formacoes
            part_of_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{part_of}"), rdfs.label)
            questions.append({
                "question": f"Qual é a entidade que faz parte de {formation_name}?",
                "answer": str(part_of_name),
                "context": f"A formação/grupo/membro {formation_name} faz parte de {part_of_name}."
            })

    for formacoes, info in formations.items():
        if info["has_age"]:
            has_age = info["has_age"][0].split("#")[1]
            
            formation_name = info["labels"][0] if info["labels"] else formacoes
            
            has_age_name_unit = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{has_age}"), rdf.type)
            #print (has_age_name_unit)
            has_age_name = list(g.objects(URIRef(f"{has_age_name_unit}"), rdfs.label))[2]
            #print (has_age_name)
            questions.append({
                "question": f"Qual é a idade geológica de {formation_name}?",
                "answer": str(has_age_name),
                "context": f"A idade geológica de {formation_name} é {has_age_name}."
            })

    for formacoes, info in formations.items():
        
        if info["located_in"]:
            location = info["located_in"][0].split("#")[1]
            if location != "basin" : 
                formation_name = info["labels"][0] if info["labels"] else formacoes
                location_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{location}"), rdfs.label)
                #print (f"{formation_name}  ---> {location_name}")
                questions.append({
                    "question": f"Qual é a localização de {formation_name}?",
                    "answer": str(location_name),
                    "context": f"A {formation_name} está localizada na bacia {location_name}."
                })

    query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ont: <http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#>

    SELECT ?field (COUNT(?well) as ?pozos)
    WHERE {
    ?well rdf:type ont:well .
    ?field rdf:type ont:field .
    ?well ont:located_in ?field .
    }
    GROUP BY ?field
    """

    results = g.query(query)

    for row in results:
        campo_uri = row.field
        count = row.pozos.toPython()  # Convertir Literal a un tipo Python nativo
        campo_name = g.value(campo_uri, rdfs.label)

        questions.append({
            "question": f"¿Quantos poços existem no campo {campo_name}?",
            "answer": str(count),
            "context": f"No campo {campo_name} existem no total {count} poços."
        })
                    
    return questions

In [None]:
namespace_base = rdflib.Namespace("http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
rdf= Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

def gerar_perguntas(g):
    questions = []
    for campo, info in fields.items():
        if info["located_in"]:
            location = info["located_in"][0].split("#")[1]
            campo_name = info["labels"][0] if info["labels"] else campo
            location_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{location}"), rdfs.label)
            questions.append({
                "question": f"Onde está localizado o campo {campo_name}?",
                "answer": str(location_name),
                "context": f"O campo {campo_name} está localizado na bacia {location_name}."
            })

    for well, info in wells.items():
        if info["located_in"]:
            location = info["located_in"][0].split("#")[1]
            well_name = info["labels"][0] if info["labels"] else well
            location_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{location}"), rdfs.label)
            questions.append({
                "question": f"Onde está localizado o poço {well_name}?",
                "answer": str(location_name),
                "context": f"O poço {well_name} está localizado na bacia {location_name}."
            })

    for well, info in wells.items():
        if info["crosses"]:
            cross_by = info["crosses"][0].split("#")[1]
            well_name = info["labels"][0] if info["labels"] else well
            cross_by_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{cross_by}"), rdfs.label)
            questions.append({
                "question": f"Onde atraavessa o poço {well_name}?",
                "answer": str(cross_by_name),
                "context": f"O poço {well_name} atravessa {cross_by_name}."
            })

    for formacoes, info in formations.items():
        if info["constituted_by"]:
            constituted_by_names = []
            formation_name = info["labels"][0] if info["labels"] else formacoes

            # constituted_by_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{constituted_by}"), rdfs.label)
            
            for material in info["constituted_by"]:
                material_id = material.split("#")[1]
                material_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{material_id}"), rdfs.label)
                if material_name:
                    constituted_by_names.append(str(material_name))
        
            if constituted_by_names:
                constituted_by_materials = ", ".join(constituted_by_names)
                questions.append({
                    "question": f"Qual é o material da terra que está constituida a formação/grupo/membro {formation_name}?",
                    "answer": str(constituted_by_materials),
                    "context": f"A formação/grupo/membro {formation_name} está constituida por {constituted_by_materials}."
                })

    for formacoes, info in formations.items():
        if info["part_of"]:
            part_of = info["part_of"][0].split("#")[1]
            formation_name = info["labels"][0] if info["labels"] else formacoes
            part_of_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{part_of}"), rdfs.label)
            questions.append({
                "question": f"Qual é a entidade que faz parte de {formation_name}?",
                "answer": str(part_of_name),
                "context": f"A formação/grupo/membro {formation_name} faz parte de {part_of_name}."
            })

    for formacoes, info in formations.items():
        if info["has_age"]:
            has_age = info["has_age"][0].split("#")[1]
            
            formation_name = info["labels"][0] if info["labels"] else formacoes
            
            has_age_name_unit = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{has_age}"), rdf.type)
            #print (has_age_name_unit)
            has_age_name = list(g.objects(URIRef(f"{has_age_name_unit}"), rdfs.label))[2]
            #print (has_age_name)
            questions.append({
                "question": f"Qual é a idade geológica de {formation_name}?",
                "answer": str(has_age_name),
                "context": f"A idade geológica de {formation_name} é {has_age_name}."
            })

    for formacoes, info in formations.items():
        
        if info["located_in"]:
            location = info["located_in"][0].split("#")[1]
            if location != "basin" : 
                formation_name = info["labels"][0] if info["labels"] else formacoes
                location_name = g.value(URIRef(f"http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#{location}"), rdfs.label)
                #print (f"{formation_name}  ---> {location_name}")
                questions.append({
                    "question": f"Qual é a localização de {formation_name}?",
                    "answer": str(location_name),
                    "context": f"A {formation_name} está localizada na bacia {location_name}."
                })

    query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ont: <http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#>

    SELECT ?field (COUNT(?well) as ?pozos)
    WHERE {
    ?well rdf:type ont:well .
    ?field rdf:type ont:field .
    ?well ont:located_in ?field .
    }
    GROUP BY ?field
    """

    results = g.query(query)

    for row in results:
        campo_uri = row.field
        count = row.pozos.toPython()  # Convertir Literal a un tipo Python nativo
        campo_name = g.value(campo_uri, rdfs.label)

        questions.append({
            "question": f"¿Quantos poços existem no campo {campo_name}?",
            "answer": str(count),
            "context": f"No campo {campo_name} existem no total {count} poços."
        })
                    
    return questions

#### ✅**Guardar as perguntas num arquivo JSON** 

In [440]:
# Gerar os pares pergunta-resposta
dados_treinamento = gerar_perguntas(g)

with open('dataset_perguntas_respostas_update.json', 'w', encoding='utf-8') as file:
    json.dump(dados_treinamento, file, ensure_ascii=False, indent=4)