# KGQA4MAT Questions and Queries Translation
- Method 1: Zero-shot learning from MOF-KG ontology only. Prompt ChatGPT with the MOF-KG ontology definitions. Instruct ChatGPT to translate a user question to a Cypher query based on the ontology.
- Method 2: 1-shot learning from a pair of train question and query. Using the embeddings of the test and train questions to find the most similar train question to the test question. Prompt ChatGPT with the pair of matched train question and query. Instruct ChatGPT to translate a test question to a Cypher query.
- Method 3: 1-shot learning from the MOF-KG ontology and a pair of train question and query. As in Method 2, include MOF-KG ontology in the prompt, in addition to a pair of matched question and query.
- Method 4: 1-shot learning from a pair of train question and query, and the chain-of-thought of the train query. As in Method 2, include the chain-of-thought of the train query in the prompt, in addition to the pair of matched question and query.
- Method 5: 1-shot learning from the MOF-KG ontology and a pair of train question and query, and the chain-of-thought of the train query. As in Method 3, include the chain-of-thought of the train query in the prompt, in addition to MOF-KG ontology and the pair of matched question and query.

In [None]:
import pandas as pd
import numpy as np
import json
import pickle, os, random

## Load the train-80 and test-20 datasets

In [None]:
test = pd.read_csv('../data/KGQA4MAT-test-Method1.csv')
test.shape

In [None]:
test.columns

In [None]:
test.head()

In [None]:
train = pd.read_csv('../data/KGQA4MAT-train.csv')
train.shape

In [None]:
train.head()

## Query Neo4j Database

In [None]:
from neo4j import GraphDatabase

class Neo4jConnection:
    """Small convenience wrapper around a Neo4j driver.

    Failures are reported with ``print`` instead of being raised: a driver
    that cannot be created leaves the connection unusable, and a query that
    fails yields ``None``.
    """

    def __init__(self, uri, user, pwd):
        """Store the credentials and try to create the driver."""
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(
                self.__uri,
                auth=(self.__user, self.__pwd),
            )
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        """Close the driver if one was created."""
        if self.__driver is None:
            return
        self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run ``query`` and return its records as a list.

        ``db`` selects a database when given; otherwise the driver's default
        session is used. Returns ``None`` when the query fails (the error is
        printed). Raises ``AssertionError`` if no driver is available.
        """
        assert self.__driver is not None, "Driver not initialized!"
        records = None
        active_session = None
        try:
            if db is None:
                active_session = self.__driver.session()
            else:
                active_session = self.__driver.session(database=db)
            # Materialise the result before the session is closed.
            records = list(active_session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if active_session is not None:
                active_session.close()
        return records

In [None]:
# Connection settings may be overridden through environment variables so the
# credentials do not have to live in the notebook; the defaults are the values
# that were previously hard-coded here.
conn = Neo4jConnection(uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
                       user=os.environ.get("NEO4J_USER", "neo4j"),
                       pwd=os.environ.get("NEO4J_PASSWORD", "neo4j4mofs"))

## Retrieve the Answers for Train and Test and Save Them as JSON Strings

In [None]:
# Run every train query against the database and keep each result set as a
# JSON string (a list of record dictionaries).
answer = []
for query in train['query']:
    records = conn.query(query)
    record_dicts = [dict(record) for record in records]
    answer.append(json.dumps(record_dicts))

In [None]:
len(answer)

In [None]:
# Sanity check: report train queries that returned no records.
# Each entry of `answer` is a JSON string, so an empty result is the
# (truthy) string '[]' and has to be compared explicitly.
empty_positions = [pos for pos, item in enumerate(answer) if item == '[]']
count = len(empty_positions)
for pos in empty_positions:
    print(pos, answer[pos])

In [None]:
train['answer'] = answer

In [None]:
# Run every test query against the database and keep each result set as a
# JSON string (a list of record dictionaries).
answer_test = []
for query in test['query']:
    records = conn.query(query)
    record_dicts = [dict(record) for record in records]
    answer_test.append(json.dumps(record_dicts))

In [None]:
len(answer_test)

In [None]:
test['answer'] = answer_test

## Generate explanations (chain of thought) for each train query

In [None]:
import openai

In [None]:
# Path of a text file whose first line holds the OpenAI API key.
openai_key_path = "OPENAI_KEY_PATH"

with open(openai_key_path, 'r') as f:
    # readline() keeps the trailing newline; strip it so the key is sent clean.
    openai_key = f.readline().strip()

openai.api_key = openai_key

In [None]:
from tqdm import tqdm

# Ask the chat model for a plain-language, step-by-step explanation of every
# train query. One API call is made per train row; the raw reply text is
# collected in `thoughts_train` in row order.
thoughts_train = []
for idx, row in tqdm(train.iterrows(), total=train.shape[0]):
    query = row['query']
    
    # The instruction is a single-line string built with backslash
    # continuations; the Cypher query is appended directly after "Query:".
    msg = "Explain the reasoning in the Cypher query in logical steps. \
       Use normal words in the explanation. No comments. \
       Output the explanation only. Query:"
    msg += query

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant focusing on \
                a Neo4j database called MOF-KG. The database is \
                about Metal-Organic Frameworks."},
            {"role": "user", "content": msg}
        ]
    )

    # Keep only the text of the first returned choice.
    thoughts_train.append(response['choices'][0]['message']['content'])

In [None]:
train['chainOfThought'] = [item.strip() for item in thoughts_train]

In [None]:
#train.to_csv('../data/KGQA4MAT-train-with-embeddings-cot.csv', index=None)

In [None]:
# Reload the train set from a saved CSV; this replaces the in-memory `train`
# built in the cells above.
# NOTE(review): judging by the file name this CSV already holds the question
# embeddings and chain-of-thought columns — confirm. List-valued columns come
# back from CSV as strings.
train = pd.read_csv("../data/KGQA4MAT-train-with-embeddings-cot.csv")
train.shape

In [None]:
train.columns

## Use ChatGPT to evaluate the test questions
- Method 1: Given the ontology, ask ChatGPT to directly translate the questions to queries
- Method 2: Given the training data, ask ChatGPT to use one shot (the most similar train question and its query) to translate the questions to queries
- Method 3: Given ontology and the training data, ask ChatGPT to use one shot (the most similar train question and its query) to translate the questions to queries
- Method 4: Given the training data, and chain-of-thought of the training data, ask ChatGPT to use one shot (the most similar train question and its query) to translate the questions to queries with chain of thought.
- Method 5: Given ontology, the training data, and chain-of-thought of the training data, ask ChatGPT to use one shot (the most similar train question and its query) to translate the questions to queries with chain of thought.

### Method 1: Given the ontology, ask ChatGPT to directly translate the questions to queries

In [None]:
# System prompt, part 1: the node definitions of the MOF-KG ontology.
# Each line has the form "Label"[property, ...]; the property tagged ":ID"
# is marked as that node's identifier. The text is sent to the model verbatim.
sys_content1 = """
Assuming you have a neo4j database called MOF-KG. The database has the following node definitions: 
"MOF"["refcode":ID, "disorder_details", "remarks", "r_factor", "radiation_source", "z_prime", "has_3d_structure", "has_disorder", "cell_volume", "z_value", "database_name", "cell_lengths", "cell_angles", "deposition_date", "source", "color", "mofid", "datasource", "deposition_number", "mofkey", "charged_framework", "hydrogen_added", "unreliable_chemistry", "name", "filename", "percentage_void_space", "melting_point", "density", "sohncke_space_group", "solvent", "pressure", "temperature", "chemical_name", "chemical_formula"];
"Atom"["name":ID];
"Publication"["doi":ID, "year", "first_page", "volume"];
"Journal"["name":ID];
"Author"["name":ID, "order"];
"Bond"["name":ID];
"Crystal_System"["name":ID];
"Family"["name":ID];
"Metal"["name":ID];
"Organic_Linker"["name":ID];
"Topology"["rcsr_code":ID];
"Space_Group"["name":ID];
"Synthesis_Procedure"["name", "syn_id":ID, "symbol", "method", "MOratio"];
"Title"["title":ID];
"Synthesis_Operation"["DataSource", "order", "op_id":ID, "name"];
"Synthesis_Operation_Condition"["Property", "Value", "Unit", "order", "cond_id":ID];
"Synthesis_Time"["time_id":ID, "Value", "Unit"];
"Synthesis_Temperature"["Unit", "temp_id":ID, "Value"];
"Synthesis_Property"["value", "unit", "prop_id":ID, "order", "prop"];
"Synthesis_Metal_Precursor"["name", "mprec_id":ID, "formula"];
"Metal_Composition"["comp_id":ID, "unit", "order", "value"];
"Synthesis_Organic_Precursor"["name", "oprec_id":ID, "smiles"];
"Organic_Composition"["comp_id":ID, "unit", "order", "value"];
"Synthesis_Solvent_Precursor"["sprec_id":ID, "name"];
"Solvent_Composition"["comp_id":ID, "unit", "order", "value"];
"Solvent_Precursor_Solvent"["solv_id":ID, "solvent", "order"].
"""

In [None]:
sys_content1

In [None]:
# System prompt, part 2: the relationships of the MOF-KG ontology, one
# "(:Source)->[:relationship]->(:Target)" entry per line, sent to the model
# verbatim.
# The has_Atom entry now targets the "Atom" label from the node definitions;
# it previously named "Space_Atom", a label the ontology does not define.
# NOTE(review): the "->[:rel]->" arrow notation is not Cypher pattern syntax
# ("-[:rel]->"); left as is because it is part of the evaluated prompt.
sys_content2 = """
MOF-KG has the following relationships:
"(:MOF)->[:has_topology]->(:Topology)";
"(:MOF)->[:has_organicLinker]->(:Organic_Linker)";
"(:MOF)->[:has_metal]->(:Metal)";
"(:MOF)->[:has_spaceGroup]->(:Space_Group)";
"(:MOF)->[:has_crystalSystem]->(:Crystal_System)";
"(:MOF)->[:has_family]->(:Family)";
"(:MOF)->[:has_bond]->(:Bond)";
"(:MOF)->[:has_Atom]->(:Atom)";
"(:MOF)->[:has_publication]->(:Publication)";
"(:Publication)->[:has_title]->(:Title)";
"(:Publication)->[:in_journal]->(:Journal)";
"(:Publication)->[:has_author]->(:Author)";
"(:Synthesis_Procedure)->[:has_publication]->(:Publication)";
"(:Synthesis_Procedure)->[:has_temperature]->(:Synthesis_Temperature)";
"(:Synthesis_Procedure)->[:has_time]->(:Synthesis_Time)";
"(:Synthesis_Procedure)->[:has_property]->(:Synthesis_Property)";
"(:Synthesis_Procedure)->[:has_metal_precursor]->(:Synthesis_Metal_Precursor)";
"(:Synthesis_Procedure)->[:has_organic_precursor]->(:Synthesis_Organic_Precursor)";
"(:Synthesis_Procedure)->[:has_solvent_precursor]->(:Synthesis_Solvent_Precursor)";
"(:Synthesis_Procedure)->[:has_operation]->(:Synthesis_Operation)";
"(:Synthesis_Operation)->[:has_condition]->(:Synthesis_Operation_Condition)";
"(:Synthesis_Operation)->[:has_next]->(:Synthesis_Operation)";
"(:Synthesis_Metal_Precursor)->[:has_composition]->(:Metal_Composition)";
"(:Synthesis_Organic_Precursor)->[:has_composition]->(:Organic_Composition)";
"(:Synthesis_Solvent_Precursor)->[:has_solvent]->(:Solvent_Precursor_Solvent)";
"(:Synthesis_Solvent_Precursor)->[:has_composition]->(:Solvent_Composition)".
"""

In [None]:
sys_content2

In [None]:
from tqdm import tqdm

# Method 1 (zero-shot): translate every test question into a Cypher query
# using only the ontology prompts (sys_content1 = nodes, sys_content2 =
# relationships). One API call is made per test row.
schema_query = []

for idx, row in tqdm(test.iterrows(), total=test.shape[0]):
    
    user_question = row['question']
    
    # Single-line instruction (backslash continuations); the question is
    # substituted into the "{}" placeholder.
    msg = "Translate the question to Cypher query using the MOF-KG database. \
        No comments. Output syntactically correct query only. End query with ';'. \
        Question:{}. Query:"
    msg = msg.format(user_question)
    
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": sys_content1},
            {"role": "system", "content": sys_content2},
            {"role": "user", "content": msg}
        ]
    )
    
    # Flatten the reply to one line so it can be stored as a single cell value.
    schema_query.append(response['choices'][0]['message']['content'].strip().replace('\n', ' ' ))

In [None]:
schema_query

In [None]:
schema_query_cln = [item.replace('Query:', '').strip() for item in schema_query]
schema_query_cln

In [None]:
#test['ontology_direct_query'] = schema_query_cln

In [None]:
# Run the Method 1 queries and store each result set as a JSON string;
# queries that fail or cannot be serialised are recorded as 'ERROR'.
# NOTE(review): this reads the 'schema_query' column of `test`; confirm the
# loaded CSV provides it (the generated list above is `schema_query_cln`).
schema_query_answer = []
for idx, row in tqdm(test.iterrows(), total=test.shape[0]):
    query = row['schema_query']
    results = conn.query(query)
    if results is None:
        # conn.query already printed the failure and signalled it with None.
        schema_query_answer.append('ERROR')
        continue
    try:
        schema_query_answer.append(json.dumps([dict(record) for record in results]))
    except (TypeError, ValueError):
        # The result holds values json cannot serialise.
        schema_query_answer.append('ERROR')

In [None]:
schema_query_answer

In [None]:
#test['ontology_direct_query_answer'] = schema_query_answer

In [None]:
#test[['question', 'query', 'answer', 'ontology_direct_query', 'ontology_direct_query_answer']].\
# to_csv('../data/KGQA4MAT-test-Method1-zero-shot-ontology-direct.csv', index=None)

### Recover the test data with qsq_idx, question_idx, etc

In [None]:
test_idxes = pd.read_csv('../data/KGQA4MAT-test-20-with-idxes.csv')
test_idxes.shape

In [None]:
test['qsq_list_idx'] = test_idxes.qsq_list_idx

In [None]:
test['question_idx'] = test_idxes.question_idx

In [None]:
#test[['question', 'qsq_list_idx', 'question_idx', 'query', 'answer', 'ontology_direct_query', \
# 'ontology_direct_query_answer']].\
# to_csv('../data/KGQA4MAT-test-Method1-zero-shot-ontology-direct.csv', index=None)

In [None]:
test_method1 = pd.read_csv("../data/KGQA4MAT-test-Method1-zero-shot-ontology-direct.csv")
test_method1.shape

In [None]:
test_method1.columns

### Embed the train questions for similar question search

In [None]:
# imports
import pandas as pd
import tiktoken

from openai.embeddings_utils import get_embedding

In [None]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

In [None]:
# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage

# This may take a few minutes
#train["question_embedding"] = train.question.apply(lambda x: get_embedding(x, engine=embedding_model))

In [None]:
#train.to_csv('../data/KGQA4MAT-train-with-embeddings-cot.csv', index=None)

### Embed test question and search similar train questions

In [None]:
from openai.embeddings_utils import get_embedding, cosine_similarity

In [None]:
test.columns

In [None]:
# Take an independent copy: columns are assigned on test_UiO66 in the cells
# below, and assigning to a slice of `test` triggers pandas'
# SettingWithCopyWarning and is not guaranteed to behave consistently.
test_UiO66 = test[['question', 'query']].copy()
test_UiO66.shape

In [None]:
MOF_names = {'HKUST-1':'UiO66', 'MOF-3':'MIL-53', 'MOF-5':'URMOF-2', 'IRMOF':'PCN'}

In [None]:
def multipleReplace(text, wordDict):
    """Apply every key -> value substitution of ``wordDict`` to ``text``.

    Substitutions run one after another in the dictionary's iteration order,
    so the output of an earlier replacement can be rewritten by a later one.
    """
    replaced = text
    for old, new in wordDict.items():
        replaced = replaced.replace(old, new)
    return replaced

In [None]:
test_UiO66['query'] = test_UiO66['query'].apply(lambda x: multipleReplace(x, MOF_names))

In [None]:
test_UiO66['question'] = test_UiO66.question.apply(lambda x: multipleReplace(x, MOF_names))

In [None]:
test_UiO66.head()

In [None]:
# Run the renamed gold queries and store each result set as a JSON string;
# queries that fail or cannot be serialised are recorded as 'ERROR'.
query_answer = []
#chatgpt_query_answer = []

for idx, row in tqdm(test_UiO66.iterrows(), total=test_UiO66.shape[0]):

    query = row['query']
    #chatgpt_query = row['chatgpt_test_query']

    results = conn.query(query)
    if results is None:
        # conn.query already printed the failure and signalled it with None.
        query_answer.append('ERROR')
        continue
    try:
        query_answer.append(json.dumps([dict(record) for record in results]))
    except (TypeError, ValueError):
        # The result holds values json cannot serialise.
        query_answer.append('ERROR')

In [None]:
# Attach the gold answers computed above (the list is named `query_answer`).
test_UiO66['answer'] = query_answer

In [None]:
# test_UiO66.to_csv("../data/KGQA4MAT-test.csv", index=None)

In [None]:
def search_question_query(df, input_question, n=3, pprint=True):
    """Find the train question most similar to ``input_question``.

    The question is embedded with text-embedding-ada-002 and compared by
    cosine similarity against ``df.question_embedding``. As a side effect a
    ``similarity`` column is written to ``df``.

    ``n`` only limits how many top candidates are kept before the best one is
    picked; ``pprint`` is accepted but not used.

    Returns a tuple ``(input embedding, best question, best query)``.
    """
    query_vector = get_embedding(
        input_question,
        engine="text-embedding-ada-002"
    )

    def _score(train_vector):
        return cosine_similarity(train_vector, query_vector)

    df["similarity"] = df.question_embedding.apply(_score)

    ranked = df.sort_values("similarity", ascending=False)
    top_matches = ranked.head(n)[['question', 'query']]

    best_question = top_matches['question'].values[0]
    best_query = top_matches['query'].values[0]
    return query_vector, best_question, best_query

In [None]:
# Embed every test question with text-embedding-ada-002, in row order.
test_question_embeddings = [
    get_embedding(question_text, engine="text-embedding-ada-002")
    for question_text in tqdm(test_UiO66['question'], total=test_UiO66.shape[0])
]

In [None]:
test_UiO66['test_question_embedding'] = test_question_embeddings

In [None]:
train.columns

In [None]:
from openai.embeddings_utils import get_embedding, cosine_similarity

# For every test question, find the train question with the highest cosine
# similarity and record its question, query and chain-of-thought.
train_matched_questions = []
train_matched_queries = []
train_matched_chainOfThoughts = []

# `train` is reloaded from CSV above, where list-valued cells come back as
# strings such as "[0.1, -0.2, ...]"; parse them once, outside the loop.
# NOTE(review): assumes the stored text is a plain list of floats — confirm.
train_question_embeddings = train.question_embedding.apply(
    lambda emb: json.loads(emb) if isinstance(emb, str) else emb
)

for idx, row in tqdm(test_UiO66.iterrows(), total=test_UiO66.shape[0]):
    test_question_embedding = row['test_question_embedding']

    train_embeddings_similarities = train_question_embeddings.apply(
        lambda x: cosine_similarity(x, test_question_embedding)
    )

    # idxmax returns an index *label*, so look the row up with .loc, and
    # select the chain-of-thought column that is read below.
    max_idx = train_embeddings_similarities.idxmax()
    results = train.loc[max_idx, ['question', 'query', 'chainOfThought']]

    matched_question = results['question']
    matched_query = results['query']
    matched_cot = results['chainOfThought']

    train_matched_questions.append(matched_question)
    train_matched_queries.append(matched_query)
    train_matched_chainOfThoughts.append(matched_cot)

In [None]:
test_UiO66['train_matched_question'] = train_matched_questions
test_UiO66['train_matched_query'] = train_matched_queries
test_UiO66['train_matched_chainOfThought'] = train_matched_chainOfThoughts

In [None]:
#test_UiO66.to_csv('../data/KGQA4MAT-test-with-embeddings-matched-question.csv', index=None)

### Method 2: 1-shot learning from a pair of train question and query.

In [None]:
from tqdm import tqdm

# Method 2 (1-shot, train pair only): translate every test question with the
# most similar train question/query pair as the single example.
m2_query = []

# The progress total uses test_UiO66 itself; `test_UiO66_cot` is never defined
# in this notebook.
for idx, row in tqdm(test_UiO66.iterrows(), total=test_UiO66.shape[0]):
    
    test_question = row['question']
    
    train_question = row['train_matched_question']
    train_query = row['train_matched_query']

    # System message carrying the worked example. The last instruction line
    # used to end in "\ " (backslash + space), an invalid escape that sent a
    # literal backslash to the model; it now simply ends the line.
    sys_question_query = """
        Learn the following example question and corresponding query. \
        Learn the connections in the example query on the MOF-KG database. \
        The query uses the terms defined in the MOF-KG database. \
        You can only use these given terms defined in the MOF-KG in future queries.
        Question:{}
        Query:{}
    """
    sys_question_query = sys_question_query.format(train_question, train_query)
    
    msg = """
        Use the learned connections and terms in the example query. \
        Translate the following question into a Cypher query.  \
        Use the similar structure as the given example query. \
        Output query only. No comments. End the query with ';'. \
        Question:{}
        Query: 
    """
    msg = msg.format(test_question)
    
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Assuming  you have a Neo4J database called MOF-KG. \
             The database is about Metal-Organic Framework. \
             You are a helpful assistant to translate questions to Cypher \
             queries on the database. "},
            {"role": "system", "content": sys_question_query},
            {"role": "user", "content": msg}
        ]
    )
    
    # Flatten the reply to one line so it can be stored as a single cell value.
    m2_query.append(response['choices'][0]['message']['content'].strip().replace('\n', ' ' ))

In [None]:
test_UiO66['method2_query'] = m2_query

In [None]:
# Run the Method 2 queries and store each result set as a JSON string;
# queries that fail or cannot be serialised are recorded as 'ERROR'.
m2_query_answer = []

for idx, row in tqdm(test_UiO66.iterrows(), total=test_UiO66.shape[0]):

    # Separate name so the `m2_query` list of generated queries is not
    # overwritten by a single string.
    generated_query = row['method2_query']

    results = conn.query(generated_query)
    if results is None:
        # conn.query already printed the failure and signalled it with None.
        m2_query_answer.append('ERROR')
        continue
    try:
        m2_query_answer.append(json.dumps([dict(record) for record in results]))
    except (TypeError, ValueError):
        # The result holds values json cannot serialise.
        m2_query_answer.append('ERROR')

In [None]:
test_UiO66['method2_query_answer'] = m2_query_answer

In [None]:
#test_UiO66[['question', 'query', 'answer', 'method2_query', 'method2_query_answer', 
#                'train_matched_question', 'train_matched_query']].\
# to_csv('../data/KGQA4MAT-test-Method2-one-shot-train-only.csv', index=None)

### Method 3: 1-shot learning from the MOF-KG ontology and a pair of train question and query. 

In [None]:
from tqdm import tqdm

# Method 3 (1-shot, ontology + train pair): as Method 2, with the ontology
# prompts (sys_content1, sys_content2) added as system messages.
m3_query = []

# The progress total uses test_UiO66 itself; `test_UiO66_cot` is never defined
# in this notebook.
for idx, row in tqdm(test_UiO66.iterrows(), total=test_UiO66.shape[0]):
    
    test_question = row['question']
    
    train_question = row['train_matched_question']
    train_query = row['train_matched_query']

    # System message carrying the worked example. The last instruction line
    # used to end in "\ " (backslash + space), an invalid escape that sent a
    # literal backslash to the model; it now simply ends the line.
    sys_question_query = """
        Learn the following example question and corresponding query. \
        Learn the connections in the example query on the MOF-KG database. \
        The query uses the terms defined in the MOF-KG database. \
        You can only use these given terms defined in the MOF-KG in future queries.
        Question:{}
        Query:{}
    """
    sys_question_query = sys_question_query.format(train_question, train_query)
    
    msg = """
        Use the learned connections and terms in the example query and MOF-KG database. \
        Translate the following question into a Cypher query.  \
        Use the similar structure as the given example query. \
        Output query only. No comments. End the query with ';'. \
        Question:{}
        Query: 
    """
    msg = msg.format(test_question)
    
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Assuming  you have a Neo4J database called MOF-KG. \
             The database is about Metal-Organic Framework. \
             You are a helpful assistant to translate questions to Cypher \
             queries on the database. "},
            {"role": "system", "content": sys_content1},
            {"role": "system", "content": sys_content2},
            {"role": "system", "content": sys_question_query},
            {"role": "user", "content": msg}
        ]
    )
    
    # Flatten the reply to one line so it can be stored as a single cell value.
    m3_query.append(response['choices'][0]['message']['content'].strip().replace('\n', ' ' ))

In [None]:
test_UiO66['method3_query'] = m3_query

In [None]:
# Run the Method 3 queries and store each result set as a JSON string;
# queries that fail or cannot be serialised are recorded as 'ERROR'.
m3_query_answer = []

for idx, row in tqdm(test_UiO66.iterrows(), total=test_UiO66.shape[0]):

    # Execute this row's own generated query. The previous code ran the
    # undefined name `cot_query`, and the bare except turned the resulting
    # NameError into 'ERROR' for every row.
    generated_query = row['method3_query']

    results = conn.query(generated_query)
    if results is None:
        # conn.query already printed the failure and signalled it with None.
        m3_query_answer.append('ERROR')
        continue
    try:
        m3_query_answer.append(json.dumps([dict(record) for record in results]))
    except (TypeError, ValueError):
        # The result holds values json cannot serialise.
        m3_query_answer.append('ERROR')

In [None]:
test_UiO66['method3_query_answer'] = m3_query_answer

In [None]:
#test_UiO66[['question', 'query', 'answer', 'method3_query', 'method3_query_answer', 
#                'train_matched_question', 'train_matched_query']].\
# to_csv('../data/KGQA4MAT-test-Method3-one-shot-ontology-train-only.csv', index=None)

### Method 4: 1-shot learning from a pair of train question and query, and the chain-of-thought of the train query.

In [None]:
from tqdm import tqdm

# Method 4 (1-shot, train pair + chain-of-thought): the example given to the
# model also includes the explanation of the matched train query.
m4_query = []

# The progress total uses test_UiO66 itself; `test_UiO66_cot` is never defined
# in this notebook.
for idx, row in tqdm(test_UiO66.iterrows(), total=test_UiO66.shape[0]):
    
    test_question = row['question']
    
    train_question = row['train_matched_question']
    train_query = row['train_matched_query']
    train_matched_cot = row['train_matched_chainOfThought']
    
    # System message carrying the worked example and its explanation.
    sys_question_query = """
        Learn the following example question and corresponding query. \
        Learn the query steps in the explanation of the query. \
        You can only use the 
        terms defined in the MOF-KG database and in the example query. 
        Question: {} 
        Query: {}
        Explanation: {}
    """
    sys_question_query = sys_question_query.format(train_question, train_query, 
                                                  train_matched_cot)
    
    msg = "Only use the terms in the given example query and the MOF-KG database. \
           Follow the similar structure of the example query. \
           Follow the steps in the explanation of the example query.\
           Translate the following question to Cypher query. \
           Output query only. No comments. \
           Output syntactically correct query only. End query with ';'. \
           Question:{}. Query:"
    msg = msg.format(test_question)
    
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Assuming  you have a Neo4J database called MOF-KG. \
             The database is about Metal-Organic Framework. \
             You are a helpful assistant to translate questions to Cypher \
             queries on the database. "},
            {"role": "system", "content": sys_question_query},
            {"role": "user", "content": msg}
        ]
    )
    
    # Flatten the reply to one line so it can be stored as a single cell value.
    m4_query.append(response['choices'][0]['message']['content'].strip().replace('\n', ' ' ))

In [None]:
test_UiO66['method4_query'] = m4_query

In [None]:
# Run the Method 4 queries and store each result set as a JSON string;
# queries that fail or cannot be serialised are recorded as 'ERROR'.
m4_query_answer = []

for idx, row in tqdm(test_UiO66.iterrows(), total=test_UiO66.shape[0]):

    #query = row['query']
    # Execute this row's own generated query. The previous code ran the
    # undefined name `cot_query`, and the bare except turned the resulting
    # NameError into 'ERROR' for every row.
    generated_query = row['method4_query']

    results = conn.query(generated_query)
    if results is None:
        # conn.query already printed the failure and signalled it with None.
        m4_query_answer.append('ERROR')
        continue
    try:
        m4_query_answer.append(json.dumps([dict(record) for record in results]))
    except (TypeError, ValueError):
        # The result holds values json cannot serialise.
        m4_query_answer.append('ERROR')

In [None]:
test_UiO66['method4_query_answer'] = m4_query_answer

In [None]:
#test_UiO66[['question', 'query', 'answer', 'method4_query', 'method4_query_answer', 
#                'train_matched_question', 'train_matched_query', 'train_matched_chainOfThought']].\
# to_csv('../data/KGQA4MAT-test-Method4-one-shot-train-cot.csv', index=None)

### Method 5: 1-shot learning from the MOF-KG ontology and a pair of train question and query, and the chain-of-thought of the train query.

In [None]:
# System prompt, part 1: the node definitions of the MOF-KG ontology, used by
# Method 5. This re-assigns `sys_content1`, which is also defined in the
# Method 1 section of this notebook.
# Each line has the form "Label"[property, ...]; the property tagged ":ID"
# is marked as that node's identifier.
sys_content1 = """
Assuming you have a neo4j database called MOF-KG. The database has the following node definitions: 
"MOF"["refcode":ID, "disorder_details", "remarks", "r_factor", "radiation_source", "z_prime", "has_3d_structure", "has_disorder", "cell_volume", "z_value", "database_name", "cell_lengths", "cell_angles", "deposition_date", "source", "color", "mofid", "datasource", "deposition_number", "mofkey", "charged_framework", "hydrogen_added", "unreliable_chemistry", "name", "filename", "percentage_void_space", "melting_point", "density", "sohncke_space_group", "solvent", "pressure", "temperature", "chemical_name", "chemical_formula"];
"Atom"["name":ID];
"Publication"["doi":ID, "year", "first_page", "volume"];
"Journal"["name":ID];
"Author"["name":ID, "order"];
"Bond"["name":ID];
"Crystal_System"["name":ID];
"Family"["name":ID];
"Metal"["name":ID];
"Organic_Linker"["name":ID];
"Topology"["rcsr_code":ID];
"Space_Group"["name":ID];
"Synthesis_Procedure"["name", "syn_id":ID, "symbol", "method", "MOratio"];
"Title"["title":ID];
"Synthesis_Operation"["DataSource", "order", "op_id":ID, "name"];
"Synthesis_Operation_Condition"["Property", "Value", "Unit", "order", "cond_id":ID];
"Synthesis_Time"["time_id":ID, "Value", "Unit"];
"Synthesis_Temperature"["Unit", "temp_id":ID, "Value"];
"Synthesis_Property"["value", "unit", "prop_id":ID, "order", "prop"];
"Synthesis_Metal_Precursor"["name", "mprec_id":ID, "formula"];
"Metal_Composition"["comp_id":ID, "unit", "order", "value"];
"Synthesis_Organic_Precursor"["name", "oprec_id":ID, "smiles"];
"Organic_Composition"["comp_id":ID, "unit", "order", "value"];
"Synthesis_Solvent_Precursor"["sprec_id":ID, "name"];
"Solvent_Composition"["comp_id":ID, "unit", "order", "value"];
"Solvent_Precursor_Solvent"["solv_id":ID, "solvent", "order"].
"""

In [None]:
# System prompt, part 2: the relationships of the MOF-KG ontology, used by
# Method 5, one "(:Source)->[:relationship]->(:Target)" entry per line.
# The has_Atom entry now targets the "Atom" label from the node definitions;
# it previously named "Space_Atom", a label the ontology does not define.
# NOTE(review): the "->[:rel]->" arrow notation is not Cypher pattern syntax
# ("-[:rel]->"); left as is because it is part of the evaluated prompt.
sys_content2 = """
MOF-KG has the following relationships:
"(:MOF)->[:has_topology]->(:Topology)";
"(:MOF)->[:has_organicLinker]->(:Organic_Linker)";
"(:MOF)->[:has_metal]->(:Metal)";
"(:MOF)->[:has_spaceGroup]->(:Space_Group)";
"(:MOF)->[:has_crystalSystem]->(:Crystal_System)";
"(:MOF)->[:has_family]->(:Family)";
"(:MOF)->[:has_bond]->(:Bond)";
"(:MOF)->[:has_Atom]->(:Atom)";
"(:MOF)->[:has_publication]->(:Publication)";
"(:Publication)->[:has_title]->(:Title)";
"(:Publication)->[:in_journal]->(:Journal)";
"(:Publication)->[:has_author]->(:Author)";
"(:Synthesis_Procedure)->[:has_publication]->(:Publication)";
"(:Synthesis_Procedure)->[:has_temperature]->(:Synthesis_Temperature)";
"(:Synthesis_Procedure)->[:has_time]->(:Synthesis_Time)";
"(:Synthesis_Procedure)->[:has_property]->(:Synthesis_Property)";
"(:Synthesis_Procedure)->[:has_metal_precursor]->(:Synthesis_Metal_Precursor)";
"(:Synthesis_Procedure)->[:has_organic_precursor]->(:Synthesis_Organic_Precursor)";
"(:Synthesis_Procedure)->[:has_solvent_precursor]->(:Synthesis_Solvent_Precursor)";
"(:Synthesis_Procedure)->[:has_operation]->(:Synthesis_Operation)";
"(:Synthesis_Operation)->[:has_condition]->(:Synthesis_Operation_Condition)";
"(:Synthesis_Operation)->[:has_next]->(:Synthesis_Operation)";
"(:Synthesis_Metal_Precursor)->[:has_composition]->(:Metal_Composition)";
"(:Synthesis_Organic_Precursor)->[:has_composition]->(:Organic_Composition)";
"(:Synthesis_Solvent_Precursor)->[:has_solvent]->(:Solvent_Precursor_Solvent)";
"(:Synthesis_Solvent_Precursor)->[:has_composition]->(:Solvent_Composition)".
"""

In [None]:
from tqdm import tqdm

# Method 5 (1-shot, ontology + train pair + chain-of-thought): as Method 4,
# with the ontology prompts (sys_content1, sys_content2) added as system
# messages.
m5_query = []

# Iterate test_UiO66, the frame that holds the matched train columns;
# `test_UiO66_cot` is never defined in this notebook.
for idx, row in tqdm(test_UiO66.iterrows(), total=test_UiO66.shape[0]):
    
    test_question = row['question']
    
    train_question = row['train_matched_question']
    train_query = row['train_matched_query']
    train_matched_cot = row['train_matched_chainOfThought']
    
    # System message carrying the worked example and its explanation.
    sys_question_query = """
        Learn the following example question and corresponding query. \
        Learn the query steps in the explanation of the query. \
        You can only use the 
        terms defined in the MOF-KG database and in the example query. 
        Question: {} 
        Query: {}
        Explanation: {}
    """
    sys_question_query = sys_question_query.format(train_question, train_query, 
                                                  train_matched_cot)
    
    msg = "Only use the terms in the given example query and the MOF-KG database. \
           Follow the similar structure of the example query. \
           Follow the steps in the explanation of the example query.\
           Translate the following question to Cypher query. \
           Output query only. No comments. \
           Output syntactically correct query only. End query with ';'. \
           Question:{}. Query:"
    msg = msg.format(test_question)
    
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Assuming  you have a Neo4J database called MOF-KG. \
             The database is about Metal-Organic Framework. \
             You are a helpful assistant to translate questions to Cypher \
             queries on the database. "},
            {"role": "system", "content": sys_content1},
            {"role": "system", "content": sys_content2},
            {"role": "system", "content": sys_question_query},
            {"role": "user", "content": msg}
        ]
    )
    
    # Flatten the reply to one line so it can be stored as a single cell value.
    m5_query.append(response['choices'][0]['message']['content'].strip().replace('\n', ' ' ))

In [None]:
test_UiO66['method5_query'] = m5_query

In [None]:
# Run the Method 5 queries and store each result set as a JSON string;
# queries that fail or cannot be serialised are recorded as 'ERROR'.
m5_query_answer = []

# Iterate test_UiO66, which receives the 'method5_query' column;
# `test_UiO66_cot` is never defined in this notebook.
for idx, row in tqdm(test_UiO66.iterrows(), total=test_UiO66.shape[0]):

    # Execute this row's own generated query and append to m5_query_answer.
    # The previous code ran the undefined `cot_query` and appended successes
    # to the undefined `cot_query_answer`.
    generated_query = row['method5_query']

    results = conn.query(generated_query)
    if results is None:
        # conn.query already printed the failure and signalled it with None.
        m5_query_answer.append('ERROR')
        continue
    try:
        m5_query_answer.append(json.dumps([dict(record) for record in results]))
    except (TypeError, ValueError):
        # The result holds values json cannot serialise.
        m5_query_answer.append('ERROR')

In [None]:
test_UiO66['method5_query_answer'] = m5_query_answer

In [None]:
#test_UiO66[['question', 'query', 'answer', 'method5_query', 'method5_query_answer', 
#                'train_matched_question', 'train_matched_query', 'train_matched_chainOfThought']].\
# to_csv('../data/KGQA4MAT-test-Method5-one-shot-ontology-train-cot.csv', index=None)