In [1]:
import os

# Location of the Cypher script, relative to the notebook's working directory
cypher_file_path = os.path.join('..', '..', '..', 'cypher', 'filetypes', 'filetypes.cypher')

# Fail early with a message that names the path that was tried
if not os.path.exists(cypher_file_path):
    raise FileNotFoundError(f"Cypher file not found at: {cypher_file_path}")

# Decode explicitly as UTF-8 so the text read does not depend on the
# platform's default locale encoding
with open(cypher_file_path, 'r', encoding='utf-8') as file:
    cypher_content = file.read()

# Now you can use cypher_content

In [2]:
# Break the script on ';', trim each piece, and drop the pieces that are empty after trimming
instructions = list(filter(None, map(str.strip, cypher_content.split(';'))))

In [3]:
import os
from dotenv import load_dotenv

# Path to the .env file, three directory levels above the working directory
# (adjust to match your project layout)
project_root = os.path.join('..', '..', '..', '.env')
load_dotenv(project_root)

# Neo4j connection settings, read from the environment (None when a variable is unset)
NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD = (
    os.getenv(variable_name)
    for variable_name in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)

In [4]:
from neo4j import GraphDatabase

class Neo4jConnector:
    """Small wrapper around a Neo4j driver that runs each query in its own session.

    Instances can be used as context managers (``with Neo4jConnector(...) as conn:``)
    so the driver is closed even when the body raises. Calling ``close()``
    explicitly keeps working as before.
    """

    def __init__(self, uri, user, password):
        """Create the driver for ``uri``, authenticating with ``user`` and ``password``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        """Return the connector itself for use inside a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; exceptions are not suppressed."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def run_query(self, query, **kwargs):
        """Run ``query`` in a fresh session and return its records as a list.

        Args:
            query: The Cypher text to execute.
            **kwargs: Forwarded unchanged to ``session.run`` (e.g. query parameters).

        Returns:
            A list holding every record produced by the query (empty when the
            query returns nothing).
        """
        with self.driver.session() as session:
            result = session.run(query, **kwargs)
            # Materialise the records while the session is still open
            return list(result)

# Initialize connection
neo4j_conn = Neo4jConnector(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

# Run every Cypher instruction on its own; the finally clause guarantees the
# driver is closed even if the loop is interrupted (e.g. KeyboardInterrupt)
try:
    for i, instruction in enumerate(instructions, 1):
        try:
            # Only the first 50 characters are echoed to keep the log readable
            print(f"\nExecuting instruction {i}/{len(instructions)}: {instruction[:50]}...")
            results = neo4j_conn.run_query(instruction)

            # Print results if the query returns data
            if results:
                print(f"Results from instruction {i}:")
                for record in results:
                    print(record)
            else:
                print(f"Instruction {i} executed successfully (no results returned)")

        except Exception as e:
            # Best effort: report the failure and carry on with the next
            # instruction (replace `continue` with `break` to stop on first error)
            print(f"Error executing instruction {i}: {e}")
            continue
finally:
    # Close connection
    neo4j_conn.close()


Executing instruction 1/19: // Create nodes for each file type with their prop...
Instruction 1 executed successfully (no results returned)

Executing instruction 2/19: MERGE (:FileType {
  name: "Canadian Well Log ASCI...
Instruction 2 executed successfully (no results returned)

Executing instruction 3/19: MERGE (:FileType {
  name: "Text (ASCII)",
  exten...
Instruction 3 executed successfully (no results returned)

Executing instruction 4/19: MERGE (:FileType {
  name: "Comma Separated Values...
Instruction 4 executed successfully (no results returned)

Executing instruction 5/19: MERGE (:FileType {
  name: "Tab Delimited (ASCII)"...
Instruction 5 executed successfully (no results returned)

Executing instruction 6/19: MERGE (:FileType {
  name: "Metafile",
  extension...
Instruction 6 executed successfully (no results returned)

Executing instruction 7/19: MERGE (:FileType {
  name: "Windows Metafile",
  e...
Instruction 7 executed successfully (no results returned)

Executing in

In [5]:
import os
import json
import faiss
import ollama
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# 1. Load JSON file
json_file_path = os.path.join('..', '..', '..', 'Training_Info', 'filetypes.json')
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding
with open(json_file_path, 'r', encoding='utf-8') as f:
    filetypes = json.load(f)

In [7]:
# 2. Chunking: render every file type record as one multi-line text block
chunks = [
    (
        f"\nName: {ftype['name']}"
        f"\nExtension(s): {ftype['extension']}"
        f"\nDescription: {ftype['description']}"
        f"\nLoadable in GEO: {ftype['load']}"
        f"\nExportable from GEO: {ftype['export']}\n"
    )
    for ftype in filetypes
]

# Look-up from a chunk's position in `chunks` to its text
chunk_id_to_info = dict(enumerate(chunks))

In [8]:
# ✅ Additional Cell: View the generated chunks
# Show each chunk under its position, followed by a 50-dash divider
divider = '-' * 50
for position, text in enumerate(chunks):
    print(f"Chunk {position}:\n{text}\n{divider}")

Chunk 0:

Name: LIS-79
Extension(s): LIS, TAP, NTI, TIF
Description: LIS data (usually wireline data) is converted to an ASCII format called LAS before loading into GEO. There are different kinds of LIS, which are supported by the LIS-LAS Converter routine.
Loadable in GEO: No
Exportable from GEO: No

--------------------------------------------------
Chunk 1:

Name: Canadian Well Log ASCII (CWLAS)
Extension(s): LASData_load
Description: This is a special ASCII implementation of wireline data that allows for specific categories of information to coexist with data. GEO can load LAS data directly.
Loadable in GEO: Yes
Exportable from GEO: Yes

--------------------------------------------------
Chunk 2:

Name: Text (ASCII)
Extension(s): ASC, TXTData_load
Description: Mudlog, MWD and well test data are usually in the Text File format. Sometimes, the data is presented in columns which are delimited by SPACES. Such Text files MUST be prepared (externally or within GEO) before they can be loa

In [9]:
# 3. Embedding: load the sentence-embedding model, then encode every chunk
embedding_model_name = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(embedding_model_name)
embeddings = embedder.encode(chunks, convert_to_numpy=True)

In [10]:
# 4. Vector store: a flat L2 FAISS index sized to the embedding width,
#    filled with all chunk embeddings
dimension = embeddings.shape[-1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [11]:
# 5. RAG Function to handle a query
def _format_chunk(chunk):
    """Render one retrieved chunk as prompt text; dicts are laid out field by field, strings pass through."""
    if isinstance(chunk, dict):
        return f"{{\nName: {chunk.get('name')},\nExtension: {chunk.get('extension')},\nDescription: {chunk.get('description')},\nLoadable in GEO: {chunk.get('load')},\nExportable from GEO: {chunk.get('export')}\n}}"
    return chunk


def answer_question(question, k=3, model='llama3.2:1b', verbose=True):
    """Answer ``question`` using the top-k retrieved chunks as the only context.

    Args:
        question: The user's question as a string.
        k: Maximum number of chunks to retrieve. Values larger than the number
            of indexed vectors are clamped to that number.
        model: Name of the Ollama chat model to query.
        verbose: When True, print the retrieved chunks before querying the model.

    Returns:
        The text content of the model's reply.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Embed the question
    query_vec = embedder.encode([question], convert_to_numpy=True)

    # Retrieve top-k chunks. FAISS pads the result with -1 when fewer than k
    # vectors are available, so clamp k and skip any padding ids.
    k = min(k, index.ntotal)
    top_chunks = []
    if k > 0:
        _distances, neighbor_ids = index.search(query_vec, k)
        top_chunks = [chunk_id_to_info[int(i)] for i in neighbor_ids[0] if i >= 0]

    # Display top chunks for debugging or review
    if verbose:
        print("\n[Top Retrieved Chunks]:")
        for i, chunk in enumerate(top_chunks, 1):
            print(f"\nChunk {i}:\n{chunk}\n" + "-"*40)

    # Construct prompt using relevant chunks
    context = "\n---\n".join(_format_chunk(chunk) for chunk in top_chunks)

    # Enhanced system prompt
    system_prompt = f"""You are a geoscience file format expert. The user will ask about file types used in GEO. Each chunk below contains information in structured format with fields like 'Name', 'Extension', 'Description', 'Loadable in GEO', and 'Exportable from GEO'.

    Use ONLY the following context to answer the question. Do NOT use external knowledge. 

    Context:
    {context}
    """

    # Ask the Ollama chat model
    response = ollama.chat(
        model=model,
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': question}
        ]
    )
    return response['message']['content']


In [12]:
# 🔍 Example
# Ask about the VIEW format and print the model's reply
view_file_answer = answer_question("What is a VIEW file?")
print(view_file_answer)


[Top Retrieved Chunks]:

Chunk 1:

Name: GEODraft View File
Extension(s): GDV
Description: This is a file format equivalent to the VEW, but created by the GEODraft application. These files may be opened by applications of the GEO Software Suite to allow full data sharing with users of the GEODraft application.
Loadable in GEO: Yes
Exportable from GEO: Yes

----------------------------------------

Chunk 2:

Name: VIEW
Extension(s): VEW
Description: The View file contains the layout-related settings needed to create a log. These settings are normally entered by the user and may be stored for ease of recall when the same layout is required for subsequent well logs, i.e., the standard composite log presentation.
Loadable in GEO: Yes
Exportable from GEO: Yes

----------------------------------------

Chunk 3:

Name: Output Database File
Extension(s): ODF
Description: The Output Database File is a database where the wireline, mudlog or any other imported data, together with the layout-rela

In [None]:
# 🔍 Example
# Ask about the GeoGraph Database format and print the model's reply
geograph_answer = answer_question("What is a GeoGraph Database file and can it be exported?")
print(geograph_answer)