# instalar dependiencias y modelos

In [None]:
#Instalar librerias necesarias
! pip install --quiet langchain langchain-ollama langchain-community unstructured chromadb tiktoken langchain-text-splitters langgraph transformers sentence-transformers

In [None]:
#Ejecutar esta linea para instalar el modelo llama desde ollama (ollama debe estar instalado)
! ollama pull llama3.1:8b



In [None]:
#Ejecutar esta linea para instalar el modelo deepseek desde ollama (ollama debe estar instalado)
! ollama pull deepseek-v2:16b

[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠴ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠦ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠧ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling d8d69f2a1bfa...   0% ▕                ▏    0 B/8.9 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling d8d69f2a1bfa...   0% ▕                ▏    0 B/8.9 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling d8d69f2a1bfa...   0% ▕                ▏    0 B/8.9 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling d8d69f2a1bfa...   0% ▕                ▏    0 B/8.9 GB                  [K[?25h[?2026l[?2026h[?25l[A[

# creacion de la base de datos con emmbedings

In [None]:
import os
import re
import json
import chromadb
from sentence_transformers import SentenceTransformer

# Inicializa el cliente de ChromaDB
chroma_client = chromadb.PersistentClient(path="chroma_cod_db")

# Se crea la base de datos en chroma
cod_collection = chroma_client.get_or_create_collection(name="cod_materials")

# Se llama un modelo para hacer embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

#Ruta donde estan los archivos json
JSON_FOLDER = "cod_json_data"

def load_json_files_into_chromadb():
    """Carga los archivos json y los pasa a metadata en chromadb"""
    for filename in os.listdir(JSON_FOLDER):
        if filename.endswith(".json"):
            cod_id = filename.split(".json")[0]
            file_path = os.path.join(JSON_FOLDER, filename)

            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Se define que metadata guardar
            formula = data.get("chemical_formula", "Unknown Formula")
            space_group = data.get("space_group", "Unknown Space Group")
            unit_cell = data.get("lattice_parameters", {})
            a, b, c = unit_cell.get("a", "?"), unit_cell.get("b", "?"), unit_cell.get("c", "?")
            alpha, beta, gamma = unit_cell.get("alpha", "?"), unit_cell.get("beta", "?"), unit_cell.get("gamma", "?")
            atomic_sites = data.get("atomic_sites", [])
            atomic_data = []
            for atom in atomic_sites:
                label = atom.get("element", "?")
                x, y, z = atom.get("x", "?"), atom.get("y", "?"), atom.get("z", "?")
                occupancy = atom.get("occupancy", 1.0)
                atomic_data.append(f"{label}: ({x}, {y}, {z}), Occ: {occupancy}")
            symmetry_operations = data.get("symmetry_operations", [])

            # Se le da formato para buscar en ella despues
            metadata_text = (
                f"Formula: {formula}, Space Group: {space_group}, "
                f"Unit Cell: a={a}, b={b}, c={c}, alpha={alpha}, beta={beta}, gamma={gamma}, "
                f"Atomic Sites: {'; '.join(atomic_data)}, "
                f"Symmetry Operations: {', '.join(symmetry_operations)}"
            )

            #Se ejecuta el embedding
            embedding = embedding_model.encode(metadata_text).tolist()

            #Chroma no soporta diccionarios entonces se guarda la informacion como strings
            unit_cell_str = f"a={a}, b={b}, c={c}, alpha={alpha}, beta={beta}, gamma={gamma}"

            cod_collection.add(
                ids=[cod_id],  #Se le da como llave el ID en la base de datos COD
                embeddings=[embedding],
                metadatas=[{
                    "formula": formula,
                    "space_group": space_group,
                    "unit_cell": unit_cell_str,  
                    "atomic_sites": json.dumps(atomic_sites),  
                    "symmetry_operations": json.dumps(symmetry_operations) 
                }]
            )

            print(f"✅ Added {cod_id} to ChromaDB")

# Se ejecuta la funcion
load_json_files_into_chromadb()

In [None]:
def search_chromadb(query_data, top_k=3):
    """Busca en la base de datos de ChromaDB con parametros como el nombre del material, formula y grupo espacial"""
    
    formula = query_data.get("Formula", "").strip()
    space_group = query_data.get("Space_Group", "").strip()
    material_name = query_data.get("Material_Name", "").strip()

    # primero realiza busquedas exactas por formula, grupo espacial y nombre del material
    for key, value in [("formula", formula), ("space_group", space_group), ("material_name", material_name)]:
        if value and value != "Unknown":
            exact_match = cod_collection.get(where={key: value})
            if exact_match["metadatas"]:
                return [{
                    "cod_id": exact_match["ids"][0],
                    "formula": exact_match["metadatas"][0].get("formula", "Unknown"),
                    "space_group": exact_match["metadatas"][0].get("space_group", "Unknown"),
                    "unit_cell": exact_match["metadatas"][0].get("unit_cell", "Unknown")
                }]

    # Si no encuentra de forma exacta busca por similitud con embeddings
    query_embedding = embedding_model.encode(f"{material_name} {formula} {space_group}").tolist()
    results = cod_collection.query(query_embeddings=[query_embedding], n_results=top_k)

    if not results["metadatas"][0]:  # Si no encuentra similitudes devuelve un mensaje de error
        return {"error": f"No matching materials found for query: {query_data}"}

    # Retorna los resultados
    matches = []
    for i in range(len(results["ids"][0])):
        match = {
            "cod_id": results["ids"][0][i],
            "formula": results["metadatas"][0][i].get("formula", "Unknown"),
            "space_group": results["metadatas"][0][i].get("space_group", "Unknown"),
            "unit_cell": results["metadatas"][0][i].get("unit_cell", "Unknown"),
            "atomic_sites": results["metadatas"][0][i].get("atomic_sites", "[]"),
            "symmetry_operations": results["metadatas"][0][i].get("symmetry_operations", "[]")
        }
        matches.append(match)

    return matches

In [55]:
from langchain.llms import Ollama
from langchain.chains import LLMChain
from langchain.agents import AgentType, initialize_agent, OpenAIFunctionsAgent
from langchain.tools import Tool
from langchain.agents import AgentExecutor
from langchain.prompts import PromptTemplate
from Build_Sample import build_sample  # Importa la funcion para construir la muestra

# Inicializa un modelo de Ollama
llm = Ollama(model="llama3.1:8B")  
#llm = Ollama(model="deepseek-v2:16b")  # Se puede usar este modelo tambien

#Define un template para extraer los parametros de busqueda del prompt del usuario
prompt_material_extraction = PromptTemplate(
    input_variables=["query"],
    template="Extract the material name, chemical formula, space group, and supercell size from this request.\n\n"
             "Return the output in EXACTLY this format, with NO extra explanation and NO field names:\n"
             "Material_Name, Formula, Space_Group, Supercell_Size\n\n"
             "Example Inputs and Expected Outputs:\n"
             "User: 'I want a 2x2x2 supercell of quartz'\n"
             "Output: Quartz, SiO₂, Unknown, 2\n\n"
             "User: 'Build a 3x3x3 platinum supercell'\n"
             "Output: Platinum, Pt, Fm-3m, 3\n\n"
             "User: 'Find me a silicon structure with space group P3₁21'\n"
             "Output: Silicon, Si, P3₁21, 1\n\n"
             "User: 'Give me a hexagonal material'\n"
             "Output: Unknown, Unknown, Hexagonal, 1\n\n"
             "User: 'Show me information about iron oxide'\n"
             "Output: Iron Oxide, Fe₂O₃, Unknown, 1\n\n"
             "Query: {query}\n\n"
             "**ONLY OUTPUT THE VALUES, NOTHING ELSE**:"
)

# Define el llm con el template que va a usar
llm_chain_material = LLMChain(llm=llm, prompt=prompt_material_extraction)

# Definimos las herramientas del agente
def search_material_tool(query: str) -> str:
    """Interpreta el prompt del usuario y busca en ChromaDB para encontrar datos cristalográficos."""
    
    # usa el template para interpretar que busca el usuario
    llm_response = llm_chain_material.invoke({"query": query})

    if isinstance(llm_response, dict):  
        extracted_text = llm_response.get("text", "").strip()
    else:
        extracted_text = str(llm_response).strip()

    print(f"🔍 LLM Extracted: {extracted_text}")

    # tratamos de darle formato a la busqueda
    pattern = re.compile(
        r"([\w\s-]+),\s*([\w\d\s₂₃₄₅₆₇₈₉-]+),\s*([\w\d\s/-]+),\s*(\d+)", re.UNICODE
    )

    match = pattern.match(extracted_text)
    if not match:
        return f"❌ Error: Invalid structured query format: {extracted_text}"

    material_name, formula, space_group, size = match.groups()
    query_data = {
        "Material_Name": material_name.strip(),
        "Formula": formula.strip(),
        "Space_Group": space_group.strip(),
        "Supercell_Size": int(size)
    }

    print(f"✅ Parsed Query Data: {query_data}")

    #Buscamos en la base de datos
    search_results = search_chromadb(query_data)
    if "error" in search_results:
        return f"❌ No valid data found for query: {query}"

    best_match = search_results[0]  #usamos la primera coincidencia como la mejor
    #imprimimos la informacion
    print(
        f"🔹 **Material:** {best_match['formula']}\n"
        f"🔹 **COD ID:** {best_match['cod_id']}\n"
        f"🔹 **Space Group:** {best_match['space_group']}\n"
        f"🔹 **Unit Cell Parameters:** {best_match['unit_cell']}"
    )
  
     # retornamos la informacion
    return {
        #"size": query_data["Supercell_Size"],
        "retrieved_data": search_results[0]  #Toda la informacion queda guardada en este diccionario
    }


def build_supercell_tool(size: int, retrieved_data: dict) -> str:
    """Constructs a supercell using already retrieved crystallographic data."""
    
    # Validamos que la informacion sea correcta
    required_keys = ["unit_cell", "atomic_sites", "symmetry_operations"]
    
    for key in required_keys:
        if key not in retrieved_data:
            return f"❌ Error: Retrieved data is missing key '{key}'. Cannot build supercell."

    try:
        # Extraemos la informacion de la base de datos
        unit_cell = retrieved_data["unit_cell"]
        atomic_positions = json.loads(retrieved_data["atomic_sites"])  # Convert JSON string to list
        symmetry_operations = json.loads(retrieved_data["symmetry_operations"])  # Convert JSON string to list
    except KeyError as e:
        return f"❌ Missing key in retrieved data: {e}"
    except json.JSONDecodeError:
        return f"❌ Error decoding atomic sites or symmetry operations."

    # Si todo esta bien, se construye la muestra
    try:
        supercell_data = build_sample(unit_cell, atomic_positions, symmetry_operations, size)
        return f"✅ Supercell constructed successfully!\n{supercell_data}"
    except Exception as e:
        return f"❌ Error during supercell construction: {e}"

# Creamos los objetos TOOL para darselos al agente
tools = [
    Tool(name="Search Material", func=search_material_tool, description="Find crystallographic data from ChromaDB."),
    Tool(name="Build Supercell", func=build_supercell_tool, description="Construct a supercell from crystallographic data."),
    
]

#Inicializamos el agente
agent_executor = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    handle_parsing_errors=True,
    agent_kwargs={
        "system_message": (
            "You are a crystallography assistant. ALWAYS follow these steps:\n"
            "1️⃣ If the user requests crystallographic data, use the 'Search Material' tool to find it.\n"
            "2️⃣ Once 'Search Material' provides results, IMMEDIATELY call 'Present Material Info' to display them.\n"
            "3️⃣ If the user requests a supercell, use the 'Build Supercell' tool AFTER retrieving material data.\n"
            "4️⃣ NEVER keep searching if data has already been retrieved. Move to the next action.\n"
            "Always follow this sequence. Do not repeat searches unnecessarily.\n"
            "Every thought should be followed by an action, and every action should dhave it's action inputs.\n"
            "Every action input should be formated as JSON** (e.g., `{'query': 'platinum'}`).\n"
        )
    }
)

# Finciona para llamar al agente
def chatbot_rag_agent(user_query: str):
    """Handles user queries using a LangChain Agent."""
    response = agent_executor.invoke({"input": user_query})
    return response


In [56]:
# ejemplo de uso
user_query = "Search for information about pt and build a sample of 1 unit cell using your tools"
result = chatbot_rag_agent(user_query)
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo find the crystallographic data for 'pt', I should use the `Search Material` function. 

Action: Search Material
Action Input: query="pt"[0m🔍 LLM Extracted: Material_Name, Formula, Space_Group, Supercell_Size
Iron Oxide, Fe2O3, Unknown, 1

Observation: [36;1m[1;3m❌ Error: Invalid structured query format: Material_Name, Formula, Space_Group, Supercell_Size
Iron Oxide, Fe2O3, Unknown, 1[0m
Thought:[32;1m[1;3mThe error message indicates that the `Search Material` function is expecting a more specific query. However, I can see that it's providing some sample data as an example of what the input should look like. Since we're looking for information about 'pt', I'll assume it's a material name.

Action: Search Material
Action Input: query="Material_Name=pt"[0m🔍 LLM Extracted: Based on the given input "query='Material_Name=Pt'", here is the output:

Pt, Pt, P4/mmm, 1

Observation: [36;1m[1;3m❌ Error: Invalid structured qu

KeyboardInterrupt: 