<a href="https://colab.research.google.com/github/jdmartinezrs/RAG-system-using-LangChain/blob/main/ragfuncionalPersistenciaDatos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gradio langchain langchain-community chromadb sentence-transformers transformers pypdf pandas openpyxl python-docx




In [16]:
import gradio as gr
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.document_loaders import UnstructuredExcelLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

import os
import shutil
from google.colab import drive #NUEVO
drive.mount('/content/drive') #NUEVO
uploaded_files = []
# Crear carpetas
base_dir = "/content/drive/MyDrive/asistente_mantenimiento"  # puedes cambiar el nombre
persist_directory = os.path.join(base_dir, "chroma_index")
docs_directory = os.path.join(base_dir, "uploaded_docs")

os.makedirs(persist_directory, exist_ok=True)
os.makedirs(docs_directory, exist_ok=True)


# Embeddings
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# VectorDB
if os.path.exists(persist_directory):
    # Load existing Chroma database if the directory exists
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
else:
    # Initialize an empty Chroma database if the directory doesn't exist
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

retriever = vectordb.as_retriever()

# Cargar modelo de lenguaje
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
llm = HuggingFacePipeline(pipeline=pipe)

# Cadena QA
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Device set to use cpu


In [17]:
import os
import shutil
from langchain.document_loaders import PyPDFLoader, UnstructuredExcelLoader, UnstructuredWordDocumentLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

uploaded_files = []  # Lista global para llevar control de archivos subidos
docs_directory = "uploaded_docs"  # Carpeta donde guardas documentos procesados

def process_file(file):
    if file is None:
        return "❌ No se subió ningún archivo.", "\n".join(uploaded_files)

    try:
        filename = os.path.basename(file.name)
        ext = os.path.splitext(filename)[1].lower()
        local_path = os.path.join(docs_directory, filename)

        # Crear carpeta si no existe
        os.makedirs(docs_directory, exist_ok=True)

        # Copiar archivo localmente
        shutil.copy(file.name, local_path)

        # Selección del loader según extensión
        if ext == ".pdf":
            loader = PyPDFLoader(local_path)
        elif ext in [".xlsx", ".xlsm"]:
            loader = UnstructuredExcelLoader(local_path)
        elif ext == ".docx":
            loader = UnstructuredWordDocumentLoader(local_path)
        elif ext == ".txt":
            loader = TextLoader(local_path, encoding="utf-8")
        else:
            return f"❌ Tipo de archivo no soportado: {ext}", "\n".join(uploaded_files)

        # Cargar documentos y dividirlos
        docs = loader.load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
        chunks = splitter.split_documents(docs)

        # Añadir documentos al vector DB y persistir
        vectordb.add_documents(chunks)
        vectordb.persist()

        # Guardar nombre en lista de archivos subidos
        if filename not in uploaded_files:
            uploaded_files.append(filename)

        return f"✅ Archivo procesado: {filename}", "\n".join(uploaded_files)

    except Exception as e:
        return f"❌ Error al procesar el archivo: {str(e)}", "\n".join(uploaded_files)


def ask_question(question):
    response = qa_chain.invoke({"query": question})
    result = response["result"]
    sources = "\n".join(set(doc.metadata.get("source", "Desconocido") for doc in response["source_documents"]))
    return f"📌 **Respuesta**:\n{result}\n\n📚 **Fuentes**:\n{sources}"


In [18]:
with gr.Blocks(title="Asistente de Mantenimiento Inteligente") as demo:
    gr.Markdown("# 🤖 Asistente de Mantenimiento Inteligente\nSube documentos y haz preguntas sobre mantenimiento preventivo.")

    with gr.Tab("📤 Subir documento"):
        file_input = gr.File(label="Sube un archivo", file_types=[".pdf", ".xlsx", ".xlsm", ".txt", ".docx"])
        upload_btn = gr.Button("Procesar y guardar")
        output_upload = gr.Textbox(label="Estado de carga")
        uploaded_list = gr.Textbox(label="Archivos subidos", interactive=False, lines=10)

    with gr.Tab("💬 Hacer una pregunta"):
        question_input = gr.Textbox(label="Escribe tu pregunta")
        ask_btn = gr.Button("Responder")
        answer_output = gr.Markdown(label="Respuesta")

    upload_btn.click(fn=process_file, inputs=file_input, outputs=[output_upload, uploaded_list])
    ask_btn.click(fn=ask_question, inputs=question_input, outputs=answer_output)

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://442ae4096cac033cda.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [11]:
vectordb.delete_collection()


In [14]:
import os

# Mostrar los archivos en la carpeta chroma_index
for root, dirs, files in os.walk("/content/chroma_index"):
    for name in files:
        print(os.path.join(root, name))

In [15]:
print(vectordb._collection.count())


NotFoundError: Collection [bdfdfb26-9957-450b-bd16-e84ba206627f] does not exists.