In [3]:
import json
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline

# Root folder that holds one sub-folder of extraction output per source document.
OUTPUT_ROOT = "C:/Coding/VSCode/chatbot/output"

# Source documents to index; each one has its own "<name>/auto" output folder.
DOCUMENT_NAMES = (
    "abus_elektro-seilzug_programm",
    "abus_hb-system_programm",
    "abus_laufkran_programm",
)

# One content-list JSON file per document.
CONTENT_LIST_PATHS = [
    f"{OUTPUT_ROOT}/{name}/auto/{name}_content_list.json" for name in DOCUMENT_NAMES
]
# One images folder per document, in the same order as CONTENT_LIST_PATHS.
IMAGES_BASE_DIRS = [f"{OUTPUT_ROOT}/{name}/auto/images" for name in DOCUMENT_NAMES]

# Function to load JSON content from the given paths
def load_content(paths):
    """Load and concatenate the JSON lists stored at ``paths``.

    Args:
        paths: Iterable of file paths; each file is expected to hold a JSON
            array of content items.

    Returns:
        A single list with the items of every file that loaded successfully,
        in the order the paths were given.

    Loading is best-effort: a file that cannot be opened, cannot be decoded as
    UTF-8 JSON, or whose top-level value is not a list is reported with
    ``print`` and skipped, so one bad file does not abort the whole load.
    """
    content = []
    for path in paths:
        try:
            with open(path, "r", encoding="utf-8") as file:
                loaded = json.load(file)
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading {path}: {e}")
            continue
        if not isinstance(loaded, list):
            # list.extend() on a dict would add its keys and on a string its
            # characters, silently polluting the content list.
            print(f"Error loading {path}: expected a JSON list, got {type(loaded).__name__}")
            continue
        content.extend(loaded)
    return content

# Function to extract texts and image paths from the content list
def extract_content(content_list, image_dirs):
    """Flatten content items into parallel lists of texts and image paths.

    Args:
        content_list: Iterable of dicts. Items with ``type == "text"`` and a
            non-blank ``text`` contribute their stripped text. Items with
            ``type == "image"`` contribute their joined ``img_caption`` (or
            ``"No caption available"``). Every other item is skipped.
        image_dirs: Candidate image folders used to resolve ``img_path``.

    Returns:
        ``(texts, image_paths)``: two lists of equal length. For text items
        the image path is ``None``; for image items it is the resolved path,
        or ``None`` when the item carries no ``img_path``.
    """
    texts, image_paths = [], []

    for item in content_list:
        item_type = item.get("type")
        if item_type == "text":
            text = (item.get("text") or "").strip()
            if text:
                texts.append(text)
                image_paths.append(None)
        elif item_type == "image":
            caption = " ".join(item.get("img_caption") or []).strip() or "No caption available"
            texts.append(caption)
            img_path = item.get("img_path")
            image_paths.append(_resolve_image_path(img_path, image_dirs) if img_path else None)
    return texts, image_paths


def _resolve_image_path(img_path, image_dirs):
    """Return the normalized location of ``img_path`` under one of ``image_dirs``.

    The folders are probed on disk in order, because they cannot be told apart
    by name: a lookup keyed on the folder's basename collapses to a single
    entry when every folder is called the same (e.g. ``images``).
    """
    for base_dir in image_dirs:
        candidates = (
            # img_path relative to the images folder itself
            os.path.join(base_dir, img_path),
            # img_path relative to the folder that contains the images folder
            os.path.join(os.path.dirname(os.path.normpath(base_dir)), img_path),
            # only the file name, directly inside the images folder
            os.path.join(base_dir, os.path.basename(img_path)),
        )
        for candidate in candidates:
            if os.path.isfile(candidate):
                return os.path.normpath(candidate)

    # Nothing found on disk: keep the previous name-based computation so the
    # result is unchanged for callers that only need a best-guess path.
    if not image_dirs:
        return os.path.normpath(img_path)
    dirs_by_name = {os.path.basename(folder): folder for folder in image_dirs}
    parent_name = os.path.basename(os.path.dirname(img_path))
    base_dir = dirs_by_name.get(parent_name, image_dirs[0])
    return os.path.normpath(os.path.join(base_dir, img_path))

# Function to initialize and return a FAISS index with the given embeddings
def initialize_faiss_index(embeddings):
    """Build a flat inner-product FAISS index containing ``embeddings``.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``. It is
            converted to a C-contiguous float32 array before being added,
            which is the layout FAISS expects; an array already in that
            layout is passed through without copying.

    Returns:
        A ``faiss.IndexFlatIP`` of dimension ``dim`` holding all vectors.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example, the result of
            encoding an empty list of texts).
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n_vectors, dim), got shape {vectors.shape}"
        )
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    return index

# Function to search for the query and return an answer or relevant context
def search_and_answer(query, k=5, confidence_threshold=0.01):
    """Retrieve passages for ``query``, rerank them, and run extractive QA.

    Steps: embed the query, fetch ``k * 2`` nearest entries from the FAISS
    index, rerank those with the cross-encoder, keep the best ``k``, join
    them into one context string and pass it to the QA model.

    Uses the module-level objects ``embedding_model``, ``index``, ``texts``,
    ``reranker`` and ``qa_model``. Progress and intermediate results are
    printed.

    Args:
        query: The question to answer.
        k: Number of reranked passages used as QA context.
        confidence_threshold: Minimum QA ``score`` for returning the
            extracted answer.

    Returns:
        The extracted answer string when the QA score reaches
        ``confidence_threshold``; otherwise the context string. An empty
        string is returned when the index yields no usable passage.
    """
    print("-" * 50)
    print(f"\nSearching for: {query}, retrieving top {k * 2} results")

    # Encode the query and search the FAISS index
    query_embedding = embedding_model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    _, indices = index.search(query_embedding, k * 2)

    # FAISS pads missing hits with -1 when the index holds fewer than k * 2
    # vectors; without the lower bound, -1 would select texts[-1] as a
    # spurious hit.
    retrieved_texts = [texts[idx] for idx in indices[0] if 0 <= idx < len(texts)]
    if not retrieved_texts:
        print("No matching texts found in the index.")
        return ""

    # Rank texts using the reranker. Sort positions by score only, so equal
    # scores keep retrieval order instead of being ordered by text content.
    pairs = [[query, text] for text in retrieved_texts]
    scores = reranker.predict(pairs)
    ranked_positions = sorted(
        range(len(retrieved_texts)), key=lambda pos: scores[pos], reverse=True
    )[:k]
    sorted_texts = [retrieved_texts[pos] for pos in ranked_positions]
    context = " ".join(sorted_texts)

    # Print retrieved and ranked texts for debugging
    print("\nRetrieved texts:")
    for text in retrieved_texts:
        print(f"- {text}")
    print("\nRanked texts:")
    for text in sorted_texts:
        print(f"- {text}")
    print(f"\nUsing context: {context}\n")
    print("-" * 50)
    print(f"\nQuery: {query}")

    # Get the answer from the QA model; fall back to the raw context when the
    # model's confidence is below the threshold.
    result = qa_model(question=query, context=context)
    if result.get('score', 0) < confidence_threshold:
        print("Model is unsure about the answer. Returning context instead.")
        return context
    print(f"Answer: {result['answer']}")
    return result['answer']
    
# Read every content-list file into one flat list of items
content_list = load_content(CONTENT_LIST_PATHS)
print(f"Loaded {len(content_list)} items from JSON.")

# Split the items into parallel lists: searchable texts and image paths
# (the image path is None for plain text entries)
texts, image_paths = extract_content(content_list, IMAGES_BASE_DIRS)
print(f"Extracted {len(texts)} texts and {sum(1 for path in image_paths if path)} images.")

# Embed all texts; vectors are L2-normalized so that the inner-product index
# built below ranks by cosine similarity
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embeddings = embedding_model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
print(f"Computed embeddings shape: {embeddings.shape}")

# Second-stage reranker and extractive QA model
# NOTE(review): the ms-marco cross-encoder appears to be trained on English
# data while the corpus and QA model are German; confirm reranking quality.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
qa_model = pipeline("question-answering", model="deepset/gelectra-base-germanquad")
print("Models loaded successfully.")

# Build the vector index over the text embeddings
index = initialize_faiss_index(embeddings)
print(f"FAISS index size: {index.ntotal}\n")

Loaded 584 items from JSON.
Extracted 555 texts and 194 images.
Computed embeddings shape: (555, 384)


Device set to use cpu


Models loaded successfully.
FAISS index size: 555



In [6]:
# Example usage: run the retrieve -> rerank -> QA pipeline for a single query.
# search_and_answer() prints its intermediate results and returns either the
# extracted answer or, when the QA score is below the threshold, the context.
query = "Was bieten die Elektromagnet-Zweischeibenbremse?"
result = search_and_answer(query)

--------------------------------------------------

Searching for: Was bieten die Elektromagnet-Zweischeibenbremse?, retrieving top 10 results

Retrieved texts:
- Elektromagnet-Zweischeibenbremsen gewährleisten eine Bremsautomatik bei Netzausfall. Umweltfreundliche Bremsbeläge mit Standzeiten von ca. 1 Mio. Schaltungen sorgen für große Wartungsintervalle.
- Die Elektromagnet-Zweischeibenbremse bietet eine Bremsautomatik bei Netzausfall. Asbestfreie Bremsbeläge mit Standzeiten von ca. 1 Mio. Schaltungen verlängern die Wartungsintervalle.
- Elektro-Kettenzüge und Hochleistungskomponenten
- Motorschutzschalter
- SEILEINSCHERUNG DER ABUS ELEKTRO-SEILZÜGE
- TECHNOLOGIE DER ABUS ELEKTRO-SEILZÜGE
- Stützrollen mit Drehgelenk
- ABUS Elektro-Seilzüge GM
- bietet die Möglichkeit, zwei elektrisch gekoppelte Laufkrane zu steuern. Idealerweise werden hierzu ABUS Funksteuerungen eingesetzt. Dabei können wahlweise zwei Kranführer zwei Krane getrennt bedienen oder aber ein Kranführer beide Krane einze