# 1. Set up Pinecone & Langchain

## 1.1 Credentials - get API key

In [None]:
import os
from getpass import getpass

from pinecone import Pinecone
import langchain

In [None]:
# VectorDB
import getpass # For secure password input
import os

from pinecone import Pinecone

os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY") or getpass(
    "Enter your Pinecone API key: "
)

pinecone_api_key = os.getenv("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)

## 1.2 Initialization
### 1.2.1 Check for index - create index

Index is a workspace inside of Pinecone DB - a namespace is a folder inside an index

In [None]:
index_name = "llama-text-embed-v2"
if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model": "llama-text-embed-v2",
            "field_map": {"text": "text"}
        }
    )
index = pc.Index(index_name)

### 1.2.2 Create Langchain Wrapper for Pinecone Embedding model

Sources:
- [Langchain Huggingface Docs](https://python.langchain.com/api_reference/huggingface/embeddings/langchain_huggingface.embeddings.huggingface.HuggingFaceEmbeddings.html)
- [Langchain Pinecone Embeddings Docs](https://python.langchain.com/docs/integrations/text_embedding/pinecone/)
- [Langchain Pinecone Docs](https://python.langchain.com/docs/integrations/vectorstores/pinecone/#initialization)
- [Pinecone llama-text-embed-v2 docs](https://docs.pinecone.io/models/llama-text-embed-v2)

In [None]:
from langchain_pinecone import PineconeEmbeddings

embeddings = PineconeEmbeddings(model="llama-text-embed-v2")

### 1.2.3 Init Langchain Vectorstore

In [None]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=index, embedding=embeddings)

delete all data in db before adding new facts & rules

In [None]:
vector_store._index.delete(delete_all=True)

Every Document has:
- .page_content → the text
- .metadata → a dictionary

Note:

Facts & Rules sind ChatGPT generiert mit folgendem Prompt:

"give me 5 facts and 5 rules about siemens and a prompt for testing"

In [7]:
from uuid import uuid4

from langchain_core.documents import Document

# --- Facts ---
document_1 = Document(
    page_content="Siemens was founded in 1847 by Werner von Siemens in Berlin, Germany.",
    metadata={"source": "facts"},
)

document_2 = Document(
    page_content="Siemens is a global leader in electrification, automation, and digitalization solutions.",
    metadata={"source": "facts"},
)

document_3 = Document(
    page_content="Siemens operates in more than 190 countries worldwide.",
    metadata={"source": "facts"},
)

document_4 = Document(
    page_content="Siemens Energy, a spin-off from Siemens AG, focuses on power generation and transmission.",
    metadata={"source": "facts"},
)

document_5 = Document(
    page_content="The Siemens logo is one of the most recognizable industrial logos in the world.",
    metadata={"source": "facts"},
)

# --- Rules ---
document_6 = Document(
    page_content="If a company provides automation and digitalization solutions, then it supports Industry 4.0.",
    metadata={"source": "rules"},
)

document_7 = Document(
    page_content="If a firm has operations in more than 100 countries, then it is considered a multinational corporation.",
    metadata={"source": "rules"},
)

document_8 = Document(
    page_content="If a business is a leader in electrification, then it contributes to energy transition and grid modernization.",
    metadata={"source": "rules"},
)

document_9 = Document(
    page_content="If a company spins off a specialized division, then that division operates as an independent entity.",
    metadata={"source": "rules"},
)

document_10 = Document(
    page_content="If a brand is widely recognized worldwide, then it has strong global brand equity.",
    metadata={"source": "rules"},
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]
vector_store.add_documents(documents=documents, ids=uuids)

['f97ca86b-86e6-46c9-91f5-81f0758a1af6',
 '68971698-ea1c-436c-b456-92a4eb924897',
 '116f6a70-8431-4f21-9bfd-303cd1c910ae',
 '4c66da4d-6760-4778-b419-9eb2eafb81d7',
 'a76c873b-7f47-4af8-9f6c-87dee57cc85a',
 '70deaecd-9358-49a9-91b2-b1ab544d4f6b',
 '607758eb-ff4c-4ecc-b31d-174a8977a0b1',
 'de14207b-b70e-4819-b204-8a14f4b5b1db',
 '47c4861d-8851-4055-baf2-e74fbb25572c',
 'd892e8db-9bcf-46ec-a873-321116a5d476']

In [8]:
prompt = "Why is Siemens considered an important player in the global energy transition?"

### 1.2.4 Test Data & Test code - Ignore whole SECTION while run

In [9]:
# docs = [
#     "Apple is a popular fruit known for its sweetness and crisp texture.",
#     "The tech company Apple is known for its innovative products like the iPhone.",
#     "Many people enjoy eating apples as a healthy snack.",
#     "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces.",
#     "An apple a day keeps the doctor away, as the saying goes.",
# ]

In [10]:
# doc_embeds = embeddings.embed_documents(docs)
# doc_embeds

In [11]:
# query = "Tell me about the tech company known as Apple"
# query_embed = embeddings.embed_query(query)
# query_embed

### 1.2.5 Similarity Search
#### 1.2.5.1 Test - Ignore while run

In [12]:
# results = vector_store.similarity_search(
#     "LangChain provides abstractions to make working with LLMs easy",
#     k=2,
#     filter={"source": "tweet"},
# )
# for res in results:
#     print(f"* {res.page_content} [{res.metadata}]")

#### 1.2.5.2 Implementation

In [13]:
relevant_docs = vector_store.similarity_search(
    prompt,
    k = 4,
)

In [14]:
print("Relevant documents found:", relevant_docs)

Relevant documents found: [Document(id='d5c591ea-5eaa-4e6e-87aa-862c3623b565', metadata={'source': 'facts'}, page_content='Siemens is a global leader in electrification, automation, and digitalization solutions.'), Document(id='70d8aa41-a231-4d8e-9db5-08ca0161f5a3', metadata={'source': 'rules'}, page_content='If a business is a leader in electrification, then it contributes to energy transition and grid modernization.'), Document(id='c9d2f78b-c62c-48a1-8a51-bee5b0dcd37d', metadata={'source': 'facts'}, page_content='Siemens Energy, a spin-off from Siemens AG, focuses on power generation and transmission.'), Document(id='80c74245-6cf3-484a-8321-b9d7b0e937c5', metadata={'source': 'facts'}, page_content='The Siemens logo is one of the most recognizable industrial logos in the world.')]


In [15]:
facts = [doc for doc in relevant_docs if doc.metadata.get("source") == "facts"]
rules = [doc for doc in relevant_docs if doc.metadata.get("source") == "rules"]

In [16]:
facts_str = "\n".join([f"- {doc.page_content}" for doc in facts])
rules_str = "\n".join([f"- {doc.page_content}" for doc in rules])

# 2. Gemini LLM

## 2.1 Init Gemini LLM
Sources:
- [Langchain LLM](https://python.langchain.com/api_reference/core/language_models/langchain_core.language_models.llms.LLM.html)
- [Google AI Langchain LLM](https://python.langchain.com/docs/integrations/providers/google/#llms)

In [17]:
from langchain_google_genai import GoogleGenerativeAI

llm = GoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key= os.getenv("GOOGLE_API_KEY")
    )

E0000 00:00:1758760839.504238 8159005 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


## 2.2 Prompt Template
Sources:
- [Prompt Template LC](https://python.langchain.com/docs/concepts/prompt_templates/)

In [18]:
from langchain_core.prompts import PromptTemplate

prompt_template = PromptTemplate.from_template(
    """Based only on the following retrieved relevant facts:
        {facts}
    
        and rules:
        {rules}

        answer the following question:
        {prompt}
    """
)

message = prompt_template.invoke({
    "facts": facts_str,
    "rules": rules_str,
    "prompt": prompt
    })

In [19]:
response = llm.invoke(message)

In [20]:
print(relevant_docs, "\n")
print("Facts used:\n", facts_str, "\n")
print("Rules used:\n", rules_str, "\n")
print("Prompt:\n", prompt, "\n")

print("Final Response from LLM:", response)

[Document(id='d5c591ea-5eaa-4e6e-87aa-862c3623b565', metadata={'source': 'facts'}, page_content='Siemens is a global leader in electrification, automation, and digitalization solutions.'), Document(id='70d8aa41-a231-4d8e-9db5-08ca0161f5a3', metadata={'source': 'rules'}, page_content='If a business is a leader in electrification, then it contributes to energy transition and grid modernization.'), Document(id='c9d2f78b-c62c-48a1-8a51-bee5b0dcd37d', metadata={'source': 'facts'}, page_content='Siemens Energy, a spin-off from Siemens AG, focuses on power generation and transmission.'), Document(id='80c74245-6cf3-484a-8321-b9d7b0e937c5', metadata={'source': 'facts'}, page_content='The Siemens logo is one of the most recognizable industrial logos in the world.')] 

Facts used:
 - Siemens is a global leader in electrification, automation, and digitalization solutions.
- Siemens Energy, a spin-off from Siemens AG, focuses on power generation and transmission.
- The Siemens logo is one of the mo