In [None]:
import os
from dotenv import load_dotenv

# Read the .env file so its entries become available as environment variables
load_dotenv()

## LLM
openai_api_key = os.environ.get("OPENAI_API_KEY")

## Pinecone Vector Database
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

In [2]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client, authenticated with the API key read from the environment above.
pc = Pinecone(api_key=pinecone_api_key)


  from tqdm.autonotebook import tqdm


In [3]:
import time

# Name of the Pinecone index this notebook reads from and writes to.
index_name = "rag-setp-back-index"  # change if desired

# Names of every index currently visible to this client.
existing_indexes = [info["name"] for info in pc.list_indexes()]

if index_name not in existing_indexes:
    # NOTE(review): dimension=1536 presumably has to match the embedding model
    # used when indexing documents — confirm before changing either one.
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the new index reports itself ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Handle used for all subsequent operations on the index.
index = pc.Index(index_name)

In [4]:
# Load the source PDF, split it into chunks, embed the chunks and store them in Pinecone
import bs4
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
# Aliased on purpose: importing this class as plain `Pinecone` would rebind the
# name the client cell imported from the `pinecone` package, so re-running that
# cell afterwards would construct the wrong class.
from langchain_community.vectorstores import Pinecone as PineconeVectorStore
from pprint import pprint

#### INDEXING ####

# Load Document (Uploading one file at a time)
pdf_file_path = "./data/langchain_turing.pdf"
loader = PyPDFLoader(pdf_file_path)

docs = loader.load()

# Upload multiple PDF files from a directory
# pdf_file_paths = <enter your path here>
# loader = PyPDFDirectoryLoader(pdf_file_paths)

# docs_dir = loader.load()

# Split (chunk size and overlap are measured by the tiktoken-based length function)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=2000,
    chunk_overlap=500,
)

# Make splits
splits = text_splitter.split_documents(docs)

# Index
# NOTE(review): re-running this cell presumably uploads the same chunks again
# and may leave duplicates in the index — confirm before re-executing.
vectorstore = PineconeVectorStore.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    index_name=index_name,
)


In [11]:
# Retriever over the indexed chunks.
# NOTE(review): presumably returns at most 5 matches and drops those scoring
# below 0.5 — confirm against the vector store's retriever documentation.
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=dict(k=5, score_threshold=0.5),
)

# Step Back

![Step_Back](./images/rag_step_back.png)

Paper: [Take a Step Back: Evoking Reasoning via Abstraction in Large Language Models](https://arxiv.org/pdf/2310.06117)

In [7]:
from pydantic import BaseModel, Field

# Schema for the structured reply requested from the LLM when it rewrites a
# question: a single string field holding the step-back question.
class StepBackQuestion(BaseModel):
    # Human-readable description attached to the field (typo "reprahsing" fixed).
    question: str = Field(description="The question derived by stepping back and rephrasing")

In [8]:
# Few Shot Examples
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_openai import ChatOpenAI

def step_back_function(question):
    """Rewrite a specific question as a more generic "step-back" question.

    Builds a chat prompt made of a system instruction, two few-shot
    human/AI example pairs and the new question, then asks ``gpt-4o-mini``
    for a reply structured as ``StepBackQuestion``.

    Args:
        question: The original user question to generalize.

    Returns:
        The ``question`` attribute of the structured reply.
    """
    examples = [
        {
            "input": "Could the members of The Police perform lawful arrests?",
            "output": "what can the members of The Police do?",
        },
        {
            "input": "Jan Sindel’s was born in what country?",
            "output": "what is Jan Sindel’s personal history?",
        },
    ]
    # We now transform these to example messages
    example_prompt = ChatPromptTemplate.from_messages(
        [
            ("human", "{input}"),
            ("ai", "{output}"),
        ]
    )
    few_shot_prompt = FewShotChatMessagePromptTemplate(
        example_prompt=example_prompt,
        examples=examples,
    )
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer. Here are a few examples:""",
            ),
            # Few shot examples
            few_shot_prompt,
            # New question
            ("user", "{question}"),
        ]
    )

    llm = ChatOpenAI(model = "gpt-4o-mini", temperature=1)
    structured_llm = llm.with_structured_output(StepBackQuestion)

    # Pipe the prompt into the model rather than calling `prompt.format(...)`:
    # `format` collapses the chat prompt into one plain string, so the system
    # instruction and few-shot turns would reach the model as a single human
    # message instead of as separate role-tagged messages.
    step_back_chain = prompt | structured_llm
    step_back_question = step_back_chain.invoke({"question": question})

    return step_back_question.question


In [10]:
# Example: rewrite a specific question into its broader step-back form.
question = "How does LangChain ensure security when integrating external services like vector databases and API providers in LLM applications?"
step_back_question = step_back_function(question)
# Bare last expression so the notebook displays the rewritten question.
step_back_question

'how does LangChain manage security in its integrations?'

In [None]:

def _format_docs(docs):
    """Join the text of retrieved documents into one prompt-ready string."""
    return "\n\n".join(doc.page_content for doc in docs)


def generate_response(question):
    """Answer ``question`` using context retrieved for it and for its step-back form.

    The question is first generalized with ``step_back_function``; the
    retriever is then queried with both the original and the step-back
    question, and both result sets are placed in the response prompt sent
    to ``gpt-4o-mini``.

    Args:
        question: The original user question.

    Returns:
        The ``content`` of the model's reply.
    """

    # Response prompt 
    response_prompt_template = """You are an expert of world knowledge. I am going to ask you a question. Your response should be comprehensive and not contradicted with the following context if they are relevant. Otherwise, ignore them if they are not relevant.

    # {normal_context}
    # {step_back_context}

    # Original Question: {question}
    # Answer:"""

    step_back_question = step_back_function(question)

    # Insert the documents' text rather than the list itself: formatting the
    # raw list would embed its Python repr (class names, metadata) in the prompt.
    normal_context = _format_docs(retriever.invoke(question))
    step_back_context = _format_docs(retriever.invoke(step_back_question))

    llm = ChatOpenAI(model = "gpt-4o-mini", temperature=1)

    response = llm.invoke(
        response_prompt_template.format(
            question=question,
            normal_context=normal_context,
            step_back_context=step_back_context,
        )
    )

    return response.content

In [14]:
# Example: run the full step-back RAG pipeline on a sample question.
question = "How does LangChain ensure security when integrating external services like vector databases and API providers in LLM applications?"

answer = generate_response(question)

In [15]:
from IPython.display import Markdown
# Render the answer as Markdown so its lists and bold text display formatted.
Markdown(answer)

LangChain incorporates several security measures to address the risks associated with integrating external services such as vector databases and API providers. Here are the key aspects of its security framework:

1. **Granular Permissions**: LangChain enforces the principle of least privilege by allowing developers to set specific permissions. This minimizes the risk of unauthorized actions by limiting what external services can access or modify within the application. By configuring permissions carefully, developers can protect sensitive data and functionality.

2. **Sandboxing**: LangChain employs sandboxing techniques to create isolated environments for executing code and accessing external services. This approach limits the exposure of sensitive information and reduces the risk of security vulnerabilities that could arise from integrating third-party services.

3. **Defense in Depth**: The architecture includes multiple layers of security controls, ensuring that even if one layer is bypassed, others still provide protection. This layered security strategy strengthens the overall resilience of LLM applications against potential attacks.

4. **Data Encryption**: For applications handling sensitive data, LangChain advocates for the implementation of robust encryption standards, such as end-to-end encryption. This practice helps ensure that data remains secure while being transmitted to and from external services, reducing the risk of data exposure.

5. **Auditability and Monitoring**: LangChain features monitoring capabilities through components like LangSmith, which offers logging and tracking of application usage. This enables developers to detect anomalies and track interactions with external services. Real-time monitoring can help identify potential security breaches or misuse of the application.

6. **Compliance Considerations**: Recognizing that certain sectors, such as finance and healthcare, have rigorous compliance requirements, LangChain addresses ongoing security challenges by promoting practices such as dynamic permission adjustments—where permission settings can adapt based on user interactions and roles. This is particularly crucial in contexts where compliance with data protection regulations is mandatory.

7. **Proactive Security Analytics**: The integration of predictive analytics can also enhance security by identifying risks before they manifest as actual breaches. Utilizing machine learning models to analyze application logs allows developers to flag suspicious activities or vulnerabilities early in the process.

In conclusion, while LangChain provides a comprehensive security model through these mechanisms, the inherent risks associated with relying on external providers necessitate continued vigilance and improvement in security practices to safeguard sensitive data within LLM applications.