In [16]:
import os
from dotenv import load_dotenv

# Populate os.environ from the local .env file before any key is looked up
load_dotenv()

## LLM
# Resolves to None when the variable is not set
openai_api_key = os.environ.get('OPENAI_API_KEY')

## Pinecone Vector Database
# Resolves to None when the variable is not set
pinecone_api_key = os.environ.get('PINECONE_API_KEY')

In [17]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client used by the following cells to list, create and open indexes
pc = Pinecone(
    api_key=pinecone_api_key,
)


In [18]:
import time

index_name = "rag-setp-back-index" # change if desired
index_ready_timeout_seconds = 300  # give up waiting for a new index after this long

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the index only on first run so the cell can be re-executed safely
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # Must match the embedding size used when indexing
        # (text-embedding-3-small produces 1536-dimensional vectors by default)
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll until the index reports ready, but never wait forever: a stuck
    # index would otherwise hang the notebook with no feedback.
    deadline = time.monotonic() + index_ready_timeout_seconds
    while not pc.describe_index(index_name).status["ready"]:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Pinecone index '{index_name}' was not ready after "
                f"{index_ready_timeout_seconds} seconds"
            )
        time.sleep(1)

index = pc.Index(index_name)

In [19]:
# Load PDF, split it into chunks and index the chunks in Pinecone
import bs4
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
# Aliased so it does not rebind the `Pinecone` client class imported from the
# `pinecone` package; with the same name, re-running the client cell after
# this one would instantiate the vector store class instead of the client.
from langchain_community.vectorstores import Pinecone as PineconeVectorStore
from pprint import pprint

#### INDEXING ####

# Load Document (Uploading one file at a time)
pdf_file_path = "./data/langchain_turing.pdf"
loader = PyPDFLoader(pdf_file_path)

docs = loader.load()

# Upload multiple PDF files from a directory
# pdf_file_paths = <enter your path here>
# loader = PyPDFDirectoryLoader(pdf_file_paths)

# docs_dir = loader.load()

# Split (sizes are measured in tiktoken tokens, not characters)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=2000, 
    chunk_overlap=500)

# Make splits
splits = text_splitter.split_documents(docs)

# Index
# NOTE(review): this presumably re-embeds and re-uploads every chunk each time
# the cell runs, which may duplicate vectors in the index - confirm before
# re-running against a populated index.
vectorstore = PineconeVectorStore.from_documents(
    documents=splits, 
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"), 
    index_name=index_name
)


In [20]:
# Retrieval settings: at most 5 chunks, each scoring at least 0.5
retriever_search_options = {"k": 5, "score_threshold": 0.5}

retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=retriever_search_options,
)

# Step Back

![Step_Back](./images/rag_step_back.png)

Paper: [Take a Step Back: Evoking Reasoning via Abstraction in Large Language Models](https://arxiv.org/pdf/2310.06117)

In [21]:
from pydantic import BaseModel, Field

# Structured-output schema for the step-back rewrite: a single string field.
# Documented with comments rather than a class docstring because pydantic
# copies a model's docstring into the schema description, and that text
# would then be sent to the LLM as well.
class StepBackQuestion(BaseModel):
    # The field description is part of the schema shown to the model, so it
    # needs to be spelled correctly ("rephrasing", previously "reprahsing").
    question: str = Field(description="The question derived by stepping back and rephrasing")

In [22]:
# Few Shot Examples
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_openai import ChatOpenAI

def step_back_function(question):
    """Rewrite a specific question as a broader "step-back" question.

    Builds a few-shot chat prompt (system instruction, two worked examples,
    then the user's question) and asks gpt-4o-mini for a structured
    ``StepBackQuestion`` result.

    Args:
        question: The original, specific user question.

    Returns:
        The generated step-back question as a string.
    """
    examples = [
        {
            "input": "Could the members of The Police perform lawful arrests?",
            "output": "what can the members of The Police do?",
        },
        {
            "input": "Jan Sindel’s was born in what country?",
            "output": "what is Jan Sindel’s personal history?",
        },
    ]
    # We now transform these to example messages
    example_prompt = ChatPromptTemplate.from_messages(
        [
            ("human", "{input}"),
            ("ai", "{output}"),
        ]
    )
    few_shot_prompt = FewShotChatMessagePromptTemplate(
        example_prompt=example_prompt,
        examples=examples,
    )
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer. Here are a few examples:""",
            ),
            # Few shot examples
            few_shot_prompt,
            # New question
            ("user", "{question}"),
        ]
    )

    llm = ChatOpenAI(model = "gpt-4o-mini", temperature=1)
    structured_llm = llm.with_structured_output(StepBackQuestion)

    # Send real chat messages. `prompt.format(...)` would flatten the system
    # instruction and the few-shot human/ai turns into one plain string, which
    # the model then receives as a single human message with the roles lost.
    messages = prompt.format_messages(question=question)
    step_back_question = structured_llm.invoke(messages)

    return step_back_question.question


In [23]:
# Example query used to demonstrate the step-back rewrite
question = (
    "How does LangChain ensure security when integrating external services "
    "like vector databases and API providers in LLM applications?"
)
step_back_question = step_back_function(question)
step_back_question

'What measures are taken to ensure security when integrating external services in applications?'

In [24]:

def generate_response(question):
    """Answer a question using step-back retrieval-augmented generation.

    Retrieves context for both the original question and its step-back
    rewrite, then asks gpt-4o-mini to answer the original question with
    both contexts in the prompt.

    Args:
        question: The original user question.

    Returns:
        The text content of the model's answer.
    """

    # Response prompt 
    response_prompt_template = """You are an expert of world knowledge. I am going to ask you a question. Your response should be comprehensive and not contradicted with the following context if they are relevant. Otherwise, ignore them if they are not relevant.

    # {normal_context}
    # {step_back_context}

    # Original Question: {question}
    # Answer:"""

    step_back_question = step_back_function(question)

    normal_docs = retriever.invoke(question)
    step_back_docs = retriever.invoke(step_back_question)

    # Put only the chunk text into the prompt. Formatting the Document lists
    # directly would embed their Python repr (metadata, class names) instead.
    normal_context = "\n\n".join(doc.page_content for doc in normal_docs)
    step_back_context = "\n\n".join(doc.page_content for doc in step_back_docs)

    llm = ChatOpenAI(model = "gpt-4o-mini", temperature=1)

    response = llm.invoke(
        response_prompt_template.format(
            question = question,
            normal_context = normal_context,
            step_back_context = step_back_context,
        )
    )

    return response.content

In [25]:
# Same example query, now run through the full step-back RAG pipeline
question = (
    "How does LangChain ensure security when integrating external services like "
    "vector databases and API providers in LLM applications?"
)

answer = generate_response(question)

In [26]:
from IPython.display import Markdown
# Wrap the answer in a Markdown object; as the cell's last expression it is
# shown using that object's rich display instead of a raw string repr.
Markdown(answer)

LangChain addresses security concerns related to its integration with external services—such as vector databases and API providers—through a multi-faceted approach that includes several best practices and internal controls:

1. **Granular Permissions**: LangChain adopts the principle of least privilege, allowing developers to set specific permissions for various components of their applications. This minimizes the risk of unauthorized access or actions within the application. By restricting access based on user roles and application context, it enhances security when interacting with external services.

2. **Sandboxing**: To mitigate the risks associated with exposing sensitive data to external providers, LangChain employs sandboxed environments. This layered security measure helps isolate interactions, ensuring that even if a vulnerability occurs, the impact on the overall application and sensitive data is limited.

3. **Data Encryption**: Although the document emphasizes the need for stringent encryption, it suggests that LangChain includes basic data security measures. For applications handling particularly sensitive information, it advocates for advanced encryption methodologies, such as end-to-end encryption and field-level encryption, to safeguard data even within trusted environments.

4. **Auditability and Monitoring**: Using tools like LangSmith, LangChain provides detailed logging and monitoring capabilities. This feature allows developers to track application usage and detect anomalies in real-time, which is crucial for identifying potential security breaches early.

5. **Proactive Security Analytics**: The framework can integrate predictive analytics to identify potential risks before they materialize. By analyzing application logs and monitoring for unusual patterns, machine learning models can flag anomalies indicative of security issues, enabling proactive responses to threats.

6. **External Provider Vetting**: Given the reliance on third-party services, LangChain emphasizes the importance of thoroughly vetting these providers. This includes evaluating their security protocols and continuously monitoring their infrastructures to detect any vulnerabilities or breaches that could impact LangChain applications.

Overall, while LangChain has established security measures to manage the complexities arising from external integrations, ongoing improvements—such as dynamic permission adjustments during application runtime and enhanced encryption practices—are suggested to bolster security further, especially in compliance-sensitive sectors like finance and healthcare.