In [1]:
# Check LangChain Version

# !pip install --upgrade langchain
!pip show langchain --version

Name: langchain
Version: 0.2.6
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /home/henock/Desktop/10_Academy/week_11/High-Precision-Legal-Expert-LLM/.venv/lib/python3.10/site-packages
Requires: aiohttp, async-timeout, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: langchain-community


In [7]:
import os
import nest_asyncio
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.smith import RunEvalConfig, run_on_dataset
import bs4
from docx import Document as DocxDocument
from langchain import hub
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# To Avoid the Error on Jupyter Notebook (RuntimeError: This Event Loop Is Already Running)
# Patch Asyncio To Allow Nested Event Loops

nest_asyncio.apply()

In [9]:
load_dotenv(find_dotenv())
os.environ["LANGCHAIN_API_KEY"] = str(os.getenv("LANGCHAIN_API_KEY"))
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "default"

In [8]:
# Function to load a .docx document
def load_docx_document(file_path):
    doc = DocxDocument(file_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return '\n'.join(full_text)

In [10]:
# Set the path to the specific file
# Assuming the script/notebook is in the "notebooks" directory and "data" is a sibling directory
current_working_directory = os.getcwd()
parent_directory = os.path.dirname(current_working_directory)
file_path = os.path.join(parent_directory, "data", "Raptor Contract.docx")

# Load the .docx Document
docx_text = load_docx_document(file_path)

# Convert loaded text to LangChain document format
docs = [Document(page_content=docx_text)]

In [20]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, 
    chunk_overlap=200)

# Make splits
splits = text_splitter.split_documents(docs)

In [12]:
# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

In [22]:
# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())


retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

In [23]:
docs = retriever.invoke("Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?")

In [24]:
len(docs)

2

In [16]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'))])

In [17]:
# LLM
client = Client()
llm = ChatOpenAI(model_name="gpt-4", temperature=0)

In [18]:
# Chain
chain = prompt | llm

In [27]:
# Run
chain.invoke({"context":docs,"question":"What is the purpose of the escrow? "})

AIMessage(content='The document does not provide information on the purpose of an escrow.', response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 588, 'total_tokens': 602}, 'model_name': 'gpt-4-0613', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-1c13c6b3-49eb-4eb6-b2cf-4844ce5b3aa9-0', usage_metadata={'input_tokens': 588, 'output_tokens': 14, 'total_tokens': 602})

In [5]:
# Load the LangSmith Client and Test Run

client = Client()

llm = ChatOpenAI()
llm.predict("Hello, world!")

  warn_deprecated(
  warn_deprecated(


'Hello there! How can I assist you today?'

In [6]:
# 1. Create a Dataset (Only Inputs, No Output)

example_inputs = [
    "a rap battle between Atticus Finch and Cicero",
    "a rap battle between Barbie and Oppenheimer",
    "a Pythonic rap battle between two swallows: one European and one African",
    "a rap battle between Aubrey Plaza and Stephen Colbert",
]

dataset_name = "Rap Battle Dataset"

# Storing inputs in a dataset lets us
# run chains and LLMs over a shared set of examples.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Rap battle prompts.",
)

for input_prompt in example_inputs:
    # Each example must be unique and have inputs defined.
    # Outputs are optional
    client.create_example(
        inputs={"question": input_prompt},
        outputs=None,
        dataset_id=dataset.id,)