In [35]:
from pathlib import Path
from langchain.document_loaders import Docx2txtLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
)  # recommended text splitter by langchainfrom langchain.chains import RetrievalQA
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [36]:
load_dotenv()

True

# Reduced data

 Trying with first a very reduced version of 2015. 
This version has an example of text that does not exist in 2023 (1-Click) and text that does (Family Sharing). 
Aim is to protoype the process and prompting on this small set before extending to the entire dataset


# Load documents

In [37]:
REDUCED_DATA_DIR = Path("data", "reduced")
toc_2015_fname = Path(REDUCED_DATA_DIR, "Jan 2015_reduced.docx")
toc_2023_fname = Path(REDUCED_DATA_DIR, "Mar 2023.docx")

In [38]:
loader_2015 = Docx2txtLoader(str(toc_2015_fname))  # str reqd for loader
data_2015 = loader_2015.load()
data_2015



In [39]:
loader_2023 = Docx2txtLoader(str(toc_2023_fname))
data_2023 = loader_2023.load()

data_2023

[Document(page_content='Apple Media Services Terms and Conditions\n\nThese terms and conditions create a contract between you and Apple (the “Agreement”). Please read the Agreement carefully.\n\nA. INTRODUCTION TO OUR SERVICES\n\nThis Agreement governs your use of Apple’s Services (“Services” – e.g., and where available, App Store, Apple Arcade, Apple Books, Apple Fitness+, Apple Music, Apple News, Apple News+, Apple One, Apple Podcasts, Apple Podcasts Subscriptions, Apple TV, Apple TV+, Apple TV Channels, Game Center, iTunes), through which you can buy, get, license, rent or subscribe to content, Apps (as defined below), and other in-app services (collectively, “Content”). Content may be offered through the Services by Apple or a third party. Our Services are available for your use in your country or territory of residence (“Home Country”). By creating an account for use of the Services in a particular country or territory you are specifying it as your Home Country. To use our Service

# create embeddings

In [40]:
"""
Info on chunk size and chunk overlap here
https://github.com/langchain-ai/langchain/issues/2026

GPT4 turbo has a token length of 4k, and since I dont need many return tokens, I can 
set the chunk size to quite large, 
"""
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=20,
)

In [41]:
texts_2015 = text_splitter.split_documents(data_2015)
texts_2023 = text_splitter.split_documents(data_2023)

for idx, text in enumerate(texts_2015):
    texts_2015[idx].metadata["year"] = 2015
    texts_2015[idx].metadata["sources"] = 2015
    texts_2015[idx].metadata["doc_source"] = str(idx)
for idx, text in enumerate(texts_2023):
    texts_2023[idx].metadata["year"] = 2023
    texts_2023[idx].metadata["sources"] = 2015
    texts_2023[idx].metadata["doc_source"] = str(idx)

In [42]:
texts = texts_2015 + texts_2023

In [43]:
len(texts_2015), len(texts_2023)

(6, 33)

In [44]:
len(texts)

39

In [45]:
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    collection_name="toc_collection",
    persist_directory="chroma_db",
)

# Langchain document comparison

In [60]:
# from pydantic import Field
from pydantic.v1 import BaseModel, Field

from langchain.chat_models import ChatOpenAI
from langchain.agents import Tool
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA

In [61]:
class DocumentInput(BaseModel):
    question: str = Field()


llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")


REDUCED_DATA_DIR = Path("data", "reduced")

tools = []
files = [
    {
        "name": "2015",
        "path": str(toc_2015_fname),
    },
    {
        "name": "2023",
        "path": str(toc_2023_fname),
    },
]


for file in files:
    loader = Docx2txtLoader(file["path"])
    pages = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=20,
    )
    docs = text_splitter.split_documents(pages)
    embeddings = OpenAIEmbeddings()

    retriever = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        collection_name="toc_collection",
        persist_directory="chroma_db",
    ).as_retriever()

    # Wrap retrievers in a Tool
    tools.append(
        Tool(
            args_schema=DocumentInput,
            name=file["name"],
            description=f"useful when you want to answer questions about {file['name']}",
            func=RetrievalQA.from_chain_type(llm=llm, retriever=retriever),
        )
    )

In [62]:
from langchain.agents import initialize_agent
from langchain.agents import AgentType

In [72]:
query_text = texts_2015[0].page_content

In [75]:
llm = ChatOpenAI(
    temperature=0,
    model="gpt-3.5-turbo-0613",
)

agent = initialize_agent(
    agent=AgentType.OPENAI_MULTI_FUNCTIONS, tools=tools, llm=llm, verbose=True
)

agent(
    {"input": f"does the following information exist in the 2023 document {query_text}"}
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `2023` with `{'question': 'Does the 2023 document contain the information about A. ITUNES STORE, MAC APP STORE, APP STORE, AND IBOOKS STORE TERMS OF SALE?'}`


[0m[33;1m[1;3m{'query': 'Does the 2023 document contain the information about A. ITUNES STORE, MAC APP STORE, APP STORE, AND IBOOKS STORE TERMS OF SALE?', 'result': 'Yes, the 2023 document contains information about A. ITUNES STORE, MAC APP STORE, APP STORE, AND IBOOKS STORE TERMS OF SALE.'}[0m[32;1m[1;3m
Invoking: `2023` with `{'question': 'Does the 2023 document contain the information about B. ITUNES STORE TERMS AND CONDITIONS?'}`


[0m[33;1m[1;3m{'query': 'Does the 2023 document contain the information about B. ITUNES STORE TERMS AND CONDITIONS?', 'result': 'Based on the provided context, there is no specific mention of the "B. iTunes Store Terms and Conditions" in the given information. Therefore, it is not possible to determine if the 2023 doc

KeyboardInterrupt: 