# Plan

1. Use GPT to summarise full doc with map-reduce method
2. Use Davinci to summarise full doc with map-reduce method
3. Use GPT to summarise sections by title
4. Try Vector Summarisation

In [178]:
from pathlib import Path
from dotenv import load_dotenv
from langchain.document_loaders import Docx2txtLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from typing import Optional

In [2]:
# Load environment variables from a .env file before any model client is created
# (presumably this supplies the OpenAI API key — confirm against the project's .env).
load_dotenv()

True

# Load Documents

In [6]:
# Directory that receives every generated summary file. It is created here so that
# the later `open(..., "w")` calls do not fail with FileNotFoundError on a fresh checkout.
SUMMARY_SAVE_DIR = Path("data", "summaries")
SUMMARY_SAVE_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Raw Word documents live under data/raw; one file per T&Cs revision.
DATA_DIR = Path("data") / "raw"
toc_2015_fname = DATA_DIR / "Jan 2015.docx"
toc_2023_fname = DATA_DIR / "Mar 2023.docx"

In [5]:
# Read both .docx files into LangChain documents. Later cells read
# `data_2015[0].page_content`, i.e. the text of the first loaded document.
loader_2015 = Docx2txtLoader(str(toc_2015_fname))  # str reqd for loader
data_2015 = loader_2015.load()
loader_2023 = Docx2txtLoader(str(toc_2023_fname))
data_2023 = loader_2023.load()

# Whole Document - GPT

In [8]:
# Chat model used for the GPT-4 summaries. temperature=0 matches the `davinci`
# model configured later in this notebook, so the two sets of summaries are
# produced under the same sampling setting and re-runs are as repeatable as possible.
gpt4 = ChatOpenAI(model_name="gpt-4", temperature=0)
# gpt4 has up to 8,192 tokens

## Map Reduce Template

In [12]:
# Map: prompt applied to each chunk on its own. `{docs}` is filled with the chunk text
# (it matches `document_variable_name="docs"` on the MapReduceDocumentsChain).
map_template = """
Write a concise summary of the following:
"{docs}"
CONCISE SUMMARY:
"""

# Reduce: prompt applied to the combined per-chunk summaries. `{doc_summaries}` matches
# `document_variable_name="doc_summaries"` on the StuffDocumentsChain.
reduce_template = """Write a concise summary of the following text delimited by triple backquotes.
Return your response in bullet points which covers the key points of the text.
```{doc_summaries}```
BULLET POINT SUMMARY:"""

## Map Reduce Chain

In [14]:
# Build the GPT-4 map-reduce summarisation pipeline and a matching text splitter.

# Map step: summarise each chunk independently.
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=gpt4, prompt=map_prompt)

# Reduce step: summarise the concatenated chunk summaries as bullet points.
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=gpt4, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is the final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # Used if the documents exceed the context for `StuffDocumentsChain`
    # (the same chain is reused for collapsing).
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # False: only the final summary is returned, not the per-chunk map results
    return_intermediate_steps=False,
)

# Token-based splitter; chunk_size matches token_max above.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000, chunk_overlap=0
)

In [17]:
# Chunk each full document with the splitter currently bound to `text_splitter`.
split_2015 = text_splitter.split_documents(data_2015)
split_2023 = text_splitter.split_documents(data_2023)

In [23]:
# Token count of every chunk, as measured by the GPT-4 model wrapper.
token_lengths_2015 = [gpt4.get_num_tokens(chunk.page_content) for chunk in split_2015]

token_lengths_2023 = [gpt4.get_num_tokens(chunk.page_content) for chunk in split_2023]

In [24]:
# Report how many chunks each document produced and the token length of each chunk.
# The 2023 message previously lacked the space after "docs.", unlike the 2015 one.
print(
    f"2015 T&Cs has been split into {len(split_2015)} docs. Made up of {token_lengths_2015} length tokens."
)
print(
    f"2023 T&Cs has been split into {len(split_2023)} docs. Made up of {token_lengths_2023} length tokens."
)

2015 T&Cs has been split into 5 docs. Made up of [3693, 3626, 3732, 3771, 3738] length tokens.
2023 T&Cs has been split into 3 docs.Made up of [3601, 3479, 3199] length tokens.


In [25]:
# Run the map-reduce chain over each document's chunks. The results are written to
# text files and word-counted below, so `.run` is expected to return a string here.
output_map_reduce_15 = map_reduce_chain.run(split_2015)
output_map_reduce_23 = map_reduce_chain.run(split_2023)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-4 in organization org-1ivxXVFNax3ENVHReVeOJ0Iz on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-4 in organization org-1ivxXVFNax3ENVHReVeOJ0Iz on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-4 in organization org-1ivxXVFNax3ENVHReVeOJ0Iz on tokens per min. Limit: 10000 / m

In [31]:
# Persist the GPT-4 whole-document summaries. The encoding is pinned to UTF-8 so that
# non-ASCII characters in the model output cannot fail (or be written differently)
# under a platform-specific default encoding.
with open(
    Path(SUMMARY_SAVE_DIR, "gpt4_map_reduce_summarized_2015.txt"), "w", encoding="utf-8"
) as f:
    f.write(output_map_reduce_15)
with open(
    Path(SUMMARY_SAVE_DIR, "gpt4_map_reduce_summarized_2023.txt"), "w", encoding="utf-8"
) as f:
    f.write(output_map_reduce_23)

In [30]:
# Whitespace-delimited word count of each GPT-4 summary.
print(f"2015 map-reduce summary contains {len(output_map_reduce_15.split())} words")
print(f"2023 map-reduce summary contains {len(output_map_reduce_23.split())} words")

2015 map-reduce summary contains 328 words
2023 map-reduce summary contains 280 words


# Whole Document - Davinci

In [210]:
# Completion-style model used as the comparison point for the GPT-4 summaries.
# NOTE(review): davinci-002 looks like a base (non-instruction-tuned) model — confirm
# it follows the summarisation prompts well enough for this comparison.
davinci = OpenAI(temperature=0, max_tokens=6000, model="davinci-002")
# Davinci allows up to 16,384 tokens
# Can also try text-davinci-003 but need to adjust token count

In [40]:
# Rebuild the same map-reduce pipeline with `davinci` as the LLM and a 6000-token
# budget. This rebinds the module-level names used by the GPT-4 pipeline.

# Map step: summarise each chunk independently.
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=davinci, prompt=map_prompt)

# Reduce step: summarise the concatenated chunk summaries as bullet points.
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=davinci, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is the final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # Used if the documents exceed the context for `StuffDocumentsChain`
    # (the same chain is reused for collapsing).
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=6000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # False: only the final summary is returned, not the per-chunk map results
    return_intermediate_steps=False,
)

# Token-based splitter; chunk_size matches token_max above.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=6000, chunk_overlap=0
)

Going to recreate the chain here, because davinci allows up to 16k tokens

In [41]:
# Re-split the documents with the 6000-token splitter that was built together with the
# davinci chain. The existing `split_2015` / `split_2023` were cut at 4000 tokens for
# GPT-4, so reusing them would leave the larger davinci chunk size unused.
split_2015_davinci = text_splitter.split_documents(data_2015)
split_2023_davinci = text_splitter.split_documents(data_2023)

# Run the davinci map-reduce chain over each document's chunks.
output_map_reduce_davinci_15 = map_reduce_chain.run(split_2015_davinci)
output_map_reduce_davinci_23 = map_reduce_chain.run(split_2023_davinci)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-4 in organization org-1ivxXVFNax3ENVHReVeOJ0Iz on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-4 in organization org-1ivxXVFNax3ENVHReVeOJ0Iz on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-4 in organization org-1ivxXVFNax3ENVHReVeOJ0Iz on tokens per min. Limit: 10000 / m

In [42]:
# Persist the davinci whole-document summaries. The encoding is pinned to UTF-8 so that
# non-ASCII characters in the model output cannot fail (or be written differently)
# under a platform-specific default encoding.
with open(
    Path(SUMMARY_SAVE_DIR, "davinci_map_reduce_summarized_2015.txt"),
    "w",
    encoding="utf-8",
) as f:
    f.write(output_map_reduce_davinci_15)
with open(
    Path(SUMMARY_SAVE_DIR, "davinci_map_reduce_summarized_2023.txt"),
    "w",
    encoding="utf-8",
) as f:
    f.write(output_map_reduce_davinci_23)

In [43]:
# Whitespace-delimited word count of each davinci summary.
print(
    f"2015 davinci map-reduce summary contains {len(output_map_reduce_davinci_15.split())} words"
)
print(
    f"2023 davinci map-reduce summary contains {len(output_map_reduce_davinci_23.split())} words"
)

2015 davinci map-reduce summary contains 360 words
2023 davinci map-reduce summary contains 261 words


# Summarising by section

The plan here is to manually split the documents into sections that make sense and have the LLM summarise each section, in the hope that more of the relevant key information is retained.

## Splitting docs by sections

In [110]:
# Plain text of each document, taken from the first element returned by the loader.
raw_text_2015 = data_2015[0].page_content
raw_text_2023 = data_2023[0].page_content

In [111]:
# Whitespace-delimited word count of each raw document.
print(f"2015 T&S contain {len(raw_text_2015.split())} words")
print(f"2023 T&S contain {len(raw_text_2023.split())} words")

2015 T&S contain 15399 words
2023 T&S contain 8386 words


## GPT

### 2015 Sections

In [112]:
# Headings used as cut points for the 2015 text: each section runs from one heading
# to the next, and the last one runs to the end of the text.
delimiters_2015 = [
    "A. ITUNES STORE, MAC APP STORE, APP STORE AND IBOOKS STORE TERMS OF SALE",
    "B. ITUNES STORE TERMS AND CONDITIONS",
    "C. MAC APP STORE, APP STORE AND IBOOKS STORE TERMS AND CONDITIONS",
]

In [143]:
# Remove the first 200 characters, because the initial text matches the delimiters.
# The slice is taken from the loaded document rather than from `raw_text_2015` itself,
# so re-running this cell always yields the same text instead of trimming a further
# 200 characters each time.
raw_text_2015 = data_2015[0].page_content[200:]

In [175]:
def string_between_substrings(input_str: str, start: str, end: Optional[str]) -> str:
    """Return the part of ``input_str`` that lies between two marker substrings.

    Only the first occurrence of ``start`` is used as the opening boundary. Later
    repetitions of ``start`` are kept as part of the result rather than silently
    cutting it short.

    Args:
        input_str: Text to search.
        start: Marker that precedes the wanted text.
        end: Marker that follows the wanted text, or ``None`` to take everything up
            to the end of ``input_str``.

    Returns:
        The text after the first ``start`` and before the first ``end`` that follows
        it. If ``end`` is given but does not occur after ``start``, everything after
        ``start`` is returned.

    Raises:
        ValueError: If ``start`` does not occur in ``input_str``.
    """
    _, matched_start, remainder = input_str.partition(start)
    if not matched_start:
        raise ValueError(f"start marker {start!r} not found in input string")
    if end is None:
        return remainder
    # partition() returns the whole remainder as element 0 when `end` is absent.
    return remainder.partition(end)[0]

In [176]:
# Carve the 2015 text into its three lettered parts, using consecutive headings
# as the opening and closing boundaries.
a_section = string_between_substrings(raw_text_2015, *delimiters_2015[0:2])
b_section = string_between_substrings(raw_text_2015, *delimiters_2015[1:3])
# The last part has no closing heading.
c_section = string_between_substrings(raw_text_2015, delimiters_2015[2], end=None)

In [145]:
# Whitespace-delimited word count of each 2015 section.
print(
    f"""A Section length: {len(a_section.split())} words,
      B Section length: {len(b_section.split())} words,
      C Section length: {len(c_section.split())} words """
)

A Section length: 2280 words,
      B Section length: 5186 words,
      C Section length: 7767 words 


In [147]:
# Rebind `text_splitter` to 4000-token chunks for the per-section runs
# (it was last set to 6000 for the davinci chain).
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000, chunk_overlap=0
)

In [152]:
# Wrap each 2015 section's text as documents, then pass those documents through the
# splitter so every piece handed to the chain respects the configured chunk size.
# NOTE(review): `create_documents` appears to chunk the text already, so the second
# pass may be redundant — confirm against the LangChain splitter documentation.
a_doc_2015, b_doc_2015, c_doc_2015 = (
    text_splitter.create_documents([section_text])
    for section_text in (a_section, b_section, c_section)
)

a_split_2015, b_split_2015, c_split_2015 = (
    text_splitter.split_documents(section_docs)
    for section_docs in (a_doc_2015, b_doc_2015, c_doc_2015)
)

In [None]:
# Rebuild the map-reduce pipeline with `gpt4` as the LLM for the per-section
# summaries. This rebinds the module-level names last set by the davinci pipeline.

# Map step: summarise each chunk independently.
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=gpt4, prompt=map_prompt)

# Reduce step: summarise the concatenated chunk summaries as bullet points.
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=gpt4, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is the final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # Used if the documents exceed the context for `StuffDocumentsChain`
    # (the same chain is reused for collapsing).
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # False: only the final summary is returned, not the per-chunk map results
    return_intermediate_steps=False,
)

# Token-based splitter; chunk_size matches token_max above.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000, chunk_overlap=0
)

In [153]:
# Summarise 2015 section A with the chain currently bound to `map_reduce_chain`.
output_map_reduce_15_a = map_reduce_chain.run(a_split_2015)

In [155]:
# Summarise 2015 section B with the chain currently bound to `map_reduce_chain`.
output_map_reduce_15_b = map_reduce_chain.run(b_split_2015)

In [156]:
# Summarise 2015 section C with the chain currently bound to `map_reduce_chain`.
output_map_reduce_15_c = map_reduce_chain.run(c_split_2015)

In [157]:
# Stitch the three 2015 section summaries together, one per line, in A-B-C order.
sections_2015_summary = "\n".join(
    [output_map_reduce_15_a, output_map_reduce_15_b, output_map_reduce_15_c]
)

In [158]:
# Persist the combined 2015 section summaries. The encoding is pinned to UTF-8 so that
# non-ASCII characters in the model output cannot fail (or be written differently)
# under a platform-specific default encoding.
with open(
    Path(SUMMARY_SAVE_DIR, "2015_sections_summary.txt"), "w", encoding="utf-8"
) as f:
    f.write(sections_2015_summary)

### 2023 Sections

In [192]:
# Headings used as cut points for the 2023 text. Only the uncommented headings are
# active; the commented-out headings are kept as a record of the document's full
# outline and fall inside the section that starts at the preceding active heading.
delimiters_2023 = [
    "A. INTRODUCTION TO OUR SERVICES",
    # "B. USING OUR SERVICES",
    "C. YOUR SUBMISSIONS TO OUR SERVICES",
    # "D. FAMILY SHARING",
    # "E. SERIES PASS AND MULTI-PASS",
    # "F. ADDITIONAL APP STORE TERMS (EXCLUDING APPLE ARCADE APPS)",
    "G. ADDITIONAL TERMS FOR CONTENT ACQUIRED FROM THIRD PARTIES",
    # "H. ADDITIONAL APPLE MUSIC TERMS",
    # "I. ADDITIONAL APPLE FITNESS+ TERMS",
    # "J. CARRIER MEMBERSHIP",
    # "K. MISCELLANEOUS TERMS APPLICABLE TO ALL SERVICES",
]
# I've manually split these into lengths of around 2500 - 3000 words

In [187]:
# Display the active 2023 delimiters to confirm which headings are in use.
delimiters_2023

['A. INTRODUCTION TO OUR SERVICES',
 'C. YOUR SUBMISSIONS TO OUR SERVICES',
 'G. ADDITIONAL TERMS FOR CONTENT ACQUIRED FROM THIRD PARTIES']

In [190]:
# Carve the 2023 text into three parts, using consecutive active headings
# as the opening and closing boundaries.
a_section_23 = string_between_substrings(raw_text_2023, *delimiters_2023[0:2])
b_section_23 = string_between_substrings(raw_text_2023, *delimiters_2023[1:3])
# The last part has no closing heading.
c_section_23 = string_between_substrings(raw_text_2023, delimiters_2023[2], end=None)

In [191]:
# Whitespace-delimited word count of each 2023 section.
print(
    f"""A Section length: {len(a_section_23.split())} words,
      B Section length: {len(b_section_23.split())} words,
      C Section length: {len(c_section_23.split())} words """
)

A Section length: 2654 words,
      B Section length: 2996 words,
      C Section length: 2692 words 


In [193]:
# Wrap each 2023 section's text as documents, then pass those documents through the
# splitter so every piece handed to the chain respects the configured chunk size.
# NOTE(review): `create_documents` appears to chunk the text already, so the second
# pass may be redundant — confirm against the LangChain splitter documentation.
a_doc_2023, b_doc_2023, c_doc_2023 = (
    text_splitter.create_documents([section_text])
    for section_text in (a_section_23, b_section_23, c_section_23)
)

a_split_2023, b_split_2023, c_split_2023 = (
    text_splitter.split_documents(section_docs)
    for section_docs in (a_doc_2023, b_doc_2023, c_doc_2023)
)

In [194]:
# Summarise each 2023 section with the chain currently bound to `map_reduce_chain`.
output_map_reduce_23_a = map_reduce_chain.run(a_split_2023)
output_map_reduce_23_b = map_reduce_chain.run(b_split_2023)
output_map_reduce_23_c = map_reduce_chain.run(c_split_2023)

In [195]:
# Stitch the three 2023 section summaries together, one per line, in order.
sections_2023_summary = "\n".join(
    [output_map_reduce_23_a, output_map_reduce_23_b, output_map_reduce_23_c]
)

In [196]:
# Persist the combined 2023 section summaries. The encoding is pinned to UTF-8 so that
# non-ASCII characters in the model output cannot fail (or be written differently)
# under a platform-specific default encoding.
with open(
    Path(SUMMARY_SAVE_DIR, "2023_sections_summary.txt"), "w", encoding="utf-8"
) as f:
    f.write(sections_2023_summary)