# Plan

1. Use GPT to summarise full doc with map-reduce method
2. Use Davinci to summarise full doc with map-reduce method
3. Use GPT to summarise sections by title
4. Try Vector Summarisation

In [1]:
from pathlib import Path
from dotenv import load_dotenv
from langchain.document_loaders import Docx2txtLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from typing import Optional
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from typing import Optional, Union
from langchain.schema import Document

from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import plotly.graph_objects as go
from langchain.chains.summarize import load_summarize_chain

# Data Science
import numpy as np
from sklearn.cluster import KMeans

In [2]:
# Load variables from a local .env file into the process environment.
# Presumably this is where the OpenAI API key used by the clients below comes from — TODO confirm.
load_dotenv()

True

# Load Documents

In [3]:
# Directory that the generated summary text files are written to.
SUMMARY_SAVE_DIR = Path("data", "summaries")

In [4]:
# Locations of the two raw Word documents (January 2015 and March 2023 versions).
DATA_DIR = Path("data") / "raw"
toc_2015_fname = DATA_DIR / "Jan 2015.docx"
toc_2023_fname = DATA_DIR / "Mar 2023.docx"

In [5]:
# One loader per document; the paths are converted with str() because the loader needs a str.
loader_2015 = Docx2txtLoader(str(toc_2015_fname))
loader_2023 = Docx2txtLoader(str(toc_2023_fname))

# Read both documents into LangChain Document lists.
data_2015 = loader_2015.load()
data_2023 = loader_2023.load()

# Map reduce template

In [14]:
# Map step: prompt run against each chunk on its own; the chunk is substituted for {docs}.
map_template = """
Write a concise summary of the following:
"{docs}"
CONCISE SUMMARY:
"""
map_prompt = PromptTemplate.from_template(map_template)
# Reduce step: prompt run against the combined chunk summaries, substituted for
# {doc_summaries}; it asks for the final summary as bullet points.
reduce_template = """Write a concise summary of the following text delimited by triple backquotes.
Return your response in bullet points which covers the key points of the text.
```{doc_summaries}```
BULLET POINT SUMMARY:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

# Whole Document - GPT

In [7]:
# Chat model client used for the GPT map-reduce summaries.
gpt4 = ChatOpenAI(model_name="gpt-4")
# gpt-4 has a context window of up to 8,192 tokens

In [8]:
# Token-based splitter: chunk_size of 4000 tokens, no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=0,
    chunk_size=4000,
)
split_2015 = text_splitter.split_documents(data_2015)
split_2023 = text_splitter.split_documents(data_2023)

# Token count of every chunk, as measured by the gpt4 client.
token_lengths_2015 = [gpt4.get_num_tokens(chunk.page_content) for chunk in split_2015]
token_lengths_2023 = [gpt4.get_num_tokens(chunk.page_content) for chunk in split_2023]

print(
    f"2015 T&Cs has been split into {len(split_2015)} docs. Made up of {token_lengths_2015} length tokens."
)
print(
    f"2023 T&Cs has been split into {len(split_2023)} docs.Made up of {token_lengths_2023} length tokens."
)

2015 T&Cs has been split into 5 docs. Made up of [3693, 3626, 3732, 3771, 3738] length tokens.
2023 T&Cs has been split into 3 docs.Made up of [3601, 3479, 3199] length tokens.


## Map Reduce Chain

In [9]:
# Map chain: runs the map prompt through gpt4 (applied per chunk by MapReduceDocumentsChain below).
map_chain = LLMChain(llm=gpt4, prompt=map_prompt)

# Reduce chain: runs the reduce prompt through gpt4 (nothing is executed in this cell).

reduce_chain = LLMChain(llm=gpt4, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is the final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # Used if the documents exceed the context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combines documents by mapping a chain over them, then combining the results.
# The chain keeps the LLMChain objects passed in here; rebinding the names
# `map_chain` / `reduce_chain` later does not change what this chain calls.
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # False: the results of the individual map steps are not included in the output
    return_intermediate_steps=False,
)

In [11]:
# The map-reduce runs call the OpenAI API, so they are skipped unless this flag is set to True.
# When skipped, `output_map_reduce_15` / `output_map_reduce_23` are not defined by this cell.
RUN_MAP_REDUCE_GPT = False  # Flag to reduce running time + $$
if RUN_MAP_REDUCE_GPT:
    output_map_reduce_15 = map_reduce_chain.run(split_2015)
    output_map_reduce_23 = map_reduce_chain.run(split_2023)

In [21]:
# Write the GPT-4 map-reduce summaries to disk.
# Saving is only attempted when the summaries were generated (RUN_MAP_REDUCE_GPT);
# otherwise the output variables do not exist and the writes would raise NameError.
SAVE = False
if SAVE and RUN_MAP_REDUCE_GPT:
    # Make sure the target folder exists before opening files inside it.
    SUMMARY_SAVE_DIR.mkdir(parents=True, exist_ok=True)
    with open(Path(SUMMARY_SAVE_DIR, "gpt4_map_reduce_summarized_2015.txt"), "w") as f:
        f.write(output_map_reduce_15)
    with open(Path(SUMMARY_SAVE_DIR, "gpt4_map_reduce_summarized_2023.txt"), "w") as f:
        f.write(output_map_reduce_23)

In [13]:
# Report the size of each summary in whitespace-separated words (only when they were generated).
if RUN_MAP_REDUCE_GPT:
    print(f"2015 map-reduce summary contains {len(output_map_reduce_15.split())} words")
    print(f"2023 map-reduce summary contains {len(output_map_reduce_23.split())} words")

# Whole Document - Davinci

In [14]:
# Re-split both documents with a chunk_size of 2000 tokens for the davinci run.
# Note: this rebinds `text_splitter`, `split_2015` and `split_2023` from the GPT section.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=2000, chunk_overlap=0
)
split_2015 = text_splitter.split_documents(data_2015)
split_2023 = text_splitter.split_documents(data_2023)

In [15]:
# Completion model client: temperature 0, completions limited to 1000 tokens.
davinci = OpenAI(temperature=0, model="text-davinci-003", max_tokens=1000)

## Map Reduce Chain

In [16]:
# LLMChains for the map and reduce prompts, backed by davinci.
# NOTE(review): assigning new objects to `map_chain` / `reduce_chain` does not alter the
# `map_reduce_chain` built earlier, which still holds the chains it was constructed with.
map_chain = LLMChain(llm=davinci, prompt=map_prompt)
reduce_chain = LLMChain(llm=davinci, prompt=reduce_prompt)

In [19]:
RUN_MAP_REDUCE_DAVINCI = False  # Flag to reduce running time + $$
if RUN_MAP_REDUCE_DAVINCI:
    # `map_reduce_chain` was assembled from LLMChains that wrap gpt4, and rebinding the
    # names `map_chain` / `reduce_chain` does not change the objects it already holds.
    # Build a dedicated chain around davinci so these summaries really come from davinci;
    # `map_reduce_chain` itself is left untouched for the cells that rely on it.
    davinci_combine_chain = StuffDocumentsChain(
        llm_chain=LLMChain(llm=davinci, prompt=reduce_prompt),
        document_variable_name="doc_summaries",
    )
    davinci_map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=LLMChain(llm=davinci, prompt=map_prompt),
        reduce_documents_chain=ReduceDocumentsChain(
            combine_documents_chain=davinci_combine_chain,
            collapse_documents_chain=davinci_combine_chain,
            # Matches the 2000-token chunk size used for davinci. Assumes the model's
            # context must also fit the prompt and the 1000-token completion — TODO confirm.
            token_max=2000,
        ),
        document_variable_name="docs",
        return_intermediate_steps=False,
    )
    output_map_reduce_davinci_15 = davinci_map_reduce_chain.run(split_2015)
    output_map_reduce_davinci_23 = davinci_map_reduce_chain.run(split_2023)

In [22]:
# Write the davinci map-reduce summaries to disk.
# Saving is only attempted when the summaries were generated (RUN_MAP_REDUCE_DAVINCI);
# otherwise the output variables do not exist and the writes would raise NameError.
if SAVE and RUN_MAP_REDUCE_DAVINCI:
    # Make sure the target folder exists before opening files inside it.
    SUMMARY_SAVE_DIR.mkdir(parents=True, exist_ok=True)
    with open(
        Path(SUMMARY_SAVE_DIR, "davinci_map_reduce_summarized_2015.txt"), "w"
    ) as f:
        f.write(output_map_reduce_davinci_15)
    with open(
        Path(SUMMARY_SAVE_DIR, "davinci_map_reduce_summarized_2023.txt"), "w"
    ) as f:
        f.write(output_map_reduce_davinci_23)

In [23]:
# Report the size of each davinci summary in whitespace-separated words (only when generated).
if RUN_MAP_REDUCE_DAVINCI:
    print(
        f"2015 davinci map-reduce summary contains {len(output_map_reduce_davinci_15.split())} words"
    )
    print(
        f"2023 davinci map-reduce summary contains {len(output_map_reduce_davinci_23.split())} words"
    )

# Summarising by section

Manually splitting the documents into sections that make sense to a human before summarising. The hope is that more of the key information will be retained; however, this approach would be unsustainable for very long corpora.

## Splitting docs by sections

In [24]:
# Full text of each document, taken from the first Document returned by the loader.
raw_text_2015 = data_2015[0].page_content
raw_text_2023 = data_2023[0].page_content

In [25]:
# Whitespace-separated word counts of the full documents.
print(f"2015 T&S contain {len(raw_text_2015.split())} words")
print(f"2023 T&S contain {len(raw_text_2023.split())} words")

2015 T&S contain 15399 words
2023 T&S contain 8386 words


## GPT

### 2015 Sections

In [26]:
# Section headings of the 2015 document, in order; used as cut points when splitting the text.
delimiters_2015 = [
    "A. ITUNES STORE, MAC APP STORE, APP STORE AND IBOOKS STORE TERMS OF SALE",
    "B. ITUNES STORE TERMS AND CONDITIONS",
    "C. MAC APP STORE, APP STORE AND IBOOKS STORE TERMS AND CONDITIONS",
]

In [27]:
# Drop the first 200 characters of the 2015 document: that initial text matches the section
# delimiters and would otherwise be picked up as a cut point. The slice is taken from the
# loaded document rather than from `raw_text_2015` itself, so re-running this cell does not
# strip a further 200 characters each time.
raw_text_2015 = data_2015[0].page_content[200:]

In [28]:
def string_between_substrings(input_str: str, start: str, end: Optional[str]) -> str:
    """Return the part of ``input_str`` that lies between two marker substrings.

    Args:
        input_str: Text to search.
        start: Marker that opens the wanted section; its first occurrence is used.
        end: Marker that closes the section, or ``None`` to take everything up to the
            end of the text. If ``end`` does not occur after ``start``, everything
            after ``start`` is returned.

    Returns:
        The text after the first ``start`` and before the next ``end``; neither marker
        is included.

    Raises:
        ValueError: If ``start`` does not occur in ``input_str``.
    """
    # partition() cuts at the first occurrence only, so a later repeat of `start`
    # inside the section does not truncate the result (as split(start)[1] would).
    _, found, remainder = input_str.partition(start)
    if not found:
        raise ValueError(f"start marker {start!r} not found in input text")
    if end is None:
        return remainder
    return remainder.partition(end)[0]

In [29]:
# Section A runs from heading A to heading B, section B from heading B to heading C,
# and section C from heading C to the end of the text.
a_section = string_between_substrings(
    input_str=raw_text_2015, start=delimiters_2015[0], end=delimiters_2015[1]
)
b_section = string_between_substrings(
    input_str=raw_text_2015, start=delimiters_2015[1], end=delimiters_2015[2]
)
c_section = string_between_substrings(
    input_str=raw_text_2015, start=delimiters_2015[2], end=None
)

In [30]:
# Whitespace-separated word count of each extracted 2015 section.
print(
    f"""A Section length: {len(a_section.split())} words,
      B Section length: {len(b_section.split())} words,
      C Section length: {len(c_section.split())} words """
)

A Section length: 2280 words,
      B Section length: 5186 words,
      C Section length: 7767 words 


In [31]:
# Back to a chunk_size of 4000 tokens for the section-by-section GPT run (rebinds `text_splitter`).
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000, chunk_overlap=0
)

In [152]:
# Wrap each section's text in Document objects ...
a_doc_2015, b_doc_2015, c_doc_2015 = (
    text_splitter.create_documents([section_text])
    for section_text in (a_section, b_section, c_section)
)

# ... then run the resulting documents through the splitter.
a_split_2015, b_split_2015, c_split_2015 = (
    text_splitter.split_documents(section_docs)
    for section_docs in (a_doc_2015, b_doc_2015, c_doc_2015)
)

In [None]:
# LLMChains for the map and reduce prompts, backed by gpt4.
# NOTE(review): rebinding `map_chain` / `reduce_chain` does not rebuild `map_reduce_chain`;
# the section runs below use whatever chain object `map_reduce_chain` currently refers to.
map_chain = LLMChain(llm=gpt4, prompt=map_prompt)
reduce_chain = LLMChain(llm=gpt4, prompt=reduce_prompt)

In [34]:
# Summarise each 2015 section separately, then stitch the three summaries together.
# Both steps are opt-in because the runs call the paid API.
RUN_SECTIONS_MAP_REDUCE = False
if RUN_SECTIONS_MAP_REDUCE:
    output_map_reduce_15_a = map_reduce_chain.run(a_split_2015)
    output_map_reduce_15_b = map_reduce_chain.run(b_split_2015)
    output_map_reduce_15_c = map_reduce_chain.run(c_split_2015)

    # One section summary per line-separated block, in A, B, C order.
    sections_2015_summary = "\n".join(
        [output_map_reduce_15_a, output_map_reduce_15_b, output_map_reduce_15_c]
    )
SAVE_SECTIONS_MAP_REDUCE = False
if SAVE_SECTIONS_MAP_REDUCE and RUN_SECTIONS_MAP_REDUCE:
    Path(SUMMARY_SAVE_DIR, "2015_sections_summary.txt").write_text(
        sections_2015_summary
    )

### 2023 Sections

In [35]:
# Headings of the 2023 document used as cut points. Only A, C and G are active; the
# commented-out headings are not used as cut points, so their text stays inside the
# preceding active section.
delimiters_2023 = [
    "A. INTRODUCTION TO OUR SERVICES",
    # "B. USING OUR SERVICES",
    "C. YOUR SUBMISSIONS TO OUR SERVICES",
    # "D. FAMILY SHARING",
    # "E. SERIES PASS AND MULTI-PASS",
    # "F. ADDITIONAL APP STORE TERMS (EXCLUDING APPLE ARCADE APPS)",
    "G. ADDITIONAL TERMS FOR CONTENT ACQUIRED FROM THIRD PARTIES",
    # "H. ADDITIONAL APPLE MUSIC TERMS",
    # "I. ADDITIONAL APPLE FITNESS+ TERMS",
    # "J. CARRIER MEMBERSHIP",
    # "K. MISCELLANEOUS TERMS APPLICABLE TO ALL SERVICES",
]
# The active headings were chosen by hand so that each resulting section is around 2500 - 3000 words

In [36]:
# Display the active cut points.
delimiters_2023

['A. INTRODUCTION TO OUR SERVICES',
 'C. YOUR SUBMISSIONS TO OUR SERVICES',
 'G. ADDITIONAL TERMS FOR CONTENT ACQUIRED FROM THIRD PARTIES']

In [37]:
# Cut the 2023 text at the three active headings: first to second, second to third,
# and third to the end of the document.
a_section_23 = string_between_substrings(
    input_str=raw_text_2023, start=delimiters_2023[0], end=delimiters_2023[1]
)
b_section_23 = string_between_substrings(
    input_str=raw_text_2023, start=delimiters_2023[1], end=delimiters_2023[2]
)
c_section_23 = string_between_substrings(
    input_str=raw_text_2023, start=delimiters_2023[2], end=None
)

In [38]:
# Whitespace-separated word count of each extracted 2023 section.
print(
    f"""A Section length: {len(a_section_23.split())} words,
      B Section length: {len(b_section_23.split())} words,
      C Section length: {len(c_section_23.split())} words """
)

A Section length: 2654 words,
      B Section length: 2996 words,
      C Section length: 2692 words 


In [39]:
# Wrap each 2023 section's text in Document objects ...
a_doc_2023, b_doc_2023, c_doc_2023 = (
    text_splitter.create_documents([section_text])
    for section_text in (a_section_23, b_section_23, c_section_23)
)

# ... then run the resulting documents through the splitter.
a_split_2023, b_split_2023, c_split_2023 = (
    text_splitter.split_documents(section_docs)
    for section_docs in (a_doc_2023, b_doc_2023, c_doc_2023)
)

In [41]:
# Summarise each 2023 section separately, then stitch the three summaries together.
if RUN_SECTIONS_MAP_REDUCE:
    output_map_reduce_23_a = map_reduce_chain.run(a_split_2023)
    output_map_reduce_23_b = map_reduce_chain.run(b_split_2023)
    output_map_reduce_23_c = map_reduce_chain.run(c_split_2023)

    # Newline-separated, in section order.
    sections_2023_summary = "\n".join(
        [output_map_reduce_23_a, output_map_reduce_23_b, output_map_reduce_23_c]
    )

In [42]:
# Persist the combined 2023 section summary, but only if it was generated and saving is enabled.
if SAVE_SECTIONS_MAP_REDUCE and RUN_SECTIONS_MAP_REDUCE:
    Path(SUMMARY_SAVE_DIR, "2023_sections_summary.txt").write_text(
        sections_2023_summary
    )

# Vectors

An interesting approach to summarizing large documents, inspired by this blog post:

https://pashpashpash.substack.com/p/tackling-the-challenge-of-document

In [7]:
def summarize_with_vectors(
    model_name: str,
    document_split: list[Document],
    map_prompt: PromptTemplate,
    reduce_prompt: PromptTemplate,
    num_clusters: int,
    save_dir: Optional[Union[Path, str]] = None,
) -> str:
    """Summarise a document from a few representative chunks chosen by clustering.

    Every chunk is embedded, the embeddings are clustered with KMeans, and for each
    cluster the chunk nearest to the centroid is picked. The picked chunks are
    summarised one by one with ``map_prompt``; the joined summaries are then condensed
    with ``reduce_prompt``.

    Args:
        model_name: Name of the chat model passed to ``ChatOpenAI``.
        document_split: Chunks of the document to summarise.
        map_prompt: Prompt used to summarise a single chunk.
        reduce_prompt: Prompt used to condense the joined chunk summaries.
        num_clusters: Number of KMeans clusters, i.e. the maximum number of chunks
            that get summarised.
        save_dir: Optional path of the file the final summary is written to (despite
            the name it is used as a file path, not a directory).

    Returns:
        The final summary text produced by the reduce step.
    """
    llm = ChatOpenAI(model_name=model_name)
    embeddings = OpenAIEmbeddings()
    vectors = np.array(
        embeddings.embed_documents([x.page_content for x in document_split])
    )
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init="auto").fit(
        vectors
    )

    # For every centroid, find the chunk whose embedding lies closest to it. A set is
    # used because two centroids can share the same nearest chunk, and summarising that
    # chunk twice would only add cost and duplicate content.
    closest_indices = {
        int(np.argmin(np.linalg.norm(vectors - centre, axis=1)))
        for centre in kmeans.cluster_centers_
    }
    # Sorted so that the selected chunks keep their order of appearance in the document.
    selected_indices = sorted(closest_indices)
    selected_docs = [document_split[idx] for idx in selected_indices]

    map_chain = LLMChain(llm=llm, prompt=map_prompt)
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

    # Summarise each selected chunk. The chunk *text* is passed: handing the chain a
    # list of Document objects would put their repr into the prompt, not the content.
    summary_list = [map_chain.run(doc.page_content) for doc in selected_docs]

    # Condense the individual summaries into the final answer.
    output = reduce_chain.run("\n".join(summary_list))

    if save_dir is not None:
        save_path = Path(save_dir)
        # Create the target folder if needed so that the write cannot fail on it.
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with open(save_path, "w") as f:
            f.write(output)
    return output

In [8]:
def plot_tsne(
    document_split: list[Document],
    num_clusters: int,
    perplexity: int,
    plot_kwargs: dict,
    save_dir: Optional[Union[Path, str]] = None,
):
    """Plot a 2-D t-SNE projection of the chunk embeddings, coloured by KMeans cluster.

    Args:
        document_split: Chunks of the document to embed and plot.
        num_clusters: Number of KMeans clusters used for colouring.
        perplexity: Perplexity passed to ``TSNE``.
        plot_kwargs: Plot options; the ``"title"`` key is required and sets the title.
        save_dir: Optional output path (``str`` or ``Path``). Any final suffix is
            dropped and the figure is saved next to it as both ``.html`` and ``.png``.

    Returns:
        None. The figure is shown, and written to disk when ``save_dir`` is given.
    """
    colours = [
        "maroon",
        "purple",
        "green",
        "blue",
        "yellow",
        "black",
        "aqua",
        "coral",
        "darkblue",
        "darkgreen",
        "darkmagenta",
        "darkslateblue",
        "deeppink",
        "dimgrey",
        "indianred",
        "mediumslateblue",
        "olivedrab",
        "orangered",
        "palevioletred",
    ]

    embeddings = OpenAIEmbeddings()
    vectors = np.array(
        embeddings.embed_documents([x.page_content for x in document_split])
    )
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    reduced_data_tsne = tsne.fit_transform(vectors)
    # Clustering is done on the original embeddings, not on the 2-D projection.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init="auto").fit(
        vectors
    )
    # Cycle through the palette so that asking for more clusters than there are
    # colours re-uses colours instead of failing on a missing key.
    kmeans_colours = [colours[label % len(colours)] for label in kmeans.labels_]

    fig = go.Figure(
        data=go.Scatter(
            x=reduced_data_tsne[:, 0],
            y=reduced_data_tsne[:, 1],
            marker_color=kmeans_colours,
            hovertext=kmeans.labels_,
            mode="markers",
            name="TSNE Cluster",
        )
    )
    fig.update_layout(
        template="simple_white",
        title=plot_kwargs["title"],
        xaxis_title="TSNE 1",
        yaxis_title="TSNE 2",
    )
    if save_dir is not None:
        # Accept str as well as Path, as the signature promises.
        target = Path(save_dir)
        # Drop the final suffix (if any); parent/stem equals the path itself when
        # there is no suffix.
        base = target.parent / target.stem
        base.parent.mkdir(parents=True, exist_ok=True)
        save_html = str(base) + ".html"
        save_png = str(base) + ".png"
        # Written as UTF-8 so the result does not depend on the platform default encoding.
        with open(save_html, "w", encoding="utf-8") as f:
            f.write(fig.to_html())
        fig.write_image(save_png)
    fig.show()

## 2015

In [9]:
# Smaller chunks for the embedding approach: chunk_size of 750 tokens with a 50-token overlap.
# Note: this rebinds `text_splitter` and `split_2015` from the earlier sections.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=750, chunk_overlap=50
)
split_2015 = text_splitter.split_documents(data_2015)

In [10]:
# Number of chunks the 2015 document was split into.
print(len(split_2015))

30


In [11]:
# Plot the 2015 chunk embeddings and save the figure under figures/.
# NOTE(review): the plot clusters into 10 groups while the summarisation cell below uses
# num_clusters=5, so the plot does not show the clusters used for the summary — confirm intent.
tsne_2015_fname = Path("figures", "tsne_2015_fig.html")
plot_kwargs_2015 = {"title": "2015 Contract TSNE Plot"}
plot_tsne(
    split_2015,
    num_clusters=10,
    perplexity=20,
    plot_kwargs=plot_kwargs_2015,
    save_dir=tsne_2015_fname,
)

In [15]:
vector_2015_summary_fname = Path("data") / "summaries" / "vector_2015_summary.txt"
# The summary file doubles as a cache: the LLM calls are skipped when it already exists.
if not vector_2015_summary_fname.exists():
    summarize_with_vectors(
        document_split=split_2015,
        model_name="gpt-3.5-turbo",
        num_clusters=5,
        map_prompt=map_prompt,
        reduce_prompt=reduce_prompt,
        save_dir=vector_2015_summary_fname,
    )

## 2023

In [16]:
# Split the 2023 document with the current `text_splitter` (rebinds `split_2023`).
split_2023 = text_splitter.split_documents(data_2023)

In [17]:
# Number of chunks the 2023 document was split into.
print(len(split_2023))

17


In [19]:
# Plot the 2023 chunk embeddings and save the figure under figures/.
# NOTE(review): the plot clusters into 10 groups while the summarisation cell below uses
# num_clusters=5, so the plot does not show the clusters used for the summary — confirm intent.
tsne_2023_fname = Path("figures", "tsne_2023_fig.html")
plot_kwargs_2023 = {"title": "2023 Contract TSNE Plot"}
plot_tsne(
    split_2023,
    num_clusters=10,
    perplexity=15,
    plot_kwargs=plot_kwargs_2023,
    save_dir=tsne_2023_fname,
)

In [20]:
vector_2023_summary_fname = Path("data") / "summaries" / "vector_2023_summary.txt"
# The summary file doubles as a cache: the LLM calls are skipped when it already exists.
if not vector_2023_summary_fname.exists():
    summarize_with_vectors(
        document_split=split_2023,
        model_name="gpt-3.5-turbo",
        num_clusters=5,
        map_prompt=map_prompt,
        reduce_prompt=reduce_prompt,
        save_dir=vector_2023_summary_fname,
    )