In [1]:
# Set up vector store
import chromadb
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import SentenceTransformerEmbeddings

persistent_client = chromadb.PersistentClient()

collection = persistent_client.get_or_create_collection("embeddings_test")
display(collection.count())
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

2908

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# READ PDFs
import spacy
import re
import os
import uuid

from langchain.text_splitter import RecursiveCharacterTextSplitter

# nlp = spacy.load("en_core_web_sm")
# pdfs = os.listdir('pdfs')
# pdfs = sorted(pdfs)

pdfs = ['London_Borough_of_Southwark',
        'London_Borough_of_Tower_Hamlets', 'London_Borough_of_Islington']


for pdf in pdfs[0:3]:
    with open('txts/' + pdf + '.txt') as f:
        string = f.read()
        cleaner_string = string.replace('\n', ' ').replace('\r', '')

        clean_string = re.sub("\s\s+", " ", cleaner_string)

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            length_function=len,
            add_start_index=True,
        )

        # Split text
        split_texts = text_splitter.create_documents([string])
        split_texts_list = [str(txt.page_content) for txt in split_texts]
        display(len(split_texts_list))

        # Embed text
        embedded_texts = embedding_model.embed_documents(
            texts=split_texts_list)

        # add vectors to collection
        ids = [str(uuid.uuid4()) for sent in split_texts_list]
        metadatas = [{"LPA": pdf}
                     for sent in split_texts_list]
        collection.add(
            embeddings=embedded_texts,
            documents=split_texts_list,
            ids=ids,
            metadatas=metadatas
        )


len(clean_string)

973

InvalidCollectionException: Collection 69bd7fc7-e8fc-45f7-9ce4-d4a95d83317e does not exist.

In [3]:
display(collection.count())

2908

In [9]:
# Create context string
query = embedding_model.embed_documents(
    ["Bike transport"],
)

southwark_results = collection.query(
    query_embeddings=query,
    where={"LPA": "London_Borough_of_Southwark"},
    n_results=4,
)

southwark_results_string = '\n\n'.join(southwark_results['documents'][0][0:4])

th_results = collection.query(
    query_embeddings=query,
    where={"LPA": "London_Borough_of_Tower_Hamlets"},
    n_results=4,
)
th_results_string = '\n\n'.join(th_results['documents'][0][0:4])


islington_results = collection.query(
    query_embeddings=query,
    where={"LPA": "London_Borough_of_Islington"},
    n_results=4,
)
islington_results_string = '\n\n'.join(islington_results['documents'][0][0:4])


context = '''
Context from Southwark: 
{southwark_results}


Context from Tower Hamlets: 
{th_results}


Context from Islington: 
{islington_results}


'''.format(southwark_results=southwark_results_string, th_results=th_results_string, islington_results=islington_results_string)

print(context)


Context from Southwark: 
and get access to health services, all within walking distance and very convenient for cycling. This will supportouraim to be an age friendly borough. These policies for improving town centres, building schools,and
providing the facilities for cycling and walking will address physical and mental health issuesto improve the
every day experiences of residents, workers and shoppers. We will also encourage permanent and temporary
community food growing opportunities, improve the quality of green spaces and parks, and ensure residents
have access to opportunities for free swimming and gym use, an extended bike hire and cycle network.
2. T he shift in transport modes from cars to walking and cycling responds to the Climate Emergency declared by
Southwark by reducing car use and improving the health of local residents.
3. Health inequalities ar e avoidable, unfair and systematic differences in health between different groups of

existing and permitted development; an

In [11]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)

query_string = '''
Compare the way Southwark, Tower Hamlets and Islington approach to Bike transport

{context} 
'''. format(context=context)

result = llm.call_as_llm(query_string)

print('-------')

print(result)

-------
The three London boroughs of Southwark, Tower Hamlets, and Islington each have their own approaches to promoting bike transport and cycling infrastructure, reflecting their commitment to sustainable transport and the reduction of car dependency. Here's a comparison of their approaches based on the provided contexts:

**Southwark:**
- Southwark's approach emphasizes the creation of age-friendly environments, with a focus on improving town centers and providing facilities for cycling and walking.
- The borough aims to shift transport modes from cars to walking and cycling in response to the Climate Emergency, aiming to reduce car use and improve residents' health.
- Southwark's policies aim to address health inequalities by improving everyday experiences through better access to green spaces, free swimming, gym use, and an expanded bike hire and cycle network.
- The borough plans to improve accessibility to public transport and enhance walking and cycling connections to public tr

In [None]:
from langchain.chat_models import ChatOpenAI

topic = "Driving, transport, cycling"

query = embedding_model.embed_documents(
    [topic]
)
results = collection.query(
    query_embeddings=query,
    n_results=10,
    
)

context = '\n\n'.join(results['documents'][0][0:10])

print(context)


Context: {context} 
Answer:'''.format(question=question, context=context)

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

result = llm.call_as_llm(query_string)

print('-------')

print(result)

In [8]:

query_string = '''You are a driver, campaigning for better driving infrastructure. What do you think of the following context?'''

query_string = '''
You are a cyclist, campaigning for better cycling infrastructure. 

A driver has stated: 
{result}

Considering context: 
{context}

What would you respond?


'''. format(result=result, context=context)


llm.call_as_llm(query_string)

NameError: name 'result' is not defined

In [62]:
query_string = '''
You are a driver, campaigning for better driver infrastructure. 

A cyclist has stated: 
{result}

Considering context: 
{context}

What would you respond?


'''. format(result=result, context=context)


llm.call_as_llm(query_string)

'As a driver campaigning for better driving infrastructure, I understand the importance of promoting sustainable travel options such as cycling, walking, and public transport. These modes of transportation have numerous benefits for the health and well-being of residents, employees, and visitors. However, it is crucial to ensure that the needs of drivers are also considered and improvements to driving infrastructure are included in the plans.\n\nWhile prioritizing the needs of pedestrians and cyclists and integrating development with public transport are important goals, it is essential to strike a balance that also addresses the needs of drivers. Measures that reduce road space and potentially inconvenience drivers should be carefully evaluated to ensure they do not create additional problems or hinder the efficient movement of vehicles.\n\nImproving driving infrastructure should be a priority alongside promoting sustainable travel options. This could involve expanding road capacity, 

In [42]:
prompt = hub.pull("rlm/rag-prompt")

print(prompt)

input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]


In [43]:
# splitting on markdown
from langchain.text_splitter import MarkdownHeaderTextSplitter

with open('markdown/hackney.md') as f:
    string = f.read()
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    # MD splits
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(string)

    strings = []

    for split in md_header_splits:

        cleaner_string = split.page_content.replace(
            '\n', ' ').replace('\r', '')

        clean_string = re.sub("\s\s+", " ", cleaner_string)
        strings.append(clean_string)

display(strings)

You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.

Question: X 
Context: Y 
Answer:
