# Neighborhood Description Generation

Because it is a requirement, we use LangChain with GPT-3.5 to generate descriptions of each neighborhood programmatically, and we use async generation to speed it up.

In [None]:
!pip install -U langchain-openai python-dotenv

In [1]:
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv,find_dotenv
import pandas as pd
import numpy as np
import asyncio
import glob
import os

Let's first test that we can load the documents, and display a few of the unique neighborhoods we will generate descriptions for.

In [2]:
# Read every file found in the documents folder as an Excel sheet and stack them into one frame.
document_frames = []
for document_path in glob.glob('documents/*'):
    document_frames.append(pd.read_excel(document_path))
df = pd.concat(document_frames)

In [3]:
# Distinct neighborhood names as a plain Python list.
neighborhood_column = df['neighborhood']
neighborhoods = neighborhood_column.unique().tolist()

In [4]:
# Peek at three randomly drawn neighborhood names.
sampled_names = np.random.choice(neighborhoods, size=3)
sampled_names.tolist()

['Guarumal', 'El Bosque', 'Betania']

Now let's play with GPT and try to build a prompt format that we can reuse.

In [5]:
# Load environment variables from the .env file located by find_dotenv()
# (presumably the OpenAI credentials live there - confirm).
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Chat model settings used for the interactive experiments below.
kwargs = {
    'model': 'gpt-3.5-turbo',
    'temperature': 0.7,
    'max_tokens': 300,
}
chat_llm = ChatOpenAI(**kwargs)

In [6]:
# Reusable Spanish prompt; {neighborhood} is its only placeholder.
# Adjacent string literals are concatenated, so the text is one single line.
template = (
    "Describe como es {neighborhood} en Panamá como vecindad para vivir, resume y lista con nombres de sitios "
    "los más prominente para fiestas, hospitales cercanos, accesos a autopista, parques, supermercados, centros deportivos y "
    "gimnasio, no mas de 200 palabras."
)
prompt = PromptTemplate(template=template, input_variables=['neighborhood'])

In [7]:
# Render the template for one example neighborhood to inspect the final prompt text.
example_neighborhood = 'San Carlos'
prompt_format = prompt.format(neighborhood=example_neighborhood)
prompt_format

'Describe como es San Carlos en Panamá como vecindad para vivir, resume y lista con nombres de sitios los más prominente para fiestas, hospitales cercanos, accesos a autopista, parques, supermercados, centros deportivos y gimnasio, no mas de 200 palabras.'

In [8]:
# Conversation sent to the chat model: a system persona followed by the rendered user prompt.
# The run of spaces after "sitios" is part of the original text and is kept as-is.
system_text = (
    "Eres un vendedor de bienes raices, tu trabajo es encantar con palabras y demostrar las bondades de los sitios"
    "    no solo como experiencia de vida, sino un lugar para establecerse."
)
human_text = prompt.format(neighborhood=neighborhoods[2])
messages = [SystemMessage(content=system_text), HumanMessage(content=human_text)]

Test our async generation. If you implement it with `invoke`, the call blocks your async routine; that's why we use `agenerate`.

In [11]:
print("=== Query ===")
print(' '.join([msg.content for msg in messages]))
print("=== Chat Response ===")
#response = chat_llm.invoke(messages)
response = await chat_llm.agenerate([messages])
print(response.generations[0][0].text)

=== Query ===
Eres un vendedor de bienes raices, tu trabajo es encantar con palabras y demostrar las bondades de los sitios    no solo como experiencia de vida, sino un lugar para establecerse. Describe como es Bella Vista en Panamá como vecindad para vivir, resume y lista con nombres de sitios los más prominente para fiestas, hospitales cercanos, accesos a autopista, parques, supermercados, centros deportivos y gimnasio, no mas de 200 palabras.
=== Chat Response ===
Bella Vista en Panamá es una vecindad vibrante y emocionante para vivir, ideal para aquellos que buscan comodidad y conveniencia. Con una ubicación céntrica y una amplia gama de servicios cercanos, es el lugar perfecto para establecerse.

Para aquellos que disfrutan de la vida nocturna, Bella Vista ofrece una variedad de opciones para fiestas, como bares y discotecas como El Sótano, Mojitos sin Mojitos y La Rana Dorada. Los hospitales cercanos incluyen el Hospital Nacional y el Hospital Santo Tomás, garantizando atención m

Let's now test an async generation routine to understand the behaviour

In [12]:
async def execute_coroutine(n):
    """Simulate one slow LLM request.

    Prints a start banner for part ``n``, sleeps for a random whole number of
    seconds drawn with ``np.random.randint(10)``, prints a finish banner and
    returns a second ``np.random.randint(10)`` draw as the fake result.
    """
    start_banner = 'Multiprocessing Part {:02}'.format(n)
    print(start_banner)
    delay_seconds = np.random.randint(10)
    await asyncio.sleep(delay_seconds)  # stands in for the LLM response time
    finish_banner = 'Finished Part {:02}'.format(n)
    print(finish_banner)
    fake_result = np.random.randint(10)
    return fake_result

In [13]:
# Draw 20 random ids in [0, 100) and keep the sorted distinct values as the fake workload.
random_ids = np.random.randint(0, 100, size=20)
batch = np.unique(random_ids)
batch

array([ 6, 10, 11, 19, 22, 34, 44, 46, 64, 65, 67, 72, 73, 75, 78, 86, 91,
       94, 99])

In [15]:
BATCH_SIZE=5
results = list()
for mini_batch in range(0, len(batch), BATCH_SIZE):
    start = mini_batch
    end   = start + BATCH_SIZE
    values = batch[start:end]
    coroutines = [asyncio.create_task(execute_coroutine(i)) for i in values]
    done, pending = await asyncio.wait(coroutines)
    values = await asyncio.gather(*coroutines)
    results.extend(values)

Multiprocessing Part 06
Multiprocessing Part 10
Multiprocessing Part 11
Multiprocessing Part 19
Multiprocessing Part 22
Finished Part 19
Finished Part 10
Finished Part 22
Finished Part 06
Finished Part 11
Multiprocessing Part 34
Multiprocessing Part 44
Multiprocessing Part 46
Multiprocessing Part 64
Multiprocessing Part 65
Finished Part 46
Finished Part 64
Finished Part 44
Finished Part 34
Finished Part 65
Multiprocessing Part 67
Multiprocessing Part 72
Multiprocessing Part 73
Multiprocessing Part 75
Multiprocessing Part 78
Finished Part 72
Finished Part 67
Finished Part 75
Finished Part 73
Finished Part 78
Multiprocessing Part 86
Multiprocessing Part 91
Multiprocessing Part 94
Multiprocessing Part 99
Finished Part 91
Finished Part 86
Finished Part 94
Finished Part 99


In [16]:
# Show the collected results and compare their count against the size of `batch`.
results, len(results), len(batch)

([1, 8, 2, 1, 6, 0, 5, 3, 9, 8, 1, 4, 5, 1, 2, 8, 0, 1, 0], 19, 19)

In [17]:
async def invoke_llm(neighborhood):
    """Ask the chat model for a description of one neighborhood.

    Prints a start and a finish message around the request and returns the
    text of the first generation produced for the conversation.
    """
    print('Multiprocessing Neighborhood {}'.format(neighborhood))
    # The run of spaces after "sitios" is part of the original text and is kept as-is.
    system_text = (
        "Eres un vendedor de bienes raices, tu trabajo es encantar con palabras y demostrar las bondades de los sitios"
        "        no solo como experiencia de vida, sino un lugar para establecerse."
    )
    chat_messages = [
        SystemMessage(content=system_text),
        HumanMessage(content=prompt.format(neighborhood=neighborhood)),
    ]
    llm_result = await chat_llm.agenerate([chat_messages])
    print('{} Multiprocessing Finished'.format(neighborhood))
    first_generation = llm_result.generations[0][0]
    return first_generation.text

In [18]:
BATCH_SIZE=2
results = list()
for mini_batch in range(0, len(neighborhoods[:7]), BATCH_SIZE):
    start = mini_batch
    end   = start + BATCH_SIZE
    neighborhoods_mini_batch = neighborhoods[start:end] 
    coroutines = [asyncio.create_task(invoke_llm(neighborhood)) for neighborhood in neighborhoods_mini_batch]
    values = await asyncio.gather(*coroutines)
    results.extend(values)

Multiprocessing Neighborhood San Francisco
Multiprocessing Neighborhood Punta Chame
San Francisco Multiprocessing Finished
Punta Chame Multiprocessing Finished
Multiprocessing Neighborhood Bella Vista
Multiprocessing Neighborhood Santa María
Bella Vista Multiprocessing Finished
Santa María Multiprocessing Finished
Multiprocessing Neighborhood Avenida Balboa
Multiprocessing Neighborhood Costa Del Este
Costa Del Este Multiprocessing Finished
Avenida Balboa Multiprocessing Finished
Multiprocessing Neighborhood Parque Lefevre
Multiprocessing Neighborhood Punta Paitilla
Parque Lefevre Multiprocessing Finished
Punta Paitilla Multiprocessing Finished


In [22]:
# Pair every generated description with its neighborhood. The name column is sized from
# `results`, so both columns always have the same length; the column is spelled
# `neighborhood`, matching the column name used in the input documents.
pd.DataFrame(dict(neighborhood=neighborhoods[:len(results)], neighborhood_description=results))

Unnamed: 0,neibhorhood,neighborhood_description
0,San Francisco,San Francisco en Panamá es el lugar ideal para...
1,Punta Chame,Punta Chame en Panamá es un destino paradisíac...
2,Bella Vista,Bella Vista en Panamá es una vecindad vibrante...
3,Santa María,Santa María en Panamá es una vecindad exclusiv...
4,Avenida Balboa,"¡Bienvenido a Avenida Balboa, el lugar ideal p..."
5,Costa Del Este,Costa Del Este en Panamá es una vecindad exclu...
6,Parque Lefevre,Parque Lefevre en Panamá es un vecindario vibr...
7,Punta Paitilla,Punta Paitilla en Panamá es una vecindad exclu...


Now let's make a class to build everything in an ordered way and execute it.

In [23]:
%%writefile build_neighborhood_description.py
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv,find_dotenv
import logging
import pandas as pd
import numpy as np
import argparse
import asyncio
import glob
import sys
import os

# Emit INFO records so the per-neighborhood progress messages are actually shown;
# a bare basicConfig() leaves the root logger at WARNING, which hides them.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Load environment variables from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

class GPTDescriptor:
    """Generate neighborhood descriptions with a chat model through LangChain.

    The class reads tabular documents from a folder, builds the prompt and
    chat messages for a neighborhood, queries the chat model asynchronously
    and saves the collected descriptions.
    """

    # Prompt texts are built from adjacent string literals so that no source-code
    # indentation ends up inside the text sent to the model.
    PROMPT_TEMPLATE = (
        "Describe como es {neighborhood} en Panamá como vecindad para vivir, resume y lista con nombres de sitios "
        "los más prominente para fiestas, hospitales cercanos, accesos a autopista, parques, supermercados, "
        "centros deportivos y gimnasio, no mas de 200 palabras."
    )
    SYSTEM_MESSAGE = (
        "Eres un vendedor de bienes raices, tu trabajo es encantar con palabras y demostrar las bondades de los "
        "sitios no solo como experiencia de vida, sino un lugar para establecerse."
    )

    def __init__(self, documents_dir, input_format, model='gpt-3.5-turbo', temperature=0.7, max_tokens=300):
        """Store the input location and create the chat model and the prompt.

        Args:
            documents_dir: Folder that contains the input documents.
            input_format: File extension of the documents ('xlsx', 'csv' or 'parquet').
            model: Name of the chat model passed to ChatOpenAI.
            temperature: Sampling temperature passed to ChatOpenAI.
            max_tokens: Maximum number of tokens passed to ChatOpenAI.
        """
        self.documents_dir = documents_dir
        self.input_format  = input_format
        kwargs = dict(model=model, temperature=temperature, max_tokens=max_tokens)
        self.chat_llm = ChatOpenAI(**kwargs)
        self.prompt = self.build_prompt(input_variables=['neighborhood'])

    def list_files(self):
        """Return the paths in ``documents_dir`` that end with ``.<input_format>``.

        The paths are sorted so the processing order is the same on every run.
        """
        return sorted(glob.glob(f'{self.documents_dir}/*.{self.input_format}'))

    def read_documents(self, documents):
        """Read every document with the reader for ``input_format`` and concatenate them.

        Args:
            documents: Iterable of file paths.

        Returns:
            A single DataFrame with the rows of all documents.

        Raises:
            ValueError: If ``input_format`` is not 'xlsx', 'csv' or 'parquet'.
        """
        if self.input_format == 'xlsx':
            return pd.concat([pd.read_excel(doc) for doc in documents])
        if self.input_format == 'csv':
            return pd.concat([pd.read_csv(doc, header=0) for doc in documents])
        if self.input_format == 'parquet':
            return pd.concat([pd.read_parquet(doc) for doc in documents])
        # Fail loudly instead of returning None for an unknown format.
        raise ValueError(f'Unsupported input format: {self.input_format!r}')

    def build_prompt(self, input_variables):
        """Build the PromptTemplate for the neighborhood description request.

        Args:
            input_variables: Names of the placeholders used by the template.
        """
        return PromptTemplate(input_variables=input_variables, template=self.PROMPT_TEMPLATE)

    def build_message(self, neighborhood):
        """Return the chat messages (system persona + user prompt) for one neighborhood."""
        messages = [
            SystemMessage(content=self.SYSTEM_MESSAGE),
            HumanMessage(content=self.prompt.format(neighborhood=neighborhood))
        ]
        return messages

    async def invoke_llm(self, neighborhood):
        """Request the description of one neighborhood from the chat model.

        Returns:
            The text of the first generation produced for the conversation.
        """
        # Log through the module logger instead of the root logger.
        logger.info('Muliprocessing %s started', neighborhood)
        messages = self.build_message(neighborhood=neighborhood)
        response = await self.chat_llm.agenerate([messages])
        logger.info('%s Multiprocessing Finished', neighborhood)
        return response.generations[0][0].text

    def compile_and_save(self, neighborhood_dict, path, output_format='csv'):
        """Build a DataFrame from ``neighborhood_dict`` and write it to ``path``.

        Args:
            neighborhood_dict: Mapping of column name to list of values.
            path: Destination file path.
            output_format: One of 'xlsx', 'csv' or 'parquet'.

        Raises:
            ValueError: If ``output_format`` is not supported.
        """
        if output_format not in ('xlsx', 'csv', 'parquet'):
            # Fail loudly instead of silently writing nothing.
            raise ValueError(f'Unsupported output format: {output_format!r}')
        # The DataFrame constructor has no `axis` argument; passing one raised a
        # TypeError on the parquet path.
        frame = pd.DataFrame(neighborhood_dict)
        if output_format == 'xlsx':
            frame.to_excel(path, index=False, sheet_name='HomeMatch')
        elif output_format == 'csv':
            frame.to_csv(path, header=True, index=False)
        else:
            frame.to_parquet(path, index=False)

import time 

async def main(args):
    """Generate a description for every neighborhood found in the input documents.

    Reads the documents, collects the distinct neighborhoods, queries the chat
    model in mini-batches of ``args.batch_size`` concurrent requests and saves
    the result as ``neighborhood_descriptions.<output_format>`` inside
    ``args.output_folder``.

    Args:
        args: Parsed command-line arguments (documents_dir, output_folder,
            input_format, output_format, batch_size, model, temperature,
            max_tokens).

    Raises:
        ValueError: If the batch size is smaller than 1 or no input document is found.
    """
    documents_dir, output_folder = args.documents_dir, args.output_folder
    input_format, output_format, batch_size = args.input_format, args.output_format, args.batch_size
    model, temperature, max_tokens = args.model, args.temperature, args.max_tokens
    if batch_size < 1:
        # range() rejects a zero step and a negative step would silently skip every request.
        raise ValueError(f'batch size must be at least 1, got {batch_size}')
    gd = GPTDescriptor(documents_dir=documents_dir, input_format=input_format, model=model, temperature=temperature, max_tokens=max_tokens)
    documents_list = gd.list_files()
    if not documents_list:
        raise ValueError(f'No .{input_format} documents found in {documents_dir!r}')
    df = gd.read_documents(documents_list)
    # Skip missing values so no request is spent on describing a "nan" neighborhood.
    neighborhoods = df.neighborhood.dropna().unique().tolist()
    results = list()
    for start in range(0, len(neighborhoods), batch_size):
        neighborhoods_mini_batch = neighborhoods[start:start + batch_size]
        # Requests of one mini-batch run concurrently; gather() keeps their input order.
        tasks = [gd.invoke_llm(neighborhood) for neighborhood in neighborhoods_mini_batch]
        values = await asyncio.gather(*tasks)
        results.extend(values)
    neighborhood_dict = dict(neighborhood=neighborhoods, neighborhood_description=results)
    # makedirs also creates missing parent folders and tolerates an existing folder.
    os.makedirs(output_folder, exist_ok=True)
    path = os.path.join(output_folder, f'neighborhood_descriptions.{output_format}')
    gd.compile_and_save(neighborhood_dict=neighborhood_dict, path=path, output_format=output_format)


if __name__ == "__main__" :
    # Command-line entry point: parse the options and run the async pipeline once.
    parser = argparse.ArgumentParser(description='builds a dataframe of the webpages selected')
    parser.add_argument('-d', '--documents-dir', type=str, default='documents')
    parser.add_argument('-o', '--output-folder', type=str, default='description')
    parser.add_argument('-if', '--input-format', type=str, default='csv')
    parser.add_argument('-of', '--output-format', type=str, default='csv')
    parser.add_argument('-b', '--batch-size', type=int, default=5)
    parser.add_argument('-m', '--model', type=str, default='gpt-3.5-turbo')
    parser.add_argument('-t', '--temperature', type=float, default=0.7)
    parser.add_argument('-k', '--max-tokens', type=int, default=200)

    args = parser.parse_args()
    # asyncio.run() creates a fresh event loop, runs main() to completion and closes the
    # loop afterwards; asyncio.get_event_loop() without a running loop is deprecated.
    asyncio.run(main(args))

Overwriting build_neighborhood_description.py


In [None]:
# Run the generated script: read the xlsx files in `documents`, request descriptions in
# batches of 5 and write the result as xlsx into the `description` folder.
!python build_neighborhood_description.py -d documents -o description -if xlsx -of xlsx -b 5