In [4]:
import os
import openai
import numpy as np
import tiktoken
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv
# Read the .env file located by find_dotenv() into the process environment.
env_file = find_dotenv()
load_dotenv(env_file)

# Hand the key to the OpenAI client; os.getenv yields None when the variable is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [5]:
# Only confirm that a key was loaded; never echo the secret itself into saved output.
# NOTE: a key that has ever been displayed in a saved notebook must be treated as
# compromised and rotated.
openai.api_key is not None

True

In [7]:
# Load the raw property listings export, then show how the rows are spread across cities.
data = pd.read_csv('data/property_details_202209130130.csv')
data["city"].value_counts()


city
Lahore       22322
Karachi       5652
Islamabad     1001
Peshawar       103
Name: count, dtype: int64

In [8]:
# Lahore dominates the dataset, so only that city is down-sampled; every other
# city keeps all of its rows.
LAHORE_SAMPLE_SIZE = 5000
SAMPLE_SEED = 42  # fixed seed so the same subset is drawn on every run

short_frames = []
for city in data.city.unique():
    city_frame = data[data.city == city]
    if city == "Lahore":
        # min() keeps sample() from raising if fewer rows than requested are available.
        city_frame = city_frame.sample(
            min(LAHORE_SAMPLE_SIZE, len(city_frame)),
            random_state=SAMPLE_SEED,
            axis=0,
        )
    short_frames.append(city_frame)

df = pd.concat(short_frames, axis=0)
df.shape

(11756, 26)

In [11]:
# Persist the down-sampled frame to small_df.csv, leaving the index out of the file.
df.to_csv("small_df.csv", index=False)

In [4]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

encoding = tiktoken.get_encoding(embedding_encoding)

# Drop missing and empty descriptions BEFORE tokenising: a NaN description is a
# float, not text, and must never reach encoding.encode().
# .copy() gives an independent frame so the column assignment below is unambiguous.
df = df[df.description.notna() & (df.description != "")].copy()

# omit descriptions that are too long to embed
df["n_tokens"] = df.description.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens]
df.shape

NameError: name 'df' is not defined

In [34]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding the OpenAI API produces for ``text``.

    Newlines in ``text`` are replaced by single spaces before the request is
    made. The text is sent as a one-element input list, and the ``embedding``
    field of the first item in the response's ``data`` is returned.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

# Embedding calls the OpenAI API once per row, so compute it a single time and
# cache the result on disk. Skipping the work when the cache already exists also
# keeps a re-run from overwriting the cached file with a frame that lacks the
# embedding column. Delete the file to force a fresh computation.
EMBEDDED_CSV = 'small_df_with_embedding.csv'
if not os.path.exists(EMBEDDED_CSV):
    df['description_ada_embedding'] = df.description.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
    df.to_csv(EMBEDDED_CSV, index=False)

In [3]:
def _parse_embedding(cell):
    """Turn a stringified list such as "[0.1, -0.2]" back into a list of floats."""
    return [float(piece.strip(' []')) for piece in cell.split(',')]


# The CSV holds each embedding as text, so rebuild the numeric lists after loading.
df = pd.read_csv('small_df_with_embedding.csv')
df["description_ada_embedding"] = df["description_ada_embedding"].apply(_parse_embedding)
df.head(1)

Unnamed: 0,property_id,location_id,page_url,type_id,_type,price,price_1,location,city_id,city,...,bedrooms,date_added,description,agency_id,_agency,agent_id,_agent,amenities,n_tokens,description_ada_embedding
0,39096217,1447,https://www.zameen.com/Property/dha_defence_dh...,1,House,79800000,PKR7.98 Crore,DHA Defence,1,Lahore,...,6,9/1/2022,1 Kanal Beautifully Designed Modern House For ...,77.0,Khalifa Real Estate,419.0,ASIF,PKR7.98 Crore|Community Features|Community Fea...,415,"[-0.00431676022708416, 0.020778756588697433, -..."


In [6]:
import os
import pinecone
from langchain.vectorstores import Pinecone

# embedding model parameters
embedding_model = "text-embedding-ada-002"

# Number of vectors sent to Pinecone per upsert request.
BATCH_SIZE = 100

# Connection details are read from the environment rather than written in the notebook.
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENV"),
)

index_name = os.getenv("PINECONE_INDEX_NAME")

index = pinecone.Index(index_name=index_name)

# Walk the frame in BATCH_SIZE strides. Stepping the range by BATCH_SIZE keeps the
# batch count tied to the constant and never yields an empty batch, so every
# iteration has a fresh upsert response to inspect.
for batch_start in tqdm(range(0, df.shape[0], BATCH_SIZE)):
    batch_df = df.iloc[batch_start:batch_start + BATCH_SIZE]
    vectors = [
        {
            # The DataFrame index label doubles as the vector id, so a match can
            # later be looked up with df.loc[int(id)].
            'id': str(i),
            'values': row["description_ada_embedding"],
            'metadata': {
                'type': row["_type"],
                'price': row["price"],
                'city': row["city"],
                'baths': row["baths"],
                'beds': row["bedrooms"],
                'description': row["description"],
            },
        }
        for i, row in batch_df.iterrows()
    ]
    upsert_response = index.upsert(vectors)
    # NOTE(review): confirm the installed pinecone client exposes `failed` / `errors`
    # on the upsert response.
    if upsert_response.failed:
        print(upsert_response.errors)



  from tqdm.autonotebook import tqdm
100%|██████████| 31/31 [00:22<00:00,  1.36it/s]


In [7]:
import pinecone 

embedding_model = "text-embedding-ada-002"

# Connect to Pinecone with credentials taken from the environment.
pinecone.init(
    environment=os.getenv("PINECONE_ENV"),
    api_key=os.getenv("PINECONE_API_KEY"),
)
index_name = os.getenv("PINECONE_INDEX_NAME")
index = pinecone.Index(index_name=index_name)

city = 'Lahore'
query = "A house with garden in the suburbs, close to amenities such as shopping, and in a safe area"

# Embed the free-text query with the model named above.
embedding_response = openai.Embedding.create(input=[query], model=embedding_model)
embedded_query = embedding_response['data'][0]['embedding']

# Only consider vectors whose `city` metadata equals the chosen city.
city_filter = {"city": {'$eq': city}}

# Ask for the three closest vectors, returning their metadata but not the raw values.
query_response = index.query(
    vector=embedded_query,
    filter=city_filter,
    top_k=3,
    include_metadata=True,
    include_values=False,
)
query_response

{'matches': [{'id': '434',
              'metadata': {'baths': 4.0,
                           'beds': 4.0,
                           'city': 'Lahore',
                           'description': 'A Very Well Maintained Family House '
                                          'Is Out For Sale With All Amenities '
                                          'Available Nearby Within Walking '
                                          'Distance Range.',
                           'price': 19500000.0,
                           'type': 'House'},
              'score': 0.877978384,
              'values': []},
             {'id': '774',
              'metadata': {'baths': 6.0,
                           'beds': 5.0,
                           'city': 'Lahore',
                           'description': '5 Bedrooms6 Bathrooms2 kitchen2 Tv '
                                          'loungesDrawing roomstorecar '
                                          'porchAll Facilities Are Available '
     

In [8]:
# For each match, fetch the corresponding row from the local frame by its id
# and print that row's description and city, separated by a blank line.
for match in query_response["matches"]:
    matched_row = df.loc[int(match["id"])]
    print(matched_row.description, matched_row.city, end='\n\n')

Ground floor2 room1 bathroom1 open kitchen1st floor1 room with attach bathroom Lahore

3bad attached bath double kitchen TV lounge Drawing diningTile flooring small tarce small balkni Big car porch  Good wood work Excellent location All facilities nearly door step Lahore

3 Beds Attach Bathrooms 2 Kitchens Small Porch 15 Feet Street 2 Minutes Walking Distance from Salamat Pura Metro Orange Line Train Station Near to Allied School Near to Hospital Lahore



In [9]:
# Look up row 434 in the local frame; '434' is the id of the first match returned by the query above.
print(df.loc[434].description)

A Very Well Maintained Family House Is Out For Sale With All Amenities Available Nearby Within Walking Distance Range.


In [12]:
# Read the second match's description straight from the metadata in the query response.
query_response["matches"][1]['metadata']["description"]

'5 Bedrooms6 Bathrooms2 kitchen2 Tv loungesDrawing roomstorecar porchAll Facilities Are Available HereSolid constructionVery Beautiful houseFully tile and marblewood working hereCarpeted roadNear by marketNear by schoolNear by orange line metro train'

# Using GPT to write new, higher-quality descriptions

This prompt worked well: https://platform.openai.com/playground/p/YAiC9hRb8UEdVN7ntgZQ30yg?model=gpt-3.5-turbo

# Using GPT to re-write original description

This prompt worked well: https://platform.openai.com/playground/p/jQ2BaueSU4X2HfCuVcZlf98t?model=gpt-3.5-turbo
(Tell the model it is an English teacher.)