In [8]:
import os
import ast
import json
import warnings
import pandas as pd
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

# Silence every warning category for the rest of the session so library
# chatter does not clutter the cell outputs.
warnings.simplefilter('ignore')

# Sentence-embedding model used for both the listing descriptions and the
# search query; its embedding dimension sizes the Qdrant vectors further down.
encoder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")



### Load and Clean Data

In [9]:
# Load the cleaned listings export. Latin-1 maps every byte to a character, so
# the read itself never fails on encoding; the text is repaired afterwards.
df = pd.read_csv(
    filepath_or_buffer="../assets/data/clean/listings.csv",
    encoding="latin-1",
)

In [10]:
# Fix garbled text (mojibake) in all string columns.
def _fix_mojibake(value):
    """Undo UTF-8 text that was decoded as Latin-1.

    Non-string values are returned unchanged. A string that cannot be
    round-tripped (its Latin-1 bytes are not valid UTF-8, or it contains
    characters outside Latin-1) is also returned unchanged, so one odd value
    does not abort the whole pass and re-running the cell on already-repaired
    text is harmless.
    """
    if not isinstance(value, str):
        return value
    try:
        return value.encode('latin1').decode('utf-8')
    except UnicodeError:
        # Covers both UnicodeEncodeError and UnicodeDecodeError.
        return value


for col in df.select_dtypes(include='object'):
    df[col] = df[col].apply(_fix_mojibake)

In [11]:
def try_fix_unicode(text):
    """Decode literal backslash escapes (e.g. ``\\u00e1``) embedded in a string.

    Args:
        text: Any value. Only ``str`` values containing the two characters
            ``\\u`` are processed; everything else is returned unchanged.

    Returns:
        The string with its escape sequences decoded, or the original value
        when it is not a candidate or the escapes are malformed.
    """
    if not (isinstance(text, str) and '\\u' in text):
        return text
    try:
        # unicode_escape reads non-escape bytes as Latin-1, so encoding with
        # UTF-8 first would turn a real "á" into "Ã¡". Encoding as Latin-1
        # keeps those characters intact; anything outside Latin-1 is written
        # as a \uXXXX / \UXXXXXXXX escape that the decode step restores.
        return text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    except UnicodeError:
        return text  # malformed escape sequence: keep the original text

# Second clean-up pass: run try_fix_unicode over every value of every
# object-dtype column.
object_columns = df.select_dtypes(include='object').columns
for column_name in object_columns:
    df[column_name] = df[column_name].apply(try_fix_unicode)

In [12]:
def _build_unique_id(row):
    """Join the row's index label and the last path segment of its link."""
    link_tail = row['link'].split('/')[-1]
    return f"{row.name}_{link_tail}"


df['unique_id'] = df.apply(_build_unique_id, axis=1)

In [15]:
# Peek at the raw `location` column; at this point each value is still the
# string form of a Python list.
df["location"]

0       ['Bogotá', 'Chapinero', 'Santa Fé', 'Cundinama...
1                  ['Bogotá', 'Fontibón', 'Cundinamarca']
2       ['Usaquén', 'Bogotá', 'Cundinamarca', 'Suba', ...
3       ['Kennedy', 'Bogotá', 'Cundinamarca', 'Avenida...
4       ['Bogotá', 'Chapinero', 'Cundinamarca', 'Barri...
                              ...                        
1183                                ['Usaquén', 'Bogotá']
1184              ['Bogotá', 'Chapinero', 'Cundinamarca']
1185    ['Bogotá', 'Chapinero', 'Santa Fé', 'Teusaquil...
1186       ['Suba', 'Bogotá', 'Cundinamarca', 'Turingia']
1187    ['Puente Aranda', 'Teusaquillo', 'Centro comer...
Name: location, Length: 1188, dtype: object

In [19]:
# Drop every row that has at least one missing value. The index is not reset,
# so the surviving rows keep their original labels.
df = df.dropna(axis=0, how="any")

In [20]:

# Parse the stringified Python lists in `location` into real lists. Values that
# are no longer strings are left as they are, so re-running this cell does not
# raise on rows that were already parsed.
df["location"] = df["location"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)


In [21]:
# Spot-check one parsed value: the row labelled 4 should now hold a list.
df["location"][4]

['Bogotá', 'Chapinero', 'Cundinamarca', 'Barrios Unidos']

In [22]:
# Flatten the per-listing location lists into the set of distinct labels.
unique_locations = {
    place
    for places in df["location"].dropna()
    for place in places
}


In [24]:
# List copy of the distinct labels (order follows the set's iteration order).
locations = [*unique_locations]

In [26]:
# Alphabetical view of the distinct labels; `locations` itself is not reordered.
locations_sorted = list(locations)
locations_sorted.sort()
locations_sorted

['1 De Mayo',
 '25',
 'AV. Centenario - KR 87',
 'Ac. 26',
 'Ak 7',
 'Ak 9',
 'Ak.',
 'Altagracia',
 'Alto',
 'Altos del Country',
 'Amazonas',
 'Antonio Nariño',
 'Aranjuez',
 'Autopista Sur',
 'Av. Boyacá',
 'Ave Cra 30',
 'Avenida Carrera 22',
 'Avenida de las Américas',
 'BOGOTÁ',
 'Bajando por LA CARMELITA Colinas de Suba',
 'Barrio Ricaurte',
 'Barrios Unidos',
 'Bella Flor',
 'Bella Suiza',
 'Bilbao',
 'Bogot\x1a',
 'Bogota',
 'Bogota D.C.',
 'Bogotá',
 'Bogotá D.C',
 'Bogotá D.C.',
 'Bogotá, Colombia',
 'Bogotá, Cundinamarca',
 'Bogotá, DC',
 'Bolívar',
 'Boogotá',
 'Bosa',
 'Boyacá',
 'Buenos Aires',
 'CC Unicentro de Occidente',
 'CEDRITOS',
 'Calle 181',
 'Calvo sur',
 'Candelaria',
 'Carrera 58',
 'Castilla',
 'Cdad. Bolívar',
 'Cdad. Bolívar, Bogotá, Colombia',
 'Cdad. Bolívar, Bogotá, Cundinamarca, Colombia',
 'Cedritos',
 'Central',
 'Centro',
 'Centro Comercial',
 'Centro Comercial Bacatá',
 'Centro Comercial Palatino',
 'Centro Comercial Santafé',
 'Centro Empresarial 

In [6]:
# Count missing values per column.
df.isnull().sum(axis=0)

link                    0
price                   0
bedrooms                0
bathrooms               0
area                    0
agency                  0
coordinates             0
facilities              0
upload_date             0
stratum                 0
parking_lots            0
floor                   0
construction_age_min    0
construction_age_max    0
description             0
places                  0
location                0
transportation          0
dtype: int64

In [7]:
# Show the dtype of each column.
df.dtypes

link                            object
price                            int64
bedrooms                         int64
bathrooms                        int64
area                             int64
agency                          object
coordinates                     object
facilities                      object
upload_date             datetime64[ns]
stratum                          int64
parking_lots                     int64
floor                            int64
construction_age_min             int64
construction_age_max             int64
description                     object
places                          object
location                        object
transportation                  object
dtype: object

### Create Collection

In [8]:
# Vector settings for the collection: one vector per point, sized to the
# encoder's embedding dimension and compared with cosine distance.
apartment_vectors = models.VectorParams(
    size=encoder.get_sentence_embedding_dimension(),
    distance=models.Distance.COSINE,
)

client.create_collection(collection_name="apartments", vectors_config=apartment_vectors)

True

In [9]:
class Document:
    """Plain container pairing a piece of text with its metadata.

    Attributes:
        page_content: The text stored for this document.
        metadata: The object holding the accompanying fields; stored as given,
            not copied.
    """

    def __init__(self, page_content, metadata):
        """Store both arguments on the instance unchanged."""
        self.page_content, self.metadata = page_content, metadata

def df_to_documents(df):
    """Convert each row of ``df`` into a ``Document``.

    The ``description`` column becomes the document text. The listing columns
    named in ``metadata_columns`` are copied into the metadata dict under the
    same keys and in the same order.

    Args:
        df: DataFrame containing ``description`` and every metadata column.

    Returns:
        A list with one ``Document`` per row, in row order.
    """
    metadata_columns = (
        "link",
        "price",
        "bedrooms",
        "bathrooms",
        "area",
        "agency",
        "coordinates",
        "facilities",
        "upload_date",
        "stratum",
        "parking_lots",
        "floor",
        "construction_age_min",
        "construction_age_max",
        "places",
        "location",
        "transportation",
    )

    def _row_to_document(record):
        # Metadata is read before the description, column by column.
        metadata = {column: record[column] for column in metadata_columns}
        return Document(page_content=record["description"], metadata=metadata)

    return [_row_to_document(record) for _, record in df.iterrows()]

# Materialise one Document per listing row.
docs = df_to_documents(df=df)

In [10]:
def _to_point(point_id, document):
    """Embed one document's text and wrap it, with its metadata, in a PointStruct."""
    embedding = encoder.encode(document.page_content).tolist()
    payload = {'metadata': document.metadata, 'page_content': document.page_content}
    return models.PointStruct(id=point_id, vector=embedding, payload=payload)


# Point ids are the documents' positions in `docs`.
points = [_to_point(position, document) for position, document in enumerate(docs)]

In [11]:
# Send every prepared point to the "apartments" collection.
client.upload_points(collection_name="apartments", points=points)

In [12]:
# Semantic search restricted by payload filters: agency "Houm", price between
# 2,500,000 and 3,500,000, and 2 to 3 bedrooms. Returns at most 10 hits.
hits = client.search(
    collection_name="apartments",
    query_vector=encoder.encode("Apartamento moderno y minimalista con vista exterior").tolist(),
    query_filter=models.Filter(
        must=[
            models.FieldCondition(key="metadata.agency", match=models.MatchValue(value="Houm")),
            models.FieldCondition(key="metadata.price", range=models.Range(gte=2500000, lte=3500000)),
            models.FieldCondition(key="metadata.bedrooms", range=models.Range(gte=2, lte=3)),
        ]
    ),
    limit=10,
)

# Print link, price, agency and bedrooms for each hit. Every label starts with
# "\n" so it lands on its own line ("\a" would emit the BEL control character
# and swallow the first letter of the label).
for hit in hits:
    listing = hit.payload['metadata']
    print(
        listing['link'],
        "\nprice:", listing['price'],
        "\nagency:", listing['agency'],
        "\nbedrooms:", listing['bedrooms'],
        "\n\n",
    )