In [50]:
import os
import ast
import json
import warnings
import pandas as pd
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

warnings.filterwarnings('ignore')
encoder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

client = QdrantClient(
        url="http://localhost:6333"
)

### Load and Clean Data

In [51]:
df = pd.read_csv("../assets/data/listings_clean.csv", encoding="latin-1")

In [52]:
# Fix garbled text (mojibake) in all string columns
for col in df.select_dtypes(include='object'):
    df[col] = df[col].apply(lambda x: x.encode('latin1').decode('utf-8') if isinstance(x, str) else x)

In [53]:
def try_fix_unicode(text):
    if isinstance(text, str) and '\\u' in text:
        try:
            return text.encode('utf-8').decode('unicode_escape')
        except Exception:
            return text  # if decoding fails, return original
    else:
        return text

for col in df.select_dtypes(include='object'):
    df[col] = df[col].apply(try_fix_unicode)

In [None]:
df["facilities"] = df["facilities"].apply(ast.literal_eval)
df["places"] = df["places"].apply(ast.literal_eval)
df["location"] = df["location"].apply(ast.literal_eval)
df["transportation"] = df["transportation"].apply(ast.literal_eval)
df["upload_date"] = pd.to_datetime(df["upload_date"], errors='coerce')

In [55]:
df.isna().sum()

link                    0
price                   0
bedrooms                0
bathrooms               0
area                    0
agency                  0
coordinates             0
facilities              0
upload_date             0
stratum                 0
parking_lots            0
floor                   0
construction_age_min    0
construction_age_max    0
places                  0
location                0
transportation          0
description             0
dtype: int64

In [56]:
df.dtypes

link                    object
price                    int64
bedrooms                 int64
bathrooms                int64
area                     int64
agency                  object
coordinates             object
facilities              object
upload_date             object
stratum                  int64
parking_lots             int64
floor                    int64
construction_age_min     int64
construction_age_max     int64
places                  object
location                object
transportation          object
description             object
dtype: object

### Create Collection

In [57]:
client.create_collection(
    collection_name="apartments_test",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)

True

In [58]:
# Document class to structure data
class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

# Convert DataFrame rows into Document objects
def df_to_documents(df):
    documents = []
    for _, row in df.iterrows():
        metadata = {
            "link": row["link"],
            "price": row["price"],
            "bedrooms": row["bedrooms"],
            "bathrooms": row["bathrooms"],
            "area": row["area"],
            "agency": row["agency"],
            "coordinates": row["coordinates"],
            "facilities": row["facilities"],
            "upload_date": row["upload_date"],
            "stratum": row["stratum"],
            "parking_lots": row["parking_lots"],
            "floor": row["floor"],
            "construction_age_min": row["construction_age_min"],
            "construction_age_max": row["construction_age_max"],
            "places": row["places"],
            "location": row["location"],
            "transportation": row["transportation"],
        }
        document = Document(page_content=row["description"], metadata=metadata)
        documents.append(document)
    return documents

docs = df_to_documents(df)

In [59]:
points = [
    models.PointStruct(
        id=idx, 
        vector=encoder.encode(doc.page_content).tolist(), 
        payload={'metadata': doc.metadata, 'page_content': doc.page_content}
    )
    for idx, doc in enumerate(docs)
]

In [60]:
client.upload_points(
    collection_name="apartments_test",
    points=points,
)

In [61]:
# query filter
hits = client.search(
    collection_name="apartments_test",
    query_vector=encoder.encode("Apartamento moderno y minimalista con vista exterior").tolist(),
    query_filter=models.Filter(
        must=[
            models.FieldCondition(key="metadata.agency", match=models.MatchValue(value="Houm")),
            models.FieldCondition(key="metadata.price", range=models.Range(gte=2500000, lte=3500000)), 
            models.FieldCondition(key="metadata.bedrooms", range=models.Range(gte=2, lte=3))
        ]
    ),
    limit=10,
)

for hit in hits:
    print(hit.payload['metadata']['link'], "\nprice:", hit.payload['metadata']['price'], "\agency:", hit.payload['metadata']['agency'], "\abedrooms:", hit.payload['metadata']['bedrooms'],"\n\n")

https://www.fincaraiz.com.co/apartamento-en-arriendo/192403260 
price: 2700000 gency: Houm bedrooms: 3 


https://www.fincaraiz.com.co/apartamento-en-arriendo/192403262 
price: 3000000 gency: Houm bedrooms: 3 


https://www.fincaraiz.com.co/apartamento-en-arriendo/192403048 
price: 2500000 gency: Houm bedrooms: 3 


