In [None]:
import pandas as pd

# Read the cleaned listings produced by the preprocessing step.
df = pd.read_csv("../data/processed/cleaned_data.csv")

# Assemble the table consumed by the embedding step, one row per listing.
embedding_columns = {
    # Positional row number (0..N-1) is used as the listing identifier.
    "listing_id": range(len(df)),
    "title": df["title"],
    "subtitle": df["subtitle"],
    # No model output yet: the observed purchase price fills the prediction column.
    "pred_price": df["buy_price"],
    # JSON-encoded empty list, repeated on every row as a placeholder.
    "shap_top3": "[]",
}
df_to_embed = pd.DataFrame(embedding_columns)

# Write the table next to the cleaned data, without the DataFrame index column.
output_csv = "../data/processed/listings_with_preds.csv"
df_to_embed.to_csv(output_csv, index=False)
row_count = len(df_to_embed)
print("Created listings_with_preds.csv with", row_count, "rows")


✅ Created listings_with_preds.csv with 21707 rows


In [1]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

# os.environ only accepts strings: assigning None would fail with an opaque
# "str expected, not NoneType" TypeError. Stop here with an actionable message
# instead of failing later inside the OpenAI client.
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

os.environ["OPENAI_API_KEY"] = api_key


In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd, json, numpy as np, pathlib

# Listings table written by the first cell (title, subtitle, price, SHAP placeholder).
df = pd.read_csv("../data/processed/listings_with_preds.csv")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

# Build one text block per listing; texts[i] and ids[i] describe the same row.
texts, ids = [], []
for _, r in df.iterrows():
    # An empty CSV field is read back as NaN (a float), which json.loads rejects,
    # so anything that is not a non-blank string is treated as "no drivers".
    raw_shap = r["shap_top3"]
    if isinstance(raw_shap, str) and raw_shap.strip():
        shap_list = json.loads(raw_shap)
    else:
        shap_list = []

    # A missing title/subtitle would otherwise be embedded as the literal text "nan".
    title = "" if pd.isna(r["title"]) else r["title"]
    subtitle = "" if pd.isna(r["subtitle"]) else r["subtitle"]

    txt = (
        f"Title: {title}\n"
        f"Neighbourhood: {subtitle}\n"
        # map(str, ...) keeps the join working if the drivers are not plain strings.
        f"Top-3 price drivers: {', '.join(map(str, shap_list))}\n"
        f"Predicted price: €{r['pred_price']:,.0f}"
    )
    texts.append(txt)
    ids.append(int(r["listing_id"]))

# One float32 vector per text, in the same order as `ids`.
emb = model.encode(texts, convert_to_numpy=True).astype("float32")

# Persist vectors and ids side by side so they can be reloaded together.
pathlib.Path("vectorstore").mkdir(exist_ok=True)
np.save("vectorstore/embeddings.npy", emb)
np.save("vectorstore/ids.npy", np.array(ids))

print("Saved", emb.shape[0], "embeddings to vectorstore/")


✅ Saved 21707 embeddings to vectorstore/


In [4]:
import numpy as np
import pathlib

# NOTE(review): this cell repeats the embedding step of the previous cell and
# relies on `df`, `model`, `json` and `pd` already being defined there; consider
# keeping only one of the two cells.


def _listing_text(row):
    """Render one listing row as the text block that gets embedded.

    Missing title/subtitle values become empty strings instead of the literal
    "nan", and a missing or blank `shap_top3` cell is treated as an empty list
    (json.loads cannot parse the NaN that an empty CSV field is read back as).
    """
    raw_shap = row["shap_top3"]
    if isinstance(raw_shap, str) and raw_shap.strip():
        drivers = json.loads(raw_shap)
    else:
        drivers = []

    title = "" if pd.isna(row["title"]) else row["title"]
    subtitle = "" if pd.isna(row["subtitle"]) else row["subtitle"]

    return (
        f"Title: {title}\n"
        f"Neighbourhood: {subtitle}\n"
        # map(str, ...) keeps the join working if the drivers are not plain strings.
        f"Top-3 price drivers: {', '.join(map(str, drivers))}\n"
        f"Predicted price: €{row['pred_price']:,.0f}"
    )


# texts[i] and ids[i] describe the same row of `df`.
texts = [_listing_text(r) for _, r in df.iterrows()]
ids = [int(listing_id) for listing_id in df["listing_id"]]

emb_matrix = model.encode(texts, convert_to_numpy=True).astype("float32")   # shape (N, 384)

# Save
pathlib.Path("vectorstore").mkdir(exist_ok=True)
np.save("vectorstore/embeddings.npy", emb_matrix)
np.save("vectorstore/ids.npy", np.array(ids))

print("Saved", emb_matrix.shape[0], "embeddings → vectorstore/")


Saved 21707 embeddings → vectorstore/


In [6]:
import faiss
import numpy as np

import os

# Load the embeddings and listing ids saved by the embedding step.
emb = np.load("vectorstore/embeddings.npy")
ids = np.load("vectorstore/ids.npy")

# Search results are positions into `ids`, so both arrays must line up row for row.
if len(ids) != emb.shape[0]:
    raise ValueError(
        f"vectorstore is inconsistent: {emb.shape[0]} embeddings but {len(ids)} ids"
    )

index_path = "vectorstore/madrid.faiss"
if os.path.exists(index_path):
    index = faiss.read_index(index_path)
else:
    # No cell in this notebook writes madrid.faiss, so a fresh checkout would stop
    # here. Rebuild an exact inner-product index over L2-normalised vectors, i.e.
    # cosine similarity, which matches the query cell normalising its query vector.
    # NOTE(review): confirm this matches how madrid.faiss was originally produced.
    unit_vectors = np.array(emb, dtype="float32", order="C")  # copy: `emb` stays as saved
    faiss.normalize_L2(unit_vectors)  # in place
    index = faiss.IndexFlatIP(unit_vectors.shape[1])
    index.add(unit_vectors)
    faiss.write_index(index, index_path)

# A stale index would map search hits to the wrong listing ids without any error.
if index.ntotal != len(ids):
    raise ValueError(
        f"{index_path} holds {index.ntotal} vectors but ids.npy has {len(ids)} entries; "
        "rebuild the index from the current embeddings"
    )


In [7]:
from sentence_transformers import SentenceTransformer

# Same sentence-transformer that produced the stored listing embeddings.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

query = "modern flat with terrace in chamartín"
q_vec = model.encode(query, convert_to_numpy=True).astype("float32")

# FAISS expects a 2-D (n_queries, dim) array and normalises it in place. Build the
# single-row matrix once and pass that same object to both calls, rather than
# relying on two separate reshape() results sharing memory.
q_mat = q_vec.reshape(1, -1)
faiss.normalize_L2(q_mat)

D, I = index.search(q_mat, k=5)

# FAISS pads the label row with -1 when it finds fewer than k neighbours;
# ids[-1] would silently report the last listing, so drop those slots.
hits = I[0][I[0] >= 0]
print("Top 5 matching listing IDs:", ids[hits])


Top 5 matching listing IDs: [14277 15712 14856 15021 14863]


In [24]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
import pandas as pd

# 1. Load your data
df = pd.read_csv("../data/processed/listings_with_preds.csv")

# One Document per listing: a short description as content, the listing id as
# metadata. itertuples() avoids building a Series per row like iterrows() does.
docs = [
    Document(
        page_content=f"Listing {row.listing_id}: {row.title} in {row.subtitle}, price €{row.pred_price}",
        metadata={"listing_id": int(row.listing_id)},
    )
    for row in df.itertuples(index=False)
]

# 2. Use local model to avoid OpenAI embedding mismatch
# HuggingFaceEmbeddings is configured from the model name alone, so the separate
# SentenceTransformer instance that used to be created here (and never used) is
# dropped rather than loading the same model a second time.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

# 3. Build the new vectorstore from scratch
vectorstore = FAISS.from_documents(docs, embedding)
retriever = vectorstore.as_retriever()


  embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")


In [25]:
# Chat model that writes the final answer; temperature is pinned to 0.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Question-answering chain that feeds documents from `retriever` to the LLM and
# also returns the documents it used alongside the answer.
qa_options = {
    "llm": llm,
    "retriever": retriever,
    "return_source_documents": True,
}
qa = RetrievalQA.from_chain_type(**qa_options)

question = "Why are homes in Chamartín more expensive?"
response = qa.invoke(question)
print("🤖", response["result"])


🤖 Homes in Chamartín may be more expensive due to factors such as the neighborhood's location, amenities, infrastructure, and overall desirability. Additionally, Chamartín is known for being an upscale area in Madrid, which can contribute to higher property prices compared to other neighborhoods.
