In [12]:
import pandas as pd

# Load dataset
df = pd.read_csv("../data/processed/cleaned_data.csv")

# Create the table for embeddings
df_to_embed = pd.DataFrame({
    "listing_id": range(len(df)),  # Generate unique ID from index
    "title": df["title"],
    "subtitle": df["subtitle"],
    "pred_price": df["buy_price"],  # Use actual price
    "shap_top3": "[]"               # placeholder for now
})

# Save it
df_to_embed.to_csv("../data/processed/listings_with_preds.csv", index=False)
print("Created listings_with_preds.csv with", len(df_to_embed), "rows")


Created listings_with_preds.csv with 21707 rows


In [13]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
for col in ["neighborhood_id", "house_type_id"]:
    df[col] = df[col].astype("category").cat.codes

# Define features and target
features = [
    "sq_mt_built", "n_rooms", "n_bathrooms", "neighborhood_id", 
    "house_type_id", "has_terrace", "has_lift", "is_exterior",
    "log_sq_mt_built", "building_age"
]
target = "log_buy_price"

# Prepare data
df_model = df[features + [target]].dropna()
X = df_model[features]
y = df_model[target]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost regressor
model = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("XGBoost RMSE:", round(rmse, 3))

# Predict on full data and save
df["pred_price"] = np.expm1(model.predict(df[features].fillna(0)))



XGBoost RMSE: 0.222


In [14]:
import shap
import json

# 1. Create SHAP explainer
explainer = shap.Explainer(model)
shap_values = explainer(X)

# 2. Get top-3 features per listing (by absolute SHAP value)
top3_idxs = np.argsort(-np.abs(shap_values.values), axis=1)[:, :3]
top3_features = [[features[i] for i in row] for row in top3_idxs]

# 3. Reload the listings_with_preds file
df_preds = pd.read_csv("../data/processed/listings_with_preds.csv")

# 4. Add SHAP top 3 as a stringified JSON list
df_preds["shap_top3"] = [json.dumps(lst) for lst in top3_features]

# 5. Save it again
df_preds.to_csv("../data/processed/listings_with_preds.csv", index=False)
print("SHAP explanations added and saved")


SHAP explanations added and saved


In [15]:
import os
from dotenv import load_dotenv

load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

os.environ["OPENAI_API_KEY"] = api_key


In [16]:
from sentence_transformers import SentenceTransformer
import pandas as pd, json, numpy as np, pathlib

df = pd.read_csv("../data/processed/listings_with_preds.csv")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

texts, ids = [], []
for _, r in df.iterrows():
    shap_list = json.loads(r["shap_top3"])  # even if empty
    txt = (
        f"Title: {r['title']}\n"
        f"Neighbourhood: {r['subtitle']}\n"
        f"Top-3 price drivers: {', '.join(shap_list)}\n"
        f"Predicted price: €{r['pred_price']:,.0f}"
    )
    texts.append(txt)
    ids.append(int(r["listing_id"]))

emb = model.encode(texts, convert_to_numpy=True).astype("float32")

output_dir = "../vectorstore"
pathlib.Path(output_dir).mkdir(exist_ok=True)
np.save(f"{output_dir}/embeddings.npy", emb)
np.save(f"{output_dir}/ids.npy", np.array(ids))

print("Saved", emb.shape[0], "embeddings to", output_dir)


Saved 21707 embeddings to ../vectorstore


In [19]:
import faiss


index = faiss.IndexFlatL2(emb.shape[1])
index.add(emb)

faiss.write_index(index, "../vectorstore/madrid.faiss")
print("FAISS index saved to ../vectorstore/madrid.faiss")


FAISS index saved to ../vectorstore/madrid.faiss


In [20]:
import faiss
import numpy as np
output_dir = "../vectorstore"

# Load saved embeddings and index
emb = np.load(f"{output_dir}/embeddings.npy")
ids = np.load(f"{output_dir}/ids.npy")
index = faiss.read_index(f"{output_dir}/madrid.faiss")


In [21]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

query = "modern flat with terrace in chamartín"
q_vec = model.encode(query, convert_to_numpy=True).astype("float32")
faiss.normalize_L2(q_vec.reshape(1, -1))

D, I = index.search(q_vec.reshape(1, -1), k=5)
print("Top 5 matching listing IDs:", ids[I[0]])


Top 5 matching listing IDs: [15712 15404 15021 15469 14339]


In [22]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
import pandas as pd


df = pd.read_csv("../data/processed/listings_with_preds.csv")

docs = [
    Document(
        page_content=f"Listing {row.listing_id}: {row.title} in {row.subtitle}, price €{row.pred_price}",
        metadata={"listing_id": int(row.listing_id)}
    )
    for _, row in df.iterrows()
]


hf_model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")


vectorstore = FAISS.from_documents(docs, embedding)
retriever = vectorstore.as_retriever()


In [23]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

response = qa.invoke("Why are homes in Chamartín more expensive?")
print("🤖", response["result"])


🤖 Homes in Chamartín may be more expensive due to factors such as the neighborhood's location, amenities, infrastructure, demand, and overall desirability. Additionally, specific features of the properties, such as size, condition, and unique characteristics, can also influence the pricing.
