In [None]:
import pandas as pd


# Load the cleaned listings; the modelling cell further down keeps working on `df`.
df = pd.read_csv("../data/processed/cleaned_data.csv")

# Assemble the table that the SHAP and embedding cells read back from disk.
# `pred_price` starts out as a copy of `buy_price` and `shap_top3` as an empty
# JSON list, so the file is complete even before any model has been run.
df_to_embed = df[["title", "subtitle"]].assign(
    pred_price=df["buy_price"],
    shap_top3="[]",
)
# Positional id (0 .. n-1), inserted first to keep the CSV column order
# listing_id, title, subtitle, pred_price, shap_top3.
df_to_embed.insert(0, "listing_id", range(len(df)))

df_to_embed.to_csv("../data/processed/listings_with_preds.csv", index=False)
print("Created listings_with_preds.csv with", len(df_to_embed), "rows")


Created listings_with_preds.csv with 21707 rows


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
# Integer-encode the categorical id columns in place on `df`.
# NOTE(review): `.cat.codes` encodes missing values as -1, so rows with a
# missing id are NOT removed by the `dropna()` below — confirm that is intended.
for col in ["neighborhood_id", "house_type_id"]:
    df[col] = df[col].astype("category").cat.codes

# Define features
features = [
    "sq_mt_built", "n_rooms", "n_bathrooms", "neighborhood_id",
    "house_type_id", "has_terrace", "has_lift", "is_exterior",
    "log_sq_mt_built", "building_age"
]
target = "log_buy_price"

# Train/evaluate only on rows where every feature and the target are present.
df_model = df[features + [target]].dropna()
X = df_model[features]
y = df_model[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_train, y_train)

# Evaluate (the RMSE is on the scale of `log_buy_price`, not in euros)
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("XGBoost RMSE:", round(rmse, 3))

# Predict on full data and save
# NOTE(review): expm1 assumes `log_buy_price` was built with log1p — confirm.
# NOTE(review): missing features are filled with 0 here although the model was
# trained only on complete rows; predictions for those rows are rough.
df["pred_price"] = np.expm1(model.predict(df[features].fillna(0)))

# Persist the predictions. Without this step listings_with_preds.csv keeps the
# observed `buy_price` in its `pred_price` column, and that value is what the
# embedding cells later present as the "Predicted price".
preds_path = "../data/processed/listings_with_preds.csv"
listings = pd.read_csv(preds_path)
if len(listings) != len(df):
    raise ValueError(
        f"{preds_path} has {len(listings)} rows but the modelling frame has "
        f"{len(df)}; re-run the cell that creates the listings file."
    )
# The file was written from `df` in the same row order, so a positional
# assignment lines predictions up with their listing_id.
listings["pred_price"] = df["pred_price"].to_numpy()
listings.to_csv(preds_path, index=False)



XGBoost RMSE: 0.222


In [None]:
import shap
import json

# Explain the fitted model on the rows used for modelling.
explainer = shap.Explainer(model)
shap_values = explainer(X)

# Per row: order features by descending |SHAP| and keep the first three names.
top3_idxs = np.argsort(-np.abs(shap_values.values), axis=1)[:, :3]
top3_features = [[features[i] for i in row] for row in top3_idxs]

# `X` only contains rows that survived dropna(), so it can be shorter than the
# listings file. Key each explanation by X's index label, which equals the
# positional `listing_id` (0 .. n-1) written when the listings file was created,
# instead of assigning by position (which raises on any length mismatch).
top3_json = pd.Series(
    [json.dumps(lst) for lst in top3_features],
    index=X.index,
)

df_preds = pd.read_csv("../data/processed/listings_with_preds.csv")

# Listings without an explanation keep an empty JSON list so downstream
# json.loads() calls still succeed.
df_preds["shap_top3"] = df_preds["listing_id"].map(top3_json).fillna("[]")

df_preds.to_csv("../data/processed/listings_with_preds.csv", index=False)
print("SHAP explanations added and saved")


SHAP explanations added and saved


In [39]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

# Fail with an actionable message: assigning None into os.environ would
# otherwise raise an opaque "str expected, not NoneType" TypeError.
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell or add it to a .env file."
    )

os.environ["OPENAI_API_KEY"] = api_key


In [40]:
from sentence_transformers import SentenceTransformer
import pandas as pd, json, numpy as np, pathlib

# Listings table produced by the earlier cells (one row per listing).
df = pd.read_csv("../data/processed/listings_with_preds.csv")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

# Build one short text per listing, plus the listing id at the same position.
texts = []
ids = []
for listing_id, title, subtitle, price, shap_json in zip(
    df["listing_id"], df["title"], df["subtitle"], df["pred_price"], df["shap_top3"]
):
    # `shap_top3` is stored as a JSON-encoded list of feature names.
    drivers = ", ".join(json.loads(shap_json))
    texts.append(
        f"Title: {title}\n"
        f"Neighbourhood: {subtitle}\n"
        f"Top-3 price drivers: {drivers}\n"
        f"Predicted price: €{price:,.0f}"
    )
    ids.append(int(listing_id))

# Encode every text in one call and store the vectors as float32.
emb = model.encode(texts, convert_to_numpy=True).astype("float32")

# Write the vectors and their ids side by side; row i of embeddings.npy
# belongs to ids.npy[i].
output_dir = "../vectorstore"
pathlib.Path(output_dir).mkdir(exist_ok=True)
np.save(f"{output_dir}/embeddings.npy", emb)
np.save(f"{output_dir}/ids.npy", np.array(ids))

print("Saved", emb.shape[0], "embeddings to", output_dir)


Saved 21707 embeddings to ../vectorstore


In [41]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
import pandas as pd, json
from pathlib import Path


# Listings table produced by the earlier cells (one row per listing).
df = pd.read_csv("../data/processed/listings_with_preds.csv")


# Wrap each listing in a LangChain Document; the listing id travels as metadata.
docs = []
for _, r in df.iterrows():
    raw_shap = r["shap_top3"]
    # read_csv turns an empty cell into NaN, which is truthy, so a plain
    # truthiness test would still hand NaN to json.loads and crash. Only parse
    # non-empty strings; anything else means "no explanation available".
    shap_list = json.loads(raw_shap) if isinstance(raw_shap, str) and raw_shap else []
    text = (
        f"Title: {r['title']}\n"
        f"Neighbourhood: {r['subtitle']}\n"
        f"Top-3 price drivers: {', '.join(shap_list)}\n"
        f"Predicted price: €{r['pred_price']:,.0f}"
    )
    docs.append(Document(page_content=text, metadata={"listing_id": int(r["listing_id"])}))


embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")


# Embed all documents and persist the index as madrid.faiss + madrid.pkl.
vectorstore = FAISS.from_documents(docs, embedding)
vectorstore.save_local(folder_path="../vectorstore", index_name="madrid")

print("Saved LangChain FAISS vectorstore (madrid.faiss + madrid.pkl)")


Saved LangChain FAISS vectorstore (madrid.faiss + madrid.pkl)


In [42]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load FAISS index
index = faiss.read_index("../vectorstore/madrid.faiss")
ids = np.load("../vectorstore/ids.npy")

# ids.npy and madrid.faiss are written by two different cells and are joined
# purely by row position below, so refuse to continue if their sizes disagree.
if index.ntotal != len(ids):
    raise ValueError(
        f"madrid.faiss holds {index.ntotal} vectors but ids.npy has {len(ids)} ids; "
        "rebuild both from the same listings file."
    )

# Load model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")


query = "modern flat with terrace in chamartín"
q_vec = model.encode(query, convert_to_numpy=True).astype("float32")


# faiss expects a 2-D float32 array and normalises in place. Keep one named
# (1, dim) array so the array that is normalised is the one that is searched,
# rather than relying on two separate reshape() calls sharing memory.
q_mat = np.ascontiguousarray(q_vec.reshape(1, -1))
faiss.normalize_L2(q_mat)
D, I = index.search(q_mat, k=5)


# faiss pads the result with -1 when fewer than k neighbours exist; drop those
# so they are not misread as "last element" indices into `ids`.
hits = I[0][I[0] >= 0]
top_ids = ids[hits]
print("Top 5 matching listing IDs:", top_ids.tolist())


Top 5 matching listing IDs: [15712, 15404, 15021, 15469, 14339]


In [43]:
from langchain_community.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Chat model that writes the final answer; temperature 0 is passed for the LLM.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Reload the persisted "madrid" index with the same embedding model name that
# was used when the index was built.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")
# NOTE: allow_dangerous_deserialization=True unpickles madrid.pkl. Only load
# vectorstore files that this notebook (or another trusted source) produced.
vectorstore = FAISS.load_local(
    "../vectorstore",
    embedding,
    index_name="madrid",
    allow_dangerous_deserialization=True,
)
retriever = vectorstore.as_retriever()

# Retrieval-augmented QA chain; the retrieved documents are returned as well.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

question = "Why are homes in Chamartín more expensive?"
response = qa.invoke(question)
print("🤖", response["result"])


🤖 Based on the provided context, the predicted prices for homes in Chamartín, Madrid, are influenced by factors such as the square meters built (sq_mt_built), the number of bathrooms (n_bathrooms), and the neighborhood itself (neighborhood_id). The variations in prices could be due to differences in the size of the properties, the number of bathrooms, and potentially the desirability or exclusivity of the Chamartín neighborhood compared to other areas. These factors contribute to the higher prices seen in Chamartín compared to other locations.
