In [1]:
# ================================================================
# RentVisionNYC â€” Modeling + Visualization Notebook
# Works when this notebook is inside the Notebooks/ folder.
# ================================================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

# ------------------------------------------------
# Plot style: modern academic blue theme
# ------------------------------------------------
plt.style.use("seaborn-v0_8-whitegrid")

PRIMARY = "#1f77b4"   # deep professional blue
ACCENT  = "#2ca02c"   # green accent
GREY    = "#4d4d4d"   # dark grey for text

plt.rcParams.update({
    "figure.figsize": (8, 6),
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "legend.fontsize": 10,
    "axes.edgecolor": "#cccccc",
    "axes.labelcolor": GREY,
    "text.color": GREY
})

# ------------------------------------------------
# Paths (relative to Notebooks/)
# ------------------------------------------------
ROOT = ".."
DATA = os.path.join(ROOT, "Airbnb_Dataset", "processeddata", "nyc_airbnb_clean_with_rent.csv")
FIG_DIR = os.path.join(ROOT, "Reports", "figures")
OUT_METRICS = os.path.join(ROOT, "Reports", "model_metrics.csv")
os.makedirs(FIG_DIR, exist_ok=True)

print("Loading:", DATA)
df = pd.read_csv(DATA, low_memory=False)

# ------------------------------------------------
# Ensure numeric for key fields
# ------------------------------------------------
for c in [
    "price", "service_fee", "minimum_nights", "number_of_reviews",
    "price_per_night", "listing_density"
]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# ------------------------------------------------
# One-hot room_type (dummy vars)
# ------------------------------------------------
room = pd.get_dummies(df["room_type"], prefix="room", dtype=float)
df = pd.concat([df, room], axis=1)

room_entire_col = "room_Entire home/apt"
room_private_col = "room_Private room"
room_shared_col = "room_Shared room"
room_hotel_col  = "room_Hotel room"

# ------------------------------------------------
# Neighborhood-level aggregation
# ------------------------------------------------
grp = df.groupby(["neighbourhood_group", "neighbourhood"], as_index=False)

feat = grp.agg(
    n_listings=("name", "count"),
    avg_price=("price", "mean"),
    avg_min_nights=("minimum_nights", "mean"),
    avg_reviews=("number_of_reviews", "mean"),
    avg_ppn=("price_per_night", "mean"),
    avg_service_fee=("service_fee", "mean"),
    density=("listing_density", "mean"),
)

# Add room shares
feat["room_entire"]  = grp[room_entire_col].mean().get(room_entire_col, pd.Series(0.0))
feat["room_private"] = grp[room_private_col].mean().get(room_private_col, pd.Series(0.0))
feat["room_shared"]  = grp[room_shared_col].mean().get(room_shared_col, pd.Series(0.0))
feat["room_hotel"]   = grp[room_hotel_col].mean().get(room_hotel_col,  pd.Series(0.0))

# Target (median_rent) at neighborhood level
rent = grp["median_rent"].first().rename(columns={"median_rent": "median_rent"})
data = pd.merge(feat, rent, on=["neighbourhood_group", "neighbourhood"], how="left")
data = data.dropna(subset=["median_rent"]).reset_index(drop=True)

print("Neighborhood rows:", len(data))
print("Columns:", list(data.columns))

# ================================================================
# VISUALIZATIONS (presentation-ready)
# ================================================================

# --- Scatter: Listings vs Rent ---
fig, ax = plt.subplots()
ax.scatter(
    data["n_listings"],
    data["median_rent"],
    color=PRIMARY,
    alpha=0.7,
    edgecolor="white",
    linewidth=0.5,
    s=50,
)
ax.set_xlabel("Airbnb Listings per Neighborhood")
ax.set_ylabel("Median Rent (USD)")
ax.set_title("Listings vs. Median Rent (Neighborhood Level)")
fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, "scatter_listings_vs_rent.png"), dpi=300)
plt.close(fig)

# --- Bar: Average Rent by Borough ---
fig, ax = plt.subplots()
borough_means = data.groupby("neighbourhood_group")["median_rent"].mean().sort_values()

ax.bar(
    borough_means.index,
    borough_means.values,
    color=PRIMARY,
    edgecolor="white",
)
ax.set_ylabel("Average Median Rent (USD)")
ax.set_xlabel("Borough")
ax.set_title("Average Median Rent by Borough")
ax.tick_params(axis="x", rotation=20)
fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, "bar_rent_by_borough.png"), dpi=300)
plt.close(fig)

# ================================================================
# MODELING
# ================================================================

X_cols = [
    "n_listings", "avg_price", "avg_service_fee", "avg_min_nights",
    "avg_reviews", "avg_ppn", "density",
    "room_entire", "room_private", "room_shared", "room_hotel"
]

X = data[X_cols].replace([np.inf, -np.inf], np.nan).fillna(0).astype(float)
y = data["median_rent"].astype(float).values

finite_mask = np.isfinite(y) & np.isfinite(X).all(axis=1)
X = X[finite_mask]
y = y[finite_mask]

Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, random_state=42)

def rmse(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

def evaluate(name, model):
    """Fit model, print metrics, and return dict + predictions on test."""
    model.fit(Xtr, ytr)
    p_tr = model.predict(Xtr)
    p_te = model.predict(Xte)
    out = {
        "model": name,
        "R2_train": r2_score(ytr, p_tr),
        "R2_test":  r2_score(yte, p_te),
        "RMSE_train": rmse(ytr, p_tr),
        "RMSE_test":  rmse(yte, p_te),
    }
    print(out)
    return out, p_te

all_results = []

# -----------------------
# 1) Error-based models
# -----------------------
lin = make_pipeline(StandardScaler(), LinearRegression())
ridge = make_pipeline(StandardScaler(), Ridge(alpha=10.0))
rf_base = RandomForestRegressor(n_estimators=300, random_state=42)

res_lin,   _     = evaluate("Linear+Scale", lin)
res_ridge, _     = evaluate("Ridge(10)+Scale", ridge)
res_rf_base, _   = evaluate("RandomForest(300)", rf_base)

all_results.extend([res_lin, res_ridge, res_rf_base])

# -----------------------
# 2) Similarity-based: KNN
# -----------------------
knn = KNeighborsRegressor(n_neighbors=5)
res_knn, knn_pred = evaluate("KNN (k=5)", knn)
all_results.append(res_knn)

# Plot KNN actual vs predicted
fig, ax = plt.subplots()
ax.scatter(
    yte,
    knn_pred,
    color=PRIMARY,
    alpha=0.7,
    edgecolor="white",
    linewidth=0.5,
    s=50,
)
ax.plot(
    [yte.min(), yte.max()],
    [yte.min(), yte.max()],
    color=ACCENT,
    linestyle="--",
    linewidth=1.2,
    label="Perfect prediction",
)
ax.set_xlabel("Actual Rent (USD)")
ax.set_ylabel("Predicted Rent (USD)")
ax.set_title("KNN (k=5): Actual vs Predicted Median Rent")
ax.legend()
fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, "knn_actual_vs_pred.png"), dpi=300)
plt.close(fig)

# -----------------------
# 3) Probability-based: Gaussian Naive Bayes
# (Not ideal for regression, but included to satisfy category requirement)
# -----------------------
nb = GaussianNB()
res_nb, nb_pred = evaluate("Gaussian Naive Bayes", nb)
all_results.append(res_nb)

fig, ax = plt.subplots()
ax.scatter(
    yte,
    nb_pred,
    color=PRIMARY,
    alpha=0.7,
    edgecolor="white",
    linewidth=0.5,
    s=50,
)
ax.plot(
    [yte.min(), yte.max()],
    [yte.min(), yte.max()],
    color=ACCENT,
    linestyle="--",
    linewidth=1.2,
    label="Perfect prediction",
)
ax.set_xlabel("Actual Rent (USD)")
ax.set_ylabel("Predicted Rent (USD)")
ax.set_title("Naive Bayes: Actual vs Predicted Median Rent")
ax.legend()
fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, "nb_actual_vs_pred.png"), dpi=300)
plt.close(fig)

# ================================================================
# HYPERPARAMETER TUNING (GridSearchCV)
# ================================================================

# --- Tune Random Forest ---
rf_param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
}
rf_grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=rf_param_grid,
    cv=5,
    n_jobs=-1,
    scoring="r2",
)
rf_grid.fit(Xtr, ytr)
rf_best = rf_grid.best_estimator_
print("Best RF params:", rf_grid.best_params_)

res_rf_best, _ = evaluate("RandomForest (tuned)", rf_best)
all_results.append(res_rf_best)

# --- Tune KNN (k) ---
knn_param_grid = {"n_neighbors": [3, 5, 7, 9, 11]}
knn_grid = GridSearchCV(
    KNeighborsRegressor(),
    param_grid=knn_param_grid,
    cv=5,
    n_jobs=-1,
    scoring="r2",
)
knn_grid.fit(Xtr, ytr)
knn_best = knn_grid.best_estimator_
print("Best KNN params:", knn_grid.best_params_)

res_knn_best, _ = evaluate("KNN (tuned)", knn_best)
all_results.append(res_knn_best)

# ================================================================
# SAVE METRICS + FEATURE IMPORTANCE
# ================================================================
metrics = pd.DataFrame(all_results).sort_values("R2_test", ascending=False)
metrics.to_csv(OUT_METRICS, index=False)
print("\nSaved metrics to:", OUT_METRICS)
display(metrics)

# --- Feature importance for best RF ---
try:
    imp = pd.Series(rf_best.feature_importances_, index=X_cols).sort_values(ascending=False)

    fig, ax = plt.subplots()
    ax.bar(
        imp.index,
        imp.values,
        color=PRIMARY,
        edgecolor="white",
    )
    ax.set_ylabel("Importance (Gini)")
    ax.set_xlabel("Feature")
    ax.set_title("Random Forest Feature Importance")
    ax.tick_params(axis="x", rotation=30)
    fig.tight_layout()
    out_path = os.path.join(FIG_DIR, "rf_feature_importance.png")
    fig.savefig(out_path, dpi=300)
    plt.close(fig)
    print("Saved RF feature importance to:", out_path)
    display(imp)
except Exception as e:
    print("Feature importance not available:", e)


Loading: ../Airbnb_Dataset/processeddata/nyc_airbnb_clean_with_rent.csv
Neighborhood rows: 224
Columns: ['neighbourhood_group', 'neighbourhood', 'n_listings', 'avg_price', 'avg_min_nights', 'avg_reviews', 'avg_ppn', 'avg_service_fee', 'density', 'room_entire', 'room_private', 'room_shared', 'room_hotel', 'median_rent']
{'model': 'Linear+Scale', 'R2_train': 0.29517332173903454, 'R2_test': 0.226822505108059, 'RMSE_train': 514.3284542477669, 'RMSE_test': 641.7204666193688}
{'model': 'Ridge(10)+Scale', 'R2_train': 0.29471332880591206, 'R2_test': 0.22323371477251264, 'RMSE_train': 514.4962606553335, 'RMSE_test': 643.2080512939648}
{'model': 'RandomForest(300)', 'R2_train': 0.9346689539719462, 'R2_test': 0.4445187518574343, 'RMSE_train': 156.5882571431596, 'RMSE_test': 543.9274664784657}
{'model': 'KNN (k=5)', 'R2_train': 0.6010678790441955, 'R2_test': 0.2547738923621973, 'RMSE_train': 386.9446865847759, 'RMSE_test': 630.0141721761956}
{'model': 'Gaussian Naive Bayes', 'R2_train': 0.47358056

Unnamed: 0,model,R2_train,R2_test,RMSE_train,RMSE_test
5,RandomForest (tuned),0.901427,0.448704,192.344192,541.874586
2,RandomForest(300),0.934669,0.444519,156.588257,543.927466
6,KNN (tuned),0.539155,0.280125,415.888539,619.205429
3,KNN (k=5),0.601068,0.254774,386.944687,630.014172
0,Linear+Scale,0.295173,0.226823,514.328454,641.720467
1,Ridge(10)+Scale,0.294713,0.223234,514.496261,643.208051
4,Gaussian Naive Bayes,0.473581,0.155446,444.493425,670.687281


Saved RF feature importance to: ../Reports/figures/rf_feature_importance.png


n_listings         0.278922
density            0.254290
avg_min_nights     0.101838
room_private       0.095674
avg_ppn            0.080419
room_entire        0.076676
avg_reviews        0.050301
avg_price          0.041275
room_shared        0.012706
room_hotel         0.007899
avg_service_fee    0.000000
dtype: float64