# In [1]:
# File: notebooks/02_feature_engineering.ipynb

import os, sys

# So that we can find the src module when running from the notebooks directory
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.cluster import KMeans

def _geo_cluster_labels(coords: pd.DataFrame, n_clusters: int):
    """Return one KMeans cluster label per row of *coords*.

    Rows with a missing coordinate are excluded from the fit and labelled
    ``pd.NA``; the number of clusters is capped at the number of usable rows.
    """
    has_coords = coords.notna().all(axis=1).to_numpy()
    n_valid = int(has_coords.sum())

    if n_valid == 0:
        # Nothing to cluster (empty frame or no usable coordinates).
        return pd.array([pd.NA] * len(coords), dtype="Int64")

    # KMeans rejects NaN input and n_clusters > n_samples, so fit only on
    # complete rows and never ask for more clusters than there are rows.
    km = KMeans(
        n_clusters=min(n_clusters, n_valid),
        random_state=0,
    ).fit(coords[has_coords])

    if n_valid == len(coords):
        # Common case: keep the plain integer labels exactly as KMeans
        # returns them.
        return km.labels_

    # Some rows were skipped: use a nullable integer array so the skipped
    # rows can carry a missing value instead of a fake cluster id.
    labels = pd.array([pd.NA] * len(coords), dtype="Int64")
    labels[has_coords] = km.labels_
    return labels


def derive_features(df_listings: pd.DataFrame,
                    df_calendar: pd.DataFrame,
                    df_reviews: pd.DataFrame,
                    *,
                    center: tuple = (48.137154, 11.576124),
                    n_clusters: int = 10) -> pd.DataFrame:
    """Build per-listing features from listings, calendar and review data.

    None of the input frames is modified; all derived columns are added to a
    copy of ``df_listings``, which is returned.

    Args:
        df_listings: Listings with at least the columns ``id``, ``latitude``
            and ``longitude``.
        df_calendar: Calendar rows with at least ``listing_id`` and
            ``available_flag``.
        df_reviews: Reviews with at least ``listing_id`` and ``comments``.
        center: ``(latitude, longitude)`` of the reference point used for
            ``dist_center``.
        n_clusters: Upper bound on the number of KMeans geo clusters.

    Returns:
        A new DataFrame with the columns of ``df_listings`` plus:

        * ``dist_center``: Euclidean distance between the listing's
          coordinates and ``center``, expressed in coordinate units (not
          kilometres).
        * ``avail_frac``: mean of ``available_flag`` per listing
          (presumably a 0/1 flag, which makes this the available share).
        * ``sent_mean``: mean VADER compound score of the listing's reviews.
        * ``rev_count``: number of reviews for the listing (NaN when the
          listing has no reviews at all).
        * ``geo_cluster``: KMeans cluster label of the listing's coordinates.
    """
    # Work on a copy so the caller's frame is not left half-updated.
    listings = df_listings.copy()

    # 1) Distance to the city center
    center_lat, center_lon = center
    listings["dist_center"] = np.sqrt(
        (listings["latitude"] - center_lat) ** 2
        + (listings["longitude"] - center_lon) ** 2
    )

    # 2) Availability per listing
    # NOTE: month/weekday columns used to be written onto the caller's
    # calendar here but never fed any output column; only the availability
    # fraction is derived from the calendar.
    avail = df_calendar.groupby("listing_id")["available_flag"].mean()
    listings["avail_frac"] = listings["id"].map(avail)

    # 3) Review sentiment
    # NOTE(review): SentimentIntensityAnalyzer needs the NLTK VADER lexicon
    # to be installed - confirm it is available in the target environment.
    sia = SentimentIntensityAnalyzer()
    sentiment = (
        df_reviews["comments"]
        .fillna("")
        .astype(str)   # guard against non-string cells in the comments column
        .map(lambda txt: sia.polarity_scores(txt)["compound"])
        .astype(float)  # keeps the mean well-defined for an empty review set
    )
    # Group the scores by listing without adding a scratch column to the
    # caller's reviews frame.
    rev_agg = (
        sentiment
        .groupby(df_reviews["listing_id"])
        .agg(sent_mean="mean", rev_count="count")
    )
    listings = listings.join(rev_agg, on="id")

    # 4) Geo-Clustering
    listings["geo_cluster"] = _geo_cluster_labels(
        listings[["latitude", "longitude"]], n_clusters
    )

    return listings

def main():
    """Load the input tables, derive listing features and save them as CSV.

    Paths are resolved relative to the parent of the current working
    directory, i.e. the project root when run from ``notebooks/``.
    """
    # Project root: one directory above the current working directory
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
    data_dir = os.path.join(project_root, "data")
    processed_dir = os.path.join(data_dir, "processed")

    # Read the three input tables
    listings = pd.read_csv(os.path.join(processed_dir, "listings_clean.csv"))
    calendar = pd.read_csv(
        os.path.join(processed_dir, "calendar_clean.csv"),
        parse_dates=["date"],
    )
    reviews = pd.read_csv(
        os.path.join(data_dir, "raw", "reviews.gz"),
        compression="gzip",
    )

    # Compute the per-listing feature table
    features = derive_features(listings, calendar, reviews)

    # Write the result, creating the target directory if necessary
    features_dir = os.path.join(processed_dir, "features")
    os.makedirs(features_dir, exist_ok=True)
    features.to_csv(
        os.path.join(features_dir, "listings_features.csv"),
        index=False,
    )

    print("✅ Feature Engineering complete.")

# Run the pipeline when this code is executed as a notebook cell or script
# (where __name__ is "__main__"), but not when it is merely imported.
if __name__ == "__main__":
    main()



# Output: ✅ Feature Engineering complete.
