In [None]:
# Airbnb Listing Price Analysis Project

In [None]:
# 0) Install & import libraries
!pip install -q xgboost scikit-plot

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Apply seaborn's "whitegrid" style to the plots drawn in later cells.
sns.set(style="whitegrid")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


In [None]:
# 1) Upload dataset
# Ask for a CSV through the Colab upload helper and load the first uploaded
# file into `df`. Execution stops if nothing was uploaded.
print("Upload your Airbnb listings CSV (e.g., AB_NYC_2019.csv or equivalent)")
uploaded = files.upload()
if not uploaded:
    raise SystemExit("No file uploaded.")
fname = next(iter(uploaded))
df = pd.read_csv(fname)
print("Shape:", df.shape)
df.head()


In [None]:
# 2) Inspect & clean basic things
# Show info & missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.isnull().sum().sort_values(ascending=False).head(20))

# If price is stored as a string with $ or commas, convert it to float
if 'price' in df.columns:
    def clean_price(x):
        """Convert one raw price cell to a float.

        Missing values are returned unchanged. Strings have "$", "," and
        surrounding whitespace removed before conversion; a string that is
        empty after that stripping becomes NaN instead of raising. Any other
        value is passed through float().
        """
        if pd.isna(x):
            return x
        if isinstance(x, str):
            cleaned = x.replace("$","").replace(",","").strip()
            # float("") raises ValueError; treat a blank cell as missing.
            return float(cleaned) if cleaned else np.nan
        return float(x)
    df['price'] = df['price'].apply(clean_price)


In [None]:
# 3) Exploratory Data Analysis (EDA)

# 95th-percentile price, used as the upper axis limit on every price axis so
# the long upper tail does not dominate the figures. Computed once and reused.
price_p95 = df['price'].quantile(0.95)

# Histogram of price
plt.figure(figsize=(10,5))
sns.histplot(df['price'].dropna(), bins=60, kde=True)
plt.title("Distribution of Prices")
plt.xlim(0, price_p95)
plt.show()

# Price vs room_type
if 'room_type' in df.columns:
    plt.figure(figsize=(8,5))
    sns.boxplot(x=df['room_type'], y=df['price'])
    plt.ylim(0, price_p95)
    plt.title("Price by Room Type")
    plt.show()

# Neighbourhood vs average price (15 most expensive on average)
if 'neighbourhood' in df.columns:
    neigh_avg = df.groupby('neighbourhood')['price'].mean().sort_values(ascending=False).head(15)
    plt.figure(figsize=(10,5))
    sns.barplot(x=neigh_avg.values, y=neigh_avg.index)
    # x is passed as a bare array, so the axis would otherwise be unlabelled.
    plt.xlabel("Average price")
    plt.title("Top 15 Neighbourhoods by Avg Price")
    plt.show()

# Scatter price vs number of reviews
if {'number_of_reviews','price'}.issubset(df.columns):
    plt.figure(figsize=(8,5))
    sns.scatterplot(x='number_of_reviews', y='price', data=df, alpha=0.5)
    plt.xlim(0, df['number_of_reviews'].quantile(0.95))
    plt.ylim(0, price_p95)
    # Titled like every other figure in this cell.
    plt.title("Price vs Number of Reviews")
    plt.show()


In [None]:
# 4) Feature engineering & cleaning

# Work on a copy so the frame `df` itself is not modified by this cell.
df2 = df.copy()

# reviews_per_month: fill gaps with the column median.
if 'reviews_per_month' in df2.columns:
    rpm_median = df2['reviews_per_month'].median()
    df2['reviews_per_month'] = df2['reviews_per_month'].fillna(rpm_median)

# last_review: parse as datetime (unparseable values become NaT), keep only
# the year and month parts, then drop the original column.
if 'last_review' in df2.columns:
    last_review_dt = pd.to_datetime(df2['last_review'], errors='coerce')
    df2['last_review_year'] = last_review_dt.dt.year
    df2['last_review_month'] = last_review_dt.dt.month
    df2 = df2.drop(columns=['last_review'])

# price_per_person needs an 'accommodates' value on every row, so rows where
# it is missing are dropped first.
if 'accommodates' in df2.columns:
    df2 = df2.dropna(subset=['accommodates'])
    # A capacity of 0 is treated as 1 so the division never has a zero divisor.
    guests = df2['accommodates'].replace({0:1})
    df2['price_per_person'] = df2['price'] / guests

# Distance of each listing from the "city center", taken here as the median
# latitude/longitude over all rows of df2. This is a plain Euclidean distance
# in coordinate units, not a geodesic distance.
if {'latitude','longitude','price'}.issubset(df2.columns):
    center_lat = df2['latitude'].median()
    center_lon = df2['longitude'].median()
    lat_offset = df2['latitude'] - center_lat
    lon_offset = df2['longitude'] - center_lon
    df2['dist_to_center'] = np.sqrt(lat_offset**2 + lon_offset**2)

In [None]:
# 5) Clustering / segmentation
# Keep only the candidate columns that actually exist in df2.
cluster_candidates = ['price','number_of_reviews','availability_365','reviews_per_month','minimum_nights','dist_to_center']
features_for_cluster = [name for name in cluster_candidates if name in df2.columns]

if len(features_for_cluster) >= 2:
    # Missing values are replaced by 0, then every feature is standardised.
    Xc = df2[features_for_cluster].fillna(0)
    scaler = StandardScaler()
    Xc_scaled = scaler.fit_transform(Xc)

    # Elbow curve: inertia (SSE) for k = 2 .. max_k - 1, where max_k is the
    # smaller of 7 and the number of samples.
    max_k = min(Xc_scaled.shape[0], 7)
    k_values = range(2, max_k)
    sse = [
        KMeans(n_clusters=k, random_state=42, n_init=10).fit(Xc_scaled).inertia_
        for k in k_values
    ]
    plt.figure(figsize=(6,4))
    plt.plot(k_values, sse, '-o')
    plt.title("Elbow Method for K")
    plt.xlabel("k")
    plt.ylabel("SSE")
    plt.show()

    # Final segmentation: k = 3, lowered to (number of samples - 1) when
    # there are fewer than four samples.
    chosen_k = min(3, Xc_scaled.shape[0] -1)
    if chosen_k < 2:
        print("Cannot perform clustering with less than 2 samples after feature selection.")
    else:
        km = KMeans(n_clusters=chosen_k, random_state=42, n_init=10)
        df2['cluster'] = km.fit_predict(Xc_scaled)
        # Plot the clusters on the first two *scaled* features, i.e. in the
        # same space the clustering was computed in.
        plt.figure(figsize=(8,5))
        sns.scatterplot(x=Xc_scaled[:,0], y=Xc_scaled[:,1], hue=df2['cluster'], palette='Set1')
        plt.title(f"Clusters in first two clustering features (k={chosen_k})")
        plt.show()

In [None]:
# 6) Model for price prediction (regression)

# Candidate predictors; only those present in df2 are used.
# NOTE: 'price_per_person' is deliberately not a candidate. It is computed as
# price / accommodates, so using it to predict price leaks the target into the
# features and inflates every score. The raw 'accommodates' count is offered
# instead.
num_cands = ['minimum_nights','number_of_reviews','availability_365','dist_to_center','accommodates']
cat_cands = ['room_type','neighbourhood','neighbourhood_group','property_type']
features = [c for c in num_cands + cat_cands if c in df2.columns]

# Drop rows without price or with nulls in any selected feature
df_model = df2.dropna(subset=features + ['price'])
print("Model data shape:", df_model.shape)

X = df_model[features]
y = df_model['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: numeric scaling + categorical encoding.
# Columns are split by their actual dtype. is_numeric_dtype accepts every
# integer/float width (int32, float32, nullable Int64, ...), not only
# int64/float64, so such columns are scaled instead of being one-hot encoded.
num_features = [c for c in features if pd.api.types.is_numeric_dtype(X[c])]
cat_features = [c for c in features if c not in num_features]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    # Categories unseen during fit are encoded as all zeros at predict time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

# RandomForest baseline
rf = Pipeline([
    ('pre', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
print("RF MAE:", mean_absolute_error(y_test, pred_rf))
print("RF RMSE:", np.sqrt(mean_squared_error(y_test, pred_rf)))
print("RF R2:", r2_score(y_test, pred_rf))

In [None]:
# 7) XGBoost model & comparison

# Feature selection is repeated here so this cell does not depend on the
# variables left behind by the RandomForest cell.
# NOTE: 'price_per_person' is deliberately not a candidate. It is computed as
# price / accommodates, so using it to predict price leaks the target into the
# features and inflates every score. The raw 'accommodates' count is offered
# instead.
num_cands = ['minimum_nights','number_of_reviews','availability_365','dist_to_center','accommodates']
cat_cands = ['room_type','neighbourhood','neighbourhood_group','property_type']
features = [c for c in num_cands + cat_cands if c in df2.columns]

# Drop rows with any missing values in the selected features AND price
df_model = df2.dropna(subset=features + ['price'])

X = df_model[features]
y = df_model['price']

# Ensure there's enough data after dropping NaNs
if X.shape[0] < 2:
    print("Not enough data after dropping missing values to perform train-test split.")
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Preprocessing: numeric scaling + categorical encoding.
    # Columns are split by their actual dtype. is_numeric_dtype accepts every
    # integer/float width (int32, float32, nullable Int64, ...), not only
    # int64/float64, so such columns are scaled instead of one-hot encoded.
    num_features = [c for c in features if pd.api.types.is_numeric_dtype(X[c])]
    cat_features = [c for c in features if c not in num_features]

    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_features),
        # Categories unseen during fit are encoded as all zeros at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

    xgb_pipe = Pipeline([
        ('pre', preprocessor),
        ('model', xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42, n_jobs=1, verbosity=0))
    ])

    xgb_pipe.fit(X_train, y_train)
    pred_xgb = xgb_pipe.predict(X_test)
    print("XGB MAE:", mean_absolute_error(y_test, pred_xgb))
    print("XGB RMSE:", np.sqrt(mean_squared_error(y_test, pred_xgb)))
    print("XGB R2:", r2_score(y_test, pred_xgb))

In [None]:
# 8) Plot actual vs predicted

# End point of the y = x reference line: the largest actual or predicted value.
diag_max = max(y_test.max(), pred_xgb.max())
# Both axes are clipped at the 95th percentile of the actual test prices.
axis_cap = y_test.quantile(0.95)

plt.figure(figsize=(8,6))
plt.scatter(y_test, pred_xgb, alpha=0.5)
plt.plot([0, diag_max], [0, diag_max], 'r--')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual vs Predicted Price (XGBoost)")
plt.xlim(0, axis_cap)
plt.ylim(0, axis_cap)
plt.show()


In [None]:
# 9) Feature importance from XGB

# Rebuild the post-encoding column names in the order the ColumnTransformer
# emits them: the 'num' columns first, then the one-hot 'cat' columns.
# The encoder is read from the preprocessor step already fitted inside
# xgb_pipe, so the names come from the same transformer state the model was
# trained on and no second fit is needed.
fitted_pre = xgb_pipe.named_steps['pre']
num_names = list(num_features)
cat_names = []
if len(cat_features) > 0:
    ohe = fitted_pre.named_transformers_['cat']
    cat_names = list(ohe.get_feature_names_out(cat_features))

all_feat_names = num_names + cat_names

# Keep the 20 largest importances, highest first.
fi = pd.Series(xgb_pipe.named_steps['model'].feature_importances_, index=all_feat_names).sort_values(ascending=False).head(20)
plt.figure(figsize=(8,6))
sns.barplot(x=fi.values, y=fi.index)
plt.title("Top Features Importance (XGB)")
plt.show()

In [None]:
# 10) Save model for reuse
import joblib

# Serialize the whole pipeline (preprocessing + XGBoost model) to disk, then
# pass the file to the Colab download helper.
model_path = "airbnb_price_model.joblib"
joblib.dump(xgb_pipe, model_path)
print("Saved XGB price model.")
files.download(model_path)
