In [69]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

This notebook uses the [New York City Airbnb Open Data](https://www.kaggle.com/datasets/dgomonov/new-york-city-airbnb-open-data).

In [16]:
# Read the listings CSV from the Kaggle input directory into a DataFrame.
listings_csv = "/kaggle/input/new-york-city-airbnb-open-data/AB_NYC_2019.csv"
df = pd.read_csv(listings_csv)

In [17]:
# Preview the first rows of the loaded table.
df.head()

In [18]:
# Print the column dtypes and non-null counts to spot missing data.
df.info()

In [19]:
# Summary statistics (count, mean, std, quantiles) for the numeric columns.
df.describe()

## EDA

In [20]:
# Distribution of the raw price column, 40 bins.
sns.histplot(data=df, x="price", bins=40);

In [21]:
# Same histogram restricted to rows with price below 800, to make the bulk of
# the distribution readable.
listings_below_800 = df[df["price"] < 800]
sns.histplot(data=listings_below_800, x='price');

In [22]:
# Distribution of log(1 + price), the transform used as the model target below.
log_price = np.log1p(df["price"])
sns.histplot(log_price);

In [42]:
# Distribution of the minimum_nights column, 40 bins.
sns.histplot(data=df, x="minimum_nights", bins=40);

In [23]:
# Number of listings per room type (horizontal bars).
sns.countplot(data=df, y="room_type");

In [24]:
# Number of listings per neighbourhood group (horizontal bars).
sns.countplot(data=df, y="neighbourhood_group");

In [25]:
# How many distinct neighbourhood values appear in the data.
n_neighbourhoods = df["neighbourhood"].nunique()
print("Number of different neighbourhoods: ", n_neighbourhoods)

In [78]:
# Listings per neighbourhood, then summary statistics of those counts.
listings_per_neighbourhood = df["neighbourhood"].value_counts()
neigh = listings_per_neighbourhood.to_frame()
neigh.describe()

In [26]:
# Distribution of the availability_365 column, 40 bins.
sns.histplot(data=df, x="availability_365", bins=40);

In [27]:
# Numeric columns (including the price target) to include in the correlation matrix.
numeric_cols_corr = [
    "price",
    "minimum_nights",
    "number_of_reviews",
    "reviews_per_month",
    "calculated_host_listings_count",
    "availability_365",
]

In [28]:
# Pairwise Pearson correlation between the selected numeric columns.
corr = df[numeric_cols_corr].corr()

# Correlation coefficients lie in [-1, 1]. With vmin=0 and a sequential palette
# every negative correlation was drawn in the same colour as zero, hiding
# inverse relationships. Use the full range with a diverging palette centred
# on 0 so the sign and the strength are both visible.
sns.heatmap(corr, vmin=-1, vmax=1, center=0, linewidths=2, square=True, cmap='RdBu_r');

In [43]:
# Inspect listings whose minimum_nights exceeds 1000.
df.loc[df["minimum_nights"] > 1000].head()

In [44]:
# Inspect listings whose minimum_nights exceeds 365.
df.loc[df["minimum_nights"] > 365].head()

## Baseline model

For our first model, we choose a few numeric columns.

In [32]:
# Numeric feature columns used by the models below.
numeric_cols = [
    "minimum_nights",
    "number_of_reviews",
    "calculated_host_listings_count",
    "availability_365",
]

In [33]:
# Count missing entries in each selected numeric feature column.
df.loc[:, numeric_cols].isna().sum()

In [36]:
# Feature matrix: an independent copy of the numeric columns.
X = df.loc[:, numeric_cols].copy()
# Target: log(1 + price).
y = np.log1p(df["price"])

In [45]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [46]:
# Baseline: ordinary least squares on the numeric features only.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [47]:
# Predict log1p(price) for the validation rows.
y_predict = lr.predict(X_valid)

# RMSE on the log1p(price) scale. `mean_squared_error(..., squared=False)` was
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the square root of
# the MSE explicitly; this works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_valid, y_predict))
print(f"RMSE: {rmse}")

## Adding categoric columns

Let's try adding more columns and see how the model performs.

In [50]:
# Categorical feature columns to add on top of the numeric ones.
categoric_cols = [
    "neighbourhood_group",
    "room_type",
]

In [58]:
# Feature matrix: numeric columns followed by the categorical ones.
feature_cols = numeric_cols + categoric_cols
X = df[feature_cols]
# Target: log(1 + price).
y = np.log1p(df["price"])

In [59]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [70]:
# One transformer per feature group: one-hot encoding for the categorical
# columns, standardisation for the numeric ones.
cat_transformer, standsc_transformer = OneHotEncoder(), StandardScaler()

In [71]:
# Route each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_transformer, categoric_cols),
    ('num', standsc_transformer, numeric_cols),
])

# Preprocessing followed by a linear regression, fitted and applied as one estimator.
lr = LinearRegression()
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lr', lr),
])

In [72]:
# Fit the preprocessing steps and the final estimator on the training split.
pipeline.fit(X_train, y_train)

In [73]:
# Predict log1p(price) for the validation rows with the fitted pipeline.
y_predict = pipeline.predict(X_valid)

# RMSE on the log1p(price) scale. `mean_squared_error(..., squared=False)` was
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the square root of
# the MSE explicitly; this works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_valid, y_predict))
print(f"RMSE: {rmse}")

## Try XGBRegressor

In [80]:
# Numeric + categorical features with a log1p(price) target, now fitted with a
# gradient-boosted tree regressor instead of a linear model.
X = df[numeric_cols + categoric_cols]
y = np.log1p(df.price)

# 80/20 split with a fixed seed so the validation rows are reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8,
                                                      test_size=0.2, random_state=42)


# One-hot encode the categorical columns and standardise the numeric ones.
cat_transformer = OneHotEncoder()

standsc_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, categoric_cols),
        ('num', standsc_transformer, numeric_cols)
    ]
)

# XGBoost regressor with its default hyperparameters.
xgb = XGBRegressor()

pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('xgb', xgb)
    ]
)

pipeline.fit(X_train, y_train)

y_predict = pipeline.predict(X_valid)

# RMSE on the log1p(price) scale. `mean_squared_error(..., squared=False)` was
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the square root of
# the MSE explicitly; this works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_valid, y_predict))
print(f"RMSE: {rmse}")

## Feature engineering