In [None]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression


# --- Configuration ---------------------------------------------------------
# The dataset location can be overridden with the AIRBNB_CSV_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset (or empty) the original location is used.
import os

csv_path = os.environ.get("AIRBNB_CSV_PATH") or (
    "/Users/tavishikaushik/Downloads/la-airbnb-cleaned-propertyvaluemerge.csv"
)
PRICE_QUANTILE_CAP = 0.99      # listings priced above this quantile are dropped
TOP_N_NEIGHBOURHOODS = 15      # neighbourhoods kept as their own category
TEST_SIZE = 0.2                # fraction of rows held out for testing
SEED = 42                      # random_state for the train/test split

# --- Load and trim ---------------------------------------------------------
df = pd.read_csv(csv_path)

# Drop the most expensive listings (above the configured price quantile).
# NOTE(review): the cap is computed on the full dataset, before the split, so
# held-out rows influence it — fine for exploration, confirm for reporting.
q99 = df["price"].quantile(PRICE_QUANTILE_CAP)
df = df[df["price"] <= q99]

# --- Feature selection -----------------------------------------------------
features = [
    "accommodates", "bedrooms", "beds", "bathrooms",
    "minimum_nights", "number_of_reviews",
    "review_scores_rating", "amenities_count",
    "host_is_superhost", "room_type",
    "property_type", "neighbourhood_cleansed"
]

# Keep only rows that have the target and every feature present.
df_model = df[["price"] + features].dropna().copy()

y = df_model["price"].values
X = df_model[features].copy()

# --- Categorical handling --------------------------------------------------
# Keep the most frequent neighbourhoods as-is and pool the rest under "OTHER"
# to limit the number of dummy columns.
top_neigh = X["neighbourhood_cleansed"].value_counts().index[:TOP_N_NEIGHBOURHOODS]
X["neighbourhood_cleansed"] = X["neighbourhood_cleansed"].where(
    X["neighbourhood_cleansed"].isin(top_neigh),
    other="OTHER"
)

# One-hot encode; drop_first=True removes one level per categorical column.
cat_cols = ["host_is_superhost", "room_type", "property_type", "neighbourhood_cleansed"]
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# --- Train / test split ----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# Helper: report regression metrics for the train and test splits
def print_metrics(model_name, y_true_train, y_pred_train, y_true_test, y_pred_test):
    """Print R², MSE, MAE and RMSE for the train split, then for the test split.

    Parameters
    ----------
    model_name : label printed in both section headers.
    y_true_train, y_pred_train : observed and predicted targets, train split.
    y_true_test, y_pred_test : observed and predicted targets, test split.

    Returns
    -------
    None; the report is written to stdout.
    """
    rule = "=============================="

    def _emit_scores(actual, predicted):
        # RMSE is the square root of the MSE, so the MSE is computed once.
        print("R²:", r2_score(actual, predicted))
        mse = mean_squared_error(actual, predicted)
        print("MSE:", mse)
        print("MAE:", mean_absolute_error(actual, predicted))
        print("RMSE:", np.sqrt(mse))

    # The TRAIN header is framed by a rule above and below.
    print("\n" + rule)
    print(f"{model_name} - TRAIN")
    print(rule)
    _emit_scores(y_true_train, y_pred_train)

    # The TEST header only has a rule below it.
    print(f"\n{model_name} - TEST")
    print(rule)
    _emit_scores(y_true_test, y_pred_test)


# Fit an ordinary least-squares model on the training split.
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)

# Predictions on both splits, used to compare train and test error.
y_train_pred_lin = lin_model.predict(X_train)
y_test_pred_lin = lin_model.predict(X_test)


print_metrics("Linear Regression", y_train, y_train_pred_lin, y_test, y_test_pred_lin)

# Label each coefficient with the column of the matrix the model was fitted on.
lin_coef_series = pd.Series(lin_model.coef_, index=X_train.columns)

# Filter by sign before taking the top three, so each listing can only
# contain coefficients that match its heading (fewer than three rows are
# printed when fewer than three coefficients have that sign).
lin_coef_desc = lin_coef_series.sort_values(ascending=False)
lin_coef_asc = lin_coef_series.sort_values()

print("\n=== Linear Regression: TOP 3 POSITIVE COEFFICIENTS ===")
print(lin_coef_desc[lin_coef_desc > 0].head(3))

print("\n=== Linear Regression: TOP 3 NEGATIVE COEFFICIENTS ===")
print(lin_coef_asc[lin_coef_asc < 0].head(3))



Linear Regression - TRAIN
R²: 0.48553093331818653
MSE: 29733.6011221656
MAE: 94.81700210796515
RMSE: 172.434338581866

Linear Regression - TEST
R²: 0.4735718050443193
MSE: 31235.074029866206
MAE: 94.90575705547086
RMSE: 176.7344732356034

=== Linear Regression: TOP 3 POSITIVE COEFFICIENTS ===
property_type_Private room in resort    788.308761
property_type_Entire resort             299.019491
property_type_Castle                    138.580030
dtype: float64

=== Linear Regression: TOP 3 NEGATIVE COEFFICIENTS ===
property_type_Shared room                    -305.264755
property_type_Hostel                         -232.049005
property_type_Private room in nature lodge   -216.060742
dtype: float64


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import Lasso

# --- Configuration ---------------------------------------------------------
# The dataset location can be overridden with the AIRBNB_CSV_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset (or empty) the original location is used.
import os

csv_path = os.environ.get("AIRBNB_CSV_PATH") or (
    "/Users/tavishikaushik/Downloads/la-airbnb-cleaned-propertyvaluemerge.csv"
)
PRICE_QUANTILE_CAP = 0.99      # listings priced above this quantile are dropped
TOP_N_NEIGHBOURHOODS = 15      # neighbourhoods kept as their own category
TEST_SIZE = 0.2                # fraction of rows held out for testing
SEED = 42                      # random_state for the train/test split

# --- Load and trim ---------------------------------------------------------
df = pd.read_csv(csv_path)

# Drop the most expensive listings (above the configured price quantile).
# NOTE(review): the cap is computed on the full dataset, before the split, so
# held-out rows influence it — fine for exploration, confirm for reporting.
q99 = df["price"].quantile(PRICE_QUANTILE_CAP)
df = df[df["price"] <= q99]

# --- Feature selection -----------------------------------------------------
features = [
    "accommodates", "bedrooms", "beds", "bathrooms",
    "minimum_nights", "number_of_reviews",
    "review_scores_rating", "amenities_count",
    "host_is_superhost", "room_type",
    "property_type", "neighbourhood_cleansed"
]

# Keep only rows that have the target and every feature present.
df_model = df[["price"] + features].dropna().copy()

y = df_model["price"].values
X = df_model[features].copy()

# --- Categorical handling --------------------------------------------------
# Keep the most frequent neighbourhoods as-is and pool the rest under "OTHER"
# to limit the number of dummy columns.
top_neigh = X["neighbourhood_cleansed"].value_counts().index[:TOP_N_NEIGHBOURHOODS]
X["neighbourhood_cleansed"] = X["neighbourhood_cleansed"].where(
    X["neighbourhood_cleansed"].isin(top_neigh),
    other="OTHER"
)

# One-hot encode; drop_first=True removes one level per categorical column.
cat_cols = ["host_is_superhost", "room_type", "property_type", "neighbourhood_cleansed"]
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# --- Train / test split ----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

def print_metrics(model_name, y_true_train, y_pred_train, y_true_test, y_pred_test):
    """Print R², MSE, MAE and RMSE for the train split, then for the test split.

    Parameters
    ----------
    model_name : label printed in both section headers.
    y_true_train, y_pred_train : observed and predicted targets, train split.
    y_true_test, y_pred_test : observed and predicted targets, test split.

    Returns
    -------
    None; the report is written to stdout.
    """
    rule = "=============================="

    def _emit_scores(actual, predicted):
        # RMSE is the square root of the MSE, so the MSE is computed once.
        print("R²:", r2_score(actual, predicted))
        mse = mean_squared_error(actual, predicted)
        print("MSE:", mse)
        print("MAE:", mean_absolute_error(actual, predicted))
        print("RMSE:", np.sqrt(mse))

    # The TRAIN header is framed by a rule above and below.
    print("\n" + rule)
    print(f"{model_name} - TRAIN")
    print(rule)
    _emit_scores(y_true_train, y_pred_train)

    # The TEST header only has a rule below it.
    print(f"\n{model_name} - TEST")
    print(rule)
    _emit_scores(y_true_test, y_pred_test)


# Fit an L1-regularised linear model on the training split.
# NOTE(review): the features are not standardised before fitting, so the L1
# penalty acts on coefficients of columns with very different raw scales —
# consider scaling before comparing or interpreting coefficient sizes.
lasso_model = Lasso(alpha=1.0, max_iter=10000)
lasso_model.fit(X_train, y_train)

# Predictions on both splits, used to compare train and test error.
y_train_pred_lasso = lasso_model.predict(X_train)
y_test_pred_lasso = lasso_model.predict(X_test)


print_metrics("Lasso Regression", y_train, y_train_pred_lasso, y_test, y_test_pred_lasso)


# Label each coefficient with the column of the matrix the model was fitted on.
lasso_coef_series = pd.Series(lasso_model.coef_, index=X_train.columns)

# Features whose coefficient was shrunk exactly to zero.
zero_coef = lasso_coef_series[lasso_coef_series == 0]

print("\n=== LASSO: EXAMPLES OF FEATURES ELIMINATED (coef = 0) ===")

if len(zero_coef) >= 3:
    # A fixed seed keeps the displayed examples the same on every run.
    print(zero_coef.sample(3, random_state=42))
else:
    print(zero_coef)

# Filter by sign before taking the top three: many coefficients are exactly
# zero here, and zeros must not be listed as "positive" or "negative".
lasso_coef_desc = lasso_coef_series.sort_values(ascending=False)
lasso_coef_asc = lasso_coef_series.sort_values()

print("\n=== LASSO: TOP 3 POSITIVE COEFFICIENTS ===")
print(lasso_coef_desc[lasso_coef_desc > 0].head(3))

print("\n=== LASSO: TOP 3 NEGATIVE COEFFICIENTS ===")
print(lasso_coef_asc[lasso_coef_asc < 0].head(3))



Lasso Regression - TRAIN
R²: 0.47445940365590866
MSE: 30373.477196569107
MAE: 94.6075473274132
RMSE: 174.27988178952012

Lasso Regression - TEST
R²: 0.46086070218531006
MSE: 31989.27420874497
MAE: 94.71761943537334
RMSE: 178.85545618947432

=== LASSO: EXAMPLES OF FEATURES ELIMINATED (coef = 0) ===
property_type_Private room in guesthouse           0.0
property_type_Private room                         0.0
property_type_Private room in bed and breakfast    0.0
dtype: float64

=== LASSO: TOP 3 POSITIVE COEFFICIENTS ===
bathrooms                        122.159138
property_type_Room in hotel       47.295022
neighbourhood_cleansed_Venice     37.464537
dtype: float64

=== LASSO: TOP 3 NEGATIVE COEFFICIENTS ===
room_type_Private room               -26.631314
property_type_Private room in home   -18.963404
host_is_superhost_t                  -13.384608
dtype: float64
