# Project Code

In [95]:
import pandas as pd
import numpy as np
import xgboost as xgb
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
pd.set_option('display.max_columns', None)

### Data Preprocessing - Krishna

In [64]:
# Load Data
data = pd.read_csv('Airbnb_Open_Data.csv',low_memory=False)
data.head()

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,country code,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,US,False,strict,Private room,2020.0,$966,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,US,False,moderate,Entire home/apt,2007.0,$142,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,US,True,flexible,Private room,2005.0,$620,$124,3.0,0.0,,,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",
3,1002755,,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,US,True,moderate,Entire home/apt,2005.0,$368,$74,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,US,False,moderate,Entire home/apt,2009.0,$204,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",


In [82]:
# Standardize Column Names
data.columns = [col.lower().replace(' ', '_') if len(col.split()) >= 2 else col.lower() for col in data.columns]

# Drop Irrelevant Columns
irrelevant_columns = ['id', 'name', 'host_id', 'host_name', 'license', 'house_rules', 
                      'country', 'country_code', 'lat', 'long', 'service_fee']
data_cleaned = data.drop(columns=irrelevant_columns)

# Clean Price and Service Fee Data
data_cleaned['price'] = data_cleaned['price'].str.replace('[^\d.]', '', regex=True).astype(float)

# Store Price seperately
response = data_cleaned['price']
data_cleaned = data_cleaned.drop(['price'], axis=1)
response = response.interpolate(method='linear')

# Derive days_since_last_review from last_review 
data_cleaned['last_review'] = pd.to_datetime(data_cleaned['last_review'], errors='coerce')
reference_date = datetime.now()
data_cleaned['days_since_last_review'] = (reference_date - data_cleaned['last_review']).dt.days
data_cleaned['days_since_last_review'].fillna(9999, inplace=True)
data_cleaned.drop(columns=['last_review'], inplace=True)

# Derive years_since_construction from construction_year
current_year = datetime.now().year
data_cleaned['years_since_construction'] = current_year - data_cleaned['construction_year']
data_cleaned.drop(columns=['construction_year'], inplace=True)

# Clean neighbourhood_group data
correct_mapping = {'brookln': 'Brooklyn','manhatan': 'Manhattan'}
data_cleaned['neighbourhood_group'] = data_cleaned['neighbourhood_group'].replace(correct_mapping)

# Impute Numerical Missing Data Using Linear Interpolation
numeric_columns = data_cleaned.select_dtypes(include=['float64']).columns
data_cleaned[numeric_columns] = data_cleaned[numeric_columns].apply(lambda col: col.interpolate(method='linear'))

# # Scale Numeric Data
# scaler = StandardScaler()
# data_cleaned[numeric_columns] = scaler.fit_transform(data_cleaned[numeric_columns])
# data_cleaned[numeric_columns] = scaler.transform(data_cleaned[numeric_columns])

# Impute Categorical Missing Data Using Mode Imputation
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns
data_cleaned[categorical_columns] = data_cleaned[categorical_columns].astype('category')
data_cleaned[categorical_columns] = data_cleaned[categorical_columns].apply(lambda col: col.fillna(col.mode()[0]))

data_cleaned.head()

Unnamed: 0,host_identity_verified,neighbourhood_group,neighbourhood,instant_bookable,cancellation_policy,room_type,minimum_nights,number_of_reviews,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,days_since_last_review,years_since_construction
0,unconfirmed,Brooklyn,Kensington,False,strict,Private room,10.0,9.0,0.21,4.0,6.0,286.0,1129.0,4.0
1,verified,Manhattan,Midtown,False,moderate,Entire home/apt,30.0,45.0,0.38,4.0,2.0,228.0,915.0,17.0
2,unconfirmed,Manhattan,Harlem,True,flexible,Private room,3.0,0.0,2.51,5.0,1.0,352.0,9999.0,19.0
3,unconfirmed,Brooklyn,Clinton Hill,True,moderate,Entire home/apt,30.0,270.0,4.64,4.0,1.0,322.0,1966.0,19.0
4,verified,Manhattan,East Harlem,False,moderate,Entire home/apt,10.0,9.0,0.1,3.0,1.0,289.0,2194.0,15.0


In [83]:
data_cleaned.dtypes

host_identity_verified            category
neighbourhood_group               category
neighbourhood                     category
instant_bookable                  category
cancellation_policy               category
room_type                         category
minimum_nights                     float64
number_of_reviews                  float64
reviews_per_month                  float64
review_rate_number                 float64
calculated_host_listings_count     float64
availability_365                   float64
days_since_last_review             float64
years_since_construction           float64
dtype: object

##### Notes for Harshita

- data_cleaned: Data with all features without encoding, and response (price). Can be used for EDA and Outlier Detection
- data_encoded: OneHotEncoded data. Not meaningful for outlier detection, EDA, or feature selection. Purely for model inputs. This step needs to be done after EDA, outlier detection, and feature selection is completed.
- I also removed useless features such as 'house_rules', 'country', 'country_code', 'lat', 'long' as country and country code for the entire dataset were United States. I removed lat and long as they are raw coordinates that have no significance without context. I removed house rules as it is pure paragraphical text data which we cannot process for a forecasting task. I removed service fee as it is already included in the price (100% correlation to response).

### EDA - Harshita

### Outlier Detection - Harshita

### Train Test Split - Krishna

In [84]:
# OneHotEncoding for Categorical Variables for Model Compatibility
data_encoded = pd.get_dummies(data_cleaned, drop_first=True)

# Store Features in X
X = data_encoded

# Store Response Variables in y
y = response

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Feature Selection - Krishna

In [85]:
# Mutual Information Initialization
mi_scores = mutual_info_regression(X_train, y_train)
mi_scores_series = pd.Series(mi_scores, index=X_train.columns)

# Thresholds to test
thresholds = [0.001, 0.005, 0.01]

# Initialize variables to track the best thresholds and scores
best_mi_t = 0
best_mi_score = 0
best_xgb_t = 0
best_xgb_score = 0
best_features_mi = []
best_features_xgb = []

# Loop through thresholds for both MI and XGBoost feature selection
for t in thresholds:
    print(f'Threshold: {t}')
    print('-' * 30)
    
    # Mutual Information Feature Selection
    mi_selected_features = mi_scores_series[mi_scores_series > t].index
    X_train_mi = X_train[mi_selected_features]
    print(f"MI Features Selected: {len(mi_selected_features)}")

    # Evaluate using cross-validation after MI selection
    mi_scores = cross_val_score(XGBRegressor(random_state=42), X_train_mi, y_train, cv=5, scoring='r2')
    mi_mean_score = mi_scores.mean()

    # Update best MI threshold and score and save features
    if mi_mean_score > best_mi_score:
        best_mi_score = mi_mean_score
        best_mi_t = t
        best_features_mi = mi_selected_features.tolist()

    print(f"MI Mean CV Score: {mi_mean_score:.4f}")

    # XGBoost Feature Importance Refinement
    xgb_model = XGBRegressor(random_state=42)
    xgb_model.fit(X_train_mi, y_train)
    xgb_importances = pd.Series(xgb_model.feature_importances_, index=X_train_mi.columns)

    # Apply XGBoost threshold to further refine features
    xgb_selected_features = xgb_importances[xgb_importances > t].index
    X_train_xgb = X_train_mi[xgb_selected_features]
    print(f"XGBoost Features Selected: {len(xgb_selected_features)}")

    # Evaluate using cross-validation after XGBoost refinement
    xgb_scores = cross_val_score(XGBRegressor(random_state=42), X_train_xgb, y_train, cv=5, scoring='r2')
    xgb_mean_score = xgb_scores.mean()

    # Update best XGBoost threshold and score and save features
    if xgb_mean_score > best_xgb_score:
        best_xgb_score = xgb_mean_score
        best_xgb_t = t
        best_features_xgb = xgb_selected_features.tolist()

    print(f"XGBoost Mean CV Score: {xgb_mean_score:.4f}")
    print('-' * 30)

# Final Results
print(f"Best MI Threshold: {best_mi_t}, Best MI Mean CV Score: {best_mi_score:.4f}")
print(f"Best XGBoost Threshold: {best_xgb_t}, Best XGBoost Mean CV Score: {best_xgb_score:.4f}")

Threshold: 0.001
------------------------------
MI Features Selected: 137
MI Mean CV Score: 0.0257
XGBoost Features Selected: 131
XGBoost Mean CV Score: 0.0261
------------------------------
Threshold: 0.005
------------------------------
MI Features Selected: 31
MI Mean CV Score: 0.0278
XGBoost Features Selected: 31
XGBoost Mean CV Score: 0.0278
------------------------------
Threshold: 0.01
------------------------------
MI Features Selected: 8
MI Mean CV Score: 0.0267
XGBoost Features Selected: 8
XGBoost Mean CV Score: 0.0267
------------------------------
Best MI Threshold: 0.005, Best MI Mean CV Score: 0.0278
Best XGBoost Threshold: 0.005, Best XGBoost Mean CV Score: 0.0278


In [86]:
# Select Best Features from Training and Test Sets
X_train_final = X_train[best_features_xgb]
X_test_final = X_test[best_features_xgb]

print("\nFeatures selected by Mutual Information and XGBoost:\n" + "\n".join(best_features_xgb))


Features selected by Mutual Information and XGBoost:
minimum_nights
number_of_reviews
reviews_per_month
review_rate_number
calculated_host_listings_count
availability_365
days_since_last_review
years_since_construction
neighbourhood_group_Brooklyn
neighbourhood_Bedford-Stuyvesant
neighbourhood_Canarsie
neighbourhood_Castleton Corners
neighbourhood_Clinton Hill
neighbourhood_Crown Heights
neighbourhood_East Village
neighbourhood_Edenwald
neighbourhood_Flatbush
neighbourhood_Fort Hamilton
neighbourhood_Fort Wadsworth
neighbourhood_Greenpoint
neighbourhood_Kips Bay
neighbourhood_Long Island City
neighbourhood_Midtown
neighbourhood_Murray Hill
neighbourhood_Park Slope
neighbourhood_Red Hook
neighbourhood_Soundview
neighbourhood_Upper East Side
neighbourhood_Upper West Side
neighbourhood_Williamsburg
room_type_Shared room


In [87]:
X_train_final.shape, X_test_final.shape

((82079, 31), (20520, 31))

In [88]:
X_train_final.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,days_since_last_review,years_since_construction,neighbourhood_group_Brooklyn,neighbourhood_Bedford-Stuyvesant,neighbourhood_Canarsie,neighbourhood_Castleton Corners,neighbourhood_Clinton Hill,neighbourhood_Crown Heights,neighbourhood_East Village,neighbourhood_Edenwald,neighbourhood_Flatbush,neighbourhood_Fort Hamilton,neighbourhood_Fort Wadsworth,neighbourhood_Greenpoint,neighbourhood_Kips Bay,neighbourhood_Long Island City,neighbourhood_Midtown,neighbourhood_Murray Hill,neighbourhood_Park Slope,neighbourhood_Red Hook,neighbourhood_Soundview,neighbourhood_Upper East Side,neighbourhood_Upper West Side,neighbourhood_Williamsburg,room_type_Shared room
85201,1.0,87.0,2.28,1.0,4.0,28.0,1971.0,12.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
87221,3.0,0.0,1.635,4.0,1.0,0.0,9999.0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
16509,5.0,3.0,0.08,1.0,2.0,344.0,1969.0,12.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
51206,2.0,15.0,3.02,2.0,1.0,102.0,998.0,2.0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
20055,2.0,4.0,0.13,4.0,1.0,0.0,2374.0,11.0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [89]:
X_test_final.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,days_since_last_review,years_since_construction,neighbourhood_group_Brooklyn,neighbourhood_Bedford-Stuyvesant,neighbourhood_Canarsie,neighbourhood_Castleton Corners,neighbourhood_Clinton Hill,neighbourhood_Crown Heights,neighbourhood_East Village,neighbourhood_Edenwald,neighbourhood_Flatbush,neighbourhood_Fort Hamilton,neighbourhood_Fort Wadsworth,neighbourhood_Greenpoint,neighbourhood_Kips Bay,neighbourhood_Long Island City,neighbourhood_Midtown,neighbourhood_Murray Hill,neighbourhood_Park Slope,neighbourhood_Red Hook,neighbourhood_Soundview,neighbourhood_Upper East Side,neighbourhood_Upper West Side,neighbourhood_Williamsburg,room_type_Shared room
48202,20.0,0.0,1.0,4.0,1.0,30.0,9999.0,3.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
94193,2.0,41.0,2.79,5.0,5.0,169.0,2003.0,3.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
90387,60.0,13.0,0.67,3.0,4.0,0.0,2290.0,14.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
89198,3.0,2.0,0.07,5.0,1.0,363.0,2737.0,18.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8266,2.0,48.0,0.97,4.0,1.0,223.0,1971.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Linear Regression - Krishna

#### Model Evaluation

In [90]:
X_train_final_const = sm.add_constant(X_train_final)
X_test_final_const = sm.add_constant(X_test_final)

ols_model = sm.OLS(y_train, X_train_final_const).fit()

y_pred = ols_model.predict(X_test_final_const)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(ols_model.summary(),'\n')
print('='*25)
print("Test Set Performance:")
print('='*25)
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R-squared: {ols_model.rsquared:.2f}")
print('='*25)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     2.020
Date:                Thu, 21 Nov 2024   Prob (F-statistic):           0.000657
Time:                        10:39:39   Log-Likelihood:            -5.9286e+05
No. Observations:               82079   AIC:                         1.186e+06
Df Residuals:                   82047   BIC:                         1.186e+06
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   

## Random Forest - Krishna

In [97]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_final, y_train)

print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  22.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.7min




[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  45.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  45.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=  22.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  44.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=  23.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  48.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.5min
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=12.0min
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=21.6min
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_esti

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  46.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.6min
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=  23.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  46.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=  23.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 7.2min
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=27.6min
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time= 5.8min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=15.1min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  45.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  46.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=  22.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  46.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=  23.6s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 7.3min
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=  20.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=11.2min
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=  36.3s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_esti

In [98]:
rf_model = RandomForestRegressor(
    n_estimators=grid_search.best_params_['n_estimators'],
    random_state=42,
    max_depth=grid_search.best_params_['max_depth'],
    min_samples_split=grid_search.best_params_['min_samples_split'],
    min_samples_leaf=grid_search.best_params_['min_samples_leaf']
)

rf_model.fit(X_train_final, y_train)

y_pred = rf_model.predict(X_test_final)

#### Model Evaluation

In [99]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Random Forest Test Performance:")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")

Random Forest Test Performance:
MSE: 67310.61
RMSE: 259.44
R-squared: 0.38


## XGBoost - Krishna

#### Model Evaluation

## Meta Model

#### Model Evaluation