# Project Code

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression
from xgboost import XGBRegressor
pd.set_option('display.max_columns', None)

### Data Preprocessing - Krishna

In [2]:
# Load Data
data = pd.read_csv('Airbnb_Open_Data.csv',low_memory=False)
data.head()

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,country code,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,US,False,strict,Private room,2020.0,$966,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,US,False,moderate,Entire home/apt,2007.0,$142,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,US,True,flexible,Private room,2005.0,$620,$124,3.0,0.0,,,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",
3,1002755,,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,US,True,moderate,Entire home/apt,2005.0,$368,$74,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,US,False,moderate,Entire home/apt,2009.0,$204,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",


In [3]:
# Standardize Column Names
data.columns = [col.lower().replace(' ', '_') if len(col.split()) >= 2 else col.lower() for col in data.columns]

# Drop Irrelevant Columns
irrelevant_columns = ['id', 'name', 'host_id', 'host_name', 'license', 'house_rules', 
                      'country', 'country_code', 'lat', 'long', 'service_fee']
data_cleaned = data.drop(columns=irrelevant_columns)

# Clean Price and Service Fee Data
data_cleaned['price'] = data_cleaned['price'].str.replace('[^\d.]', '', regex=True).astype(float)

# Derive days_since_last_review from last_review 
data_cleaned['last_review'] = pd.to_datetime(data_cleaned['last_review'], errors='coerce')
reference_date = datetime.now()
data_cleaned['days_since_last_review'] = (reference_date - data_cleaned['last_review']).dt.days
data_cleaned['days_since_last_review'].fillna(9999, inplace=True)
data_cleaned.drop(columns=['last_review'], inplace=True)

# Derive years_since_construction from construction_year
current_year = datetime.now().year
data_cleaned['years_since_construction'] = current_year - data_cleaned['construction_year']
data_cleaned.drop(columns=['construction_year'], inplace=True)

# Clean neighbourhood_group data
correct_mapping = {'brookln': 'Brooklyn','manhatan': 'Manhattan'}
data_cleaned['neighbourhood_group'] = data_cleaned['neighbourhood_group'].replace(correct_mapping)

# Impute Numerical Missing Data Using Linear Interpolation
numeric_columns = data_cleaned.select_dtypes(include=['float64']).columns
data_cleaned[numeric_columns] = data_cleaned[numeric_columns].apply(lambda col: col.interpolate(method='linear'))

# Scale Numeric Data
scaler = StandardScaler()
data_cleaned[numeric_columns] = scaler.fit_transform(data_cleaned[numeric_columns])
data_cleaned[numeric_columns] = scaler.transform(data_cleaned[numeric_columns])

# Impute Categorical Missing Data Using Mode Imputation
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns
data_cleaned[categorical_columns] = data_cleaned[categorical_columns].astype('category')
data_cleaned[categorical_columns] = data_cleaned[categorical_columns].apply(lambda col: col.fillna(col.mode()[0]))

data_cleaned.head()

Unnamed: 0,host_identity_verified,neighbourhood_group,neighbourhood,instant_bookable,cancellation_policy,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,days_since_last_review,years_since_construction
0,unconfirmed,Brooklyn,Kensington,False,strict,Private room,-1.88233,-0.264606,-0.562485,-1.229199,-2.115255,-0.247978,-1.036218,-1.095235,-2.223273
1,verified,Manhattan,Midtown,False,moderate,Entire home/apt,-1.889826,-0.243227,-0.547787,-1.169528,-2.115255,-0.251843,-1.039385,-1.09526,-1.831865
2,unconfirmed,Manhattan,Harlem,True,flexible,Private room,-1.885478,-0.272089,-0.566159,-0.421876,-1.508574,-0.252809,-1.032615,-1.094215,-1.771649
3,unconfirmed,Brooklyn,Clinton Hill,True,moderate,Entire home/apt,-1.88777,-0.243227,-0.455925,0.325776,-2.115255,-0.252809,-1.034253,-1.095139,-1.771649
4,verified,Manhattan,East Harlem,False,moderate,Entire home/apt,-1.889262,-0.264606,-0.562485,-1.26781,-2.721935,-0.252809,-1.036054,-1.095113,-1.892082


In [4]:
data_cleaned.dtypes

host_identity_verified            category
neighbourhood_group               category
neighbourhood                     category
instant_bookable                  category
cancellation_policy               category
room_type                         category
price                              float64
minimum_nights                     float64
number_of_reviews                  float64
reviews_per_month                  float64
review_rate_number                 float64
calculated_host_listings_count     float64
availability_365                   float64
days_since_last_review             float64
years_since_construction           float64
dtype: object

##### Notes for Harshita

- data_cleaned: Data with all features without encoding, and response (price). Can be used for EDA and Outlier Detection
- data_encoded: OneHotEncoded data. Not meaningful for outlier detection, EDA, or feature selection. Purely for model inputs. This step needs to be done after EDA, outlier detection, and feature selection is completed.
- I also removed useless features such as 'house_rules', 'country', 'country_code', 'lat', 'long' as country and country code for the entire dataset were United States. I removed lat and long as they are raw coordinates that have no significance without context. I removed house rules as it is pure paragraphical text data which we cannot process for a forecasting task. I removed service fee as it is already included in the price (100% correlation to response).

### EDA - Harshita

### Outlier Detection - Harshita

### Train Test Split - Krishna

In [5]:
# OneHotEncoding for Categorical Variables for Model Compatibility
data_encoded = pd.get_dummies(data_cleaned, drop_first=True)

# Store Features in X
X = data_encoded.drop(['price'], axis=1)

# Store Response Variables in y
y = data_encoded['price']

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Feature Selection - Krishna

In [13]:
# Mutual Information Initialization
mi_scores = mutual_info_regression(X_train, y_train)
mi_scores_series = pd.Series(mi_scores, index=X_train.columns)

# Thresholds to test
thresholds = [0.001, 0.005, 0.01]

# Initialize variables to track the best thresholds and scores
best_mi_t = 0
best_mi_score = 0
best_xgb_t = 0
best_xgb_score = 0
best_features_mi = []
best_features_xgb = []

# Loop through thresholds for both MI and XGBoost feature selection
for t in thresholds:
    print(f'Threshold: {t}')
    print('-' * 30)
    
    # Mutual Information Feature Selection
    mi_selected_features = mi_scores_series[mi_scores_series > t].index
    X_train_mi = X_train[mi_selected_features]
    print(f"MI Features Selected: {len(mi_selected_features)}")

    # Evaluate using cross-validation after MI selection
    mi_scores = cross_val_score(XGBRegressor(random_state=42), X_train_mi, y_train, cv=5, scoring='r2')
    mi_mean_score = mi_scores.mean()

    # Update best MI threshold and score and save features
    if mi_mean_score > best_mi_score:
        best_mi_score = mi_mean_score
        best_mi_t = t
        best_features_mi = mi_selected_features.tolist()

    print(f"MI Mean CV Score: {mi_mean_score:.4f}")

    # XGBoost Feature Importance Refinement
    xgb_model = XGBRegressor(random_state=42)
    xgb_model.fit(X_train_mi, y_train)
    xgb_importances = pd.Series(xgb_model.feature_importances_, index=X_train_mi.columns)

    # Apply XGBoost threshold to further refine features
    xgb_selected_features = xgb_importances[xgb_importances > t].index
    X_train_xgb = X_train_mi[xgb_selected_features]
    print(f"XGBoost Features Selected: {len(xgb_selected_features)}")

    # Evaluate using cross-validation after XGBoost refinement
    xgb_scores = cross_val_score(XGBRegressor(random_state=42), X_train_xgb, y_train, cv=5, scoring='r2')
    xgb_mean_score = xgb_scores.mean()

    # Update best XGBoost threshold and score and save features
    if xgb_mean_score > best_xgb_score:
        best_xgb_score = xgb_mean_score
        best_xgb_t = t
        best_features_xgb = xgb_selected_features.tolist()

    print(f"XGBoost Mean CV Score: {xgb_mean_score:.4f}")
    print('-' * 30)

# Final Results
print(f"Best MI Threshold: {best_mi_t}, Best MI Mean CV Score: {best_mi_score:.4f}")
print(f"Best XGBoost Threshold: {best_xgb_t}, Best XGBoost Mean CV Score: {best_xgb_score:.4f}")

Threshold: 0.001
------------------------------
MI Features Selected: 80
MI Mean CV Score: 0.0324
XGBoost Features Selected: 80
XGBoost Mean CV Score: 0.0324
------------------------------
Threshold: 0.005
------------------------------
MI Features Selected: 19
MI Mean CV Score: 0.0333
XGBoost Features Selected: 19
XGBoost Mean CV Score: 0.0333
------------------------------
Threshold: 0.01
------------------------------
MI Features Selected: 8
MI Mean CV Score: 0.0277
XGBoost Features Selected: 8
XGBoost Mean CV Score: 0.0277
------------------------------
Best MI Threshold: 0.005, Best MI Mean CV Score: 0.0333
Best XGBoost Threshold: 0.005, Best XGBoost Mean CV Score: 0.0333


In [14]:
# Select Best Features from Training and Test Sets
X_train_final = X_train[best_features_xgb]
X_test_final = X_test[best_features_xgb]

print("\nFeatures selected by Mutual Information and XGBoost:\n" + "\n".join(best_features_xgb))


Features selected by Mutual Information and XGBoost:
minimum_nights
number_of_reviews
reviews_per_month
review_rate_number
calculated_host_listings_count
availability_365
days_since_last_review
years_since_construction
neighbourhood_group_Manhattan
neighbourhood_group_Queens
neighbourhood_Astoria
neighbourhood_Bedford-Stuyvesant
neighbourhood_Crown Heights
neighbourhood_East Village
neighbourhood_Harlem
neighbourhood_Midtown
neighbourhood_Upper East Side
neighbourhood_Upper West Side
room_type_Private room


In [18]:
X_train_final.shape, X_test_final.shape

((82079, 19), (20520, 19))

In [20]:
X_train_final.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,days_since_last_review,years_since_construction,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_Astoria,neighbourhood_Bedford-Stuyvesant,neighbourhood_Crown Heights,neighbourhood_East Village,neighbourhood_Harlem,neighbourhood_Midtown,neighbourhood_Upper East Side,neighbourhood_Upper West Side,room_type_Private room
85201,-0.274227,-0.530639,-0.502608,-3.935296,-0.249911,-1.050303,-1.095139,-1.982407,1,0,0,0,0,0,0,0,0,0,0
87221,-0.272089,-0.566159,-0.72901,-2.115255,-0.252809,-1.051831,-1.094215,-2.10284,1,0,0,0,0,0,0,0,1,0,0
16509,-0.269951,-0.564934,-1.274831,-3.935296,-0.251843,-1.033052,-1.095139,-1.982407,0,0,0,0,0,0,0,0,0,0,0
51206,-0.273158,-0.560035,-0.242861,-3.328615,-0.252809,-1.046263,-1.09525,-2.283489,0,0,0,0,0,0,0,0,0,0,1
20055,-0.273158,-0.564526,-1.25728,-2.115255,-0.252809,-1.051831,-1.095092,-2.012515,0,0,0,0,0,0,0,0,0,0,0


In [21]:
X_test_final.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,days_since_last_review,years_since_construction,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_Astoria,neighbourhood_Bedford-Stuyvesant,neighbourhood_Crown Heights,neighbourhood_East Village,neighbourhood_Harlem,neighbourhood_Midtown,neighbourhood_Upper East Side,neighbourhood_Upper West Side,room_type_Private room
48202,-0.253917,-0.566159,-0.951901,-2.115255,-0.252809,-1.050194,-1.094215,-2.253381,1,0,0,0,0,0,0,0,0,0,0
94193,-0.273158,-0.54942,-0.323593,-1.508574,-0.248944,-1.042605,-1.095135,-2.253381,1,0,0,0,0,0,0,0,0,0,0
90387,-0.211158,-0.560852,-1.067735,-2.721935,-0.249911,-1.051831,-1.095102,-1.92219,1,0,0,0,0,0,1,0,0,0,1
89198,-0.272089,-0.565343,-1.278341,-1.508574,-0.252809,-1.032015,-1.09505,-1.801757,0,0,0,0,0,0,0,0,0,0,1
8266,-0.273158,-0.546562,-0.962432,-2.115255,-0.252809,-1.039658,-1.095139,-2.072732,1,0,0,0,0,0,0,0,0,0,1


## Linear Regression - Harshita

#### Model Evaluation

## Random Forest - Krishna

#### Model Evaluation

## XGBoost - Krishna

#### Model Evaluation

## Meta Model

#### Model Evaluation