In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np


In [2]:
df = pd.read_csv("listings.csv")

df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,39572,https://www.airbnb.com/rooms/39572,20250103033441,2025-01-03,city scrape,1 br in a 2 br apt (Midtown West),,,https://a0.muscache.com/pictures/fd1bffd9-ccf8...,169927,...,5.0,4.98,4.86,,f,2,1,1,0,0.25
1,39593,https://www.airbnb.com/rooms/39593,20250103033441,2025-01-03,city scrape,A lovely room w/ a Manhattan view,"A private, furnished large room to rent Jan/F...","Nate Silver called this super safe, clean, qui...",https://a0.muscache.com/pictures/0b9110f7-3b24...,110506,...,4.96,4.79,4.93,,f,1,0,1,0,0.2
2,39704,https://www.airbnb.com/rooms/39704,20250103033441,2025-01-03,previous scrape,"Private, Large & Sunny 1BR w/W&D",It's a No Brainer:<br />•Terrific Space For Le...,The Neighborhood<br />• Rich History <br />• B...,https://a0.muscache.com/pictures/0bc4e8a4-c047...,170510,...,4.92,4.38,4.72,,f,2,2,0,0,1.93
3,42300,https://www.airbnb.com/rooms/42300,20250103033441,2025-01-03,city scrape,Beautiful Lower East Side Loft,Architect-owned loft is a corner unit in a bea...,"The apartment is in the border of Soho, LES an...",https://a0.muscache.com/pictures/0e285e13-ee14...,184755,...,4.87,4.57,4.62,,f,1,1,0,0,0.4
4,42729,https://www.airbnb.com/rooms/42729,20250103033441,2025-01-03,city scrape,@HouseOnHenrySt - Private 2nd bedroom w/shared...,,"Lovely old Brooklyn neighborhood, with brick/b...",https://a0.muscache.com/pictures/925fe213-f5e1...,11481,...,4.73,4.58,4.64,,f,4,1,3,0,1.26


In [3]:
df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [4]:
pd.set_option('display.max_columns', None)

print(df.dtypes)

id                                                int64
listing_url                                      object
scrape_id                                         int64
last_scraped                                     object
source                                           object
                                                 ...   
calculated_host_listings_count                    int64
calculated_host_listings_count_entire_homes       int64
calculated_host_listings_count_private_rooms      int64
calculated_host_listings_count_shared_rooms       int64
reviews_per_month                               float64
Length: 75, dtype: object


In [5]:
missing_values = df.isnull().sum()
print(missing_values[missing_values >0])

name                               2
description                      973
neighborhood_overview          17177
picture_url                        1
host_name                          6
host_since                         6
host_location                   8042
host_about                     16172
host_response_time             15585
host_response_rate             15585
host_acceptance_rate           14872
host_is_superhost                421
host_thumbnail_url                 6
host_picture_url                   6
host_neighbourhood              7579
host_listings_count                6
host_total_listings_count          6
host_verifications                 6
host_has_profile_pic               6
host_identity_verified             6
neighbourhood                  17176
bathrooms                      14799
bathrooms_text                    32
bedrooms                        5809
beds                           14934
price                          14815
minimum_minimum_nights             2
m

In [6]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Fill numerical columns with median
numerical_col = ['bathrooms', 'bedrooms', 'beds', 'reviews_per_month']
for col in numerical_col:
    df[col].fillna(df[col].median(), inplace=True)

# Fill categorical columns with the most frequent value
categorical_col = ['neighbourhood', 'host_listings_count']
cat_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_col] = cat_imputer.fit_transform(df[categorical_col])

# Replace other non-important categorical columns with 'Unknown'
other_col = ['host_name', 'host_location', 'host_about', 'host_response_time', 'host_is_superhost']
for col in other_col:
    df[col] = df[col].fillna('Unknown')

# Drop unnecessary columns that are text-heavy
drop_col = ['description', 'neighborhood_overview', 'picture_url',
            'host_thumbnail_url', 'host_picture_url', 'calendar_updated',
            'first_review', 'last_review', 'license',
            'host_name', 'host_location', 'host_about']
df.drop(columns=drop_col, inplace=True)

# Label Encoding for high-cardinality categorical columns
exclude_cols = ['price', 'listing_url', 'name']
cat_cols_remaining = [col for col in df.select_dtypes(include='object').columns if col not in exclude_cols]
#cat_cols_remaining.remove('price')  # Don't touch price column

le = LabelEncoder()
for col in cat_cols_remaining:
    df[col] = le.fit_transform(df[col])

# Now, check memory usage and dataframe info
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37784 entries, 0 to 37783
Data columns (total 63 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            37784 non-null  int64  
 1   listing_url                                   37784 non-null  object 
 2   scrape_id                                     37784 non-null  int64  
 3   last_scraped                                  37784 non-null  int64  
 4   source                                        37784 non-null  int64  
 5   name                                          37782 non-null  object 
 6   host_id                                       37784 non-null  int64  
 7   host_url                                      37784 non-null  int64  
 8   host_since                                    37784 non-null  int64  
 9   host_response_time                            37784 non-null 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [7]:
print(df.dtypes)

id                                                int64
listing_url                                      object
scrape_id                                         int64
last_scraped                                      int64
source                                            int64
                                                 ...   
calculated_host_listings_count                    int64
calculated_host_listings_count_entire_homes       int64
calculated_host_listings_count_private_rooms      int64
calculated_host_listings_count_shared_rooms       int64
reviews_per_month                               float64
Length: 63, dtype: object


In [8]:
# Clean the 'price' column to remove dollar signs and commas, then convert to float
df['price'] = df['price'].replace({'\$': '', ',': ''}, regex=True).astype(float)

# Verify the transformation
print(df['price'].head())

0    139.0
1     78.0
2      NaN
3    750.0
4    185.0
Name: price, dtype: float64


In [9]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

df['neighbourhood']=le.fit_transform(df['neighbourhood'])

df['bedroom_bathroom_interaction'] = df['bedrooms'] * df['bathrooms']
df['bedroom_bathroom_beds_interaction'] = df['bedrooms'] * df['bathrooms'] * df['beds']
df['bed_bath_ratio'] = df['bedrooms'] / (df['bathrooms'] + 1)  # Adding 1 to avoid division by 0
df['bed_bedrooms_ratio'] = df['beds'] / (df['bedrooms'] + 1)  # Adding 1 to avoid division by 0

# Select the features you want to include for training (including interaction terms)
features = ['bathrooms', 'bedrooms', 'beds', 'reviews_per_month', 'host_listings_count', 
            'bedroom_bathroom_interaction', 'bedroom_bathroom_beds_interaction', 
            'bed_bath_ratio', 'bed_bedrooms_ratio', 'neighbourhood']

target='price'

In [10]:
X=df[features]
y=df[target]

print(X.isnull().sum())
print(X.dtypes)

bathrooms                            0
bedrooms                             0
beds                                 0
reviews_per_month                    0
host_listings_count                  0
bedroom_bathroom_interaction         0
bedroom_bathroom_beds_interaction    0
bed_bath_ratio                       0
bed_bedrooms_ratio                   0
neighbourhood                        0
dtype: int64
bathrooms                            float64
bedrooms                             float64
beds                                 float64
reviews_per_month                    float64
host_listings_count                    int64
bedroom_bathroom_interaction         float64
bedroom_bathroom_beds_interaction    float64
bed_bath_ratio                       float64
bed_bedrooms_ratio                   float64
neighbourhood                          int64
dtype: object


In [11]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.isnull().sum())
print(X_val.isnull().sum())
print(X_train.dtypes)

bathrooms                            0
bedrooms                             0
beds                                 0
reviews_per_month                    0
host_listings_count                  0
bedroom_bathroom_interaction         0
bedroom_bathroom_beds_interaction    0
bed_bath_ratio                       0
bed_bedrooms_ratio                   0
neighbourhood                        0
dtype: int64
bathrooms                            0
bedrooms                             0
beds                                 0
reviews_per_month                    0
host_listings_count                  0
bedroom_bathroom_interaction         0
bedroom_bathroom_beds_interaction    0
bed_bath_ratio                       0
bed_bedrooms_ratio                   0
neighbourhood                        0
dtype: int64
bathrooms                            float64
bedrooms                             float64
beds                                 float64
reviews_per_month                    float64
host_listings_

In [12]:
X_train = X_train.astype(float)
X_val = X_val.astype(float)
y_train = y_train.astype(float)
y_val = y_val.astype(float)

In [13]:
print(y_train.shape)

(30227,)


In [14]:
# Convert X_train and X_val to NumPy arrays
X_train_np = X_train.to_numpy()
X_val_np = X_val.to_numpy()
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()



In [15]:
print(X_train_np.shape)  # Should be (30227, n_features)
print(X_val_np.shape)  

(30227, 10)
(7557, 10)


In [16]:
import numpy as np

# Check for NaN values in the target variable
print(np.isnan(y_train).sum())
print(np.isnan(y_val).sum())


11845
2970


In [17]:
# Replace NaN values in target with the mean of the respective array
mean_y_train = np.nanmean(y_train)
mean_y_val = np.nanmean(y_val)

y_train = np.nan_to_num(y_train, nan=mean_y_train)
y_val = np.nan_to_num(y_val, nan=mean_y_val)

In [18]:
dtrain = xgb.DMatrix(X_train_np, label=y_train)
dval = xgb.DMatrix(X_val_np, label=y_val)

# Check if the DMatrix is created successfully
print(f"DMatrix for training: {dtrain.num_row()} rows, {dtrain.num_col()} columns.")
print(f"DMatrix for validation: {dval.num_row()} rows, {dval.num_col()} columns.")

DMatrix for training: 30227 rows, 10 columns.
DMatrix for validation: 7557 rows, 10 columns.


In [19]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

# Initialize parameters
params = {
    'n_estimators': 1000, 
    'learning_rate': 0.05, 
    'max_depth': 6, 
    'random_state': 42,
    'eval_metric': 'mae'  # Explicitly setting eval_metric in the params
}

# Initialize the model with params
model = xgb.XGBRegressor(**params)

# Train the model with eval_set
model.fit(X_train, y_train, 
          eval_set=[(X_val, y_val)], 
          verbose=False)

# Predict on the validation set
y_pred = model.predict(X_val)

# Evaluate performance (Mean Absolute Error)
error = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error: {error}')




Mean Absolute Error: 68.00847099557942


In [20]:
X_missing = df[df['price'].isna()][X_train.columns]

#X_missing_np = X_missing.to_numpy()
predicted_prices = model.predict(X_missing)

df_new = df

df_new.loc[df_new['price'].isna(), 'price']=predicted_prices

In [21]:
print(df['price'].isna().sum())  # Should be 0


0


In [22]:
df_new.to_csv('cleaned_data_nprices.csv', index=False)
print("file saved!")

file saved!
