In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc,precision_recall_curve, accuracy_score, \
recall_score, precision_score, confusion_matrix, mean_squared_error
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier, BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
import itertools as it
import time as time

from xgboost import XGBRegressor, XGBClassifier

In [2]:
# ---------------------------------------------------------------------------
# Load the raw training listings and apply basic numeric cleaning.
# ---------------------------------------------------------------------------
data_train = pd.read_csv('train_regression.csv')

# Price arrives as text such as "$1,250.00". Strip the currency symbol and the
# thousands separator as *literal* characters (regex=False): '$' is otherwise a
# regex end-of-string anchor and pandas warns about / mis-handles it depending
# on the installed version.
data_train['price'] = (
    data_train['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Get rid of unrealistic values. `.copy()` detaches the filtered rows from the
# original frame so the column assignments below do not write into a slice.
data_train = data_train[data_train['price'] < 10000].copy()

# Response / acceptance rates arrive as percentages in text form ("97%").
data_train['host_response_rate'] = (
    data_train['host_response_rate'].str.replace('%', '', regex=False).astype(float)
)
data_train['host_acceptance_rate'] = (
    data_train['host_acceptance_rate'].str.replace('%', '', regex=False).astype(float)
)

# Impute missing numeric values with the training medians. `numeric_only=True`
# is required on recent pandas, where median() raises on string columns instead
# of silently skipping them.
train_medians = data_train.median(numeric_only=True)
data_train = data_train.fillna(train_medians)

# Apply everything to the test data. Missing values are filled with the
# *training* medians so both sets are imputed with the same statistics.
data_test = pd.read_csv('test_regression.csv')
data_test['host_response_rate'] = (
    data_test['host_response_rate'].str.replace('%', '', regex=False).astype(float)
)
data_test['host_acceptance_rate'] = (
    data_test['host_acceptance_rate'].str.replace('%', '', regex=False).astype(float)
)
data_test = data_test.fillna(train_medians)

# ---------------------------------------------------------------------------
# Feature encoding: turn the text / categorical columns into numeric flags.
# The same transformation is applied to the train and the test frame through
# one helper so the two can never drift apart.
# ---------------------------------------------------------------------------

# Substrings of `host_location` that flag the host as matching one of these places.
city_list = ['Chicago, IL', 'Lakeview', 'Mount Greenwood', 'Gold Coast', 'Chicago Loop', 'Edison Park']

# Values of `neighbourhood_cleansed` that are mapped to 1 (everything else -> 0).
neighborhoods_to_map_to_1 = ['Lake View', 'Lincoln Park', 'Near North Side', 'West Town', 'Logan Square']


def encode_features(frame):
    """Return a copy of ``frame`` with the categorical columns encoded numerically.

    Parameters
    ----------
    frame : pandas.DataFrame
        Listings data containing the raw columns referenced below.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) where

        * membership-style columns become 0/1 integer flags,
        * ``bathrooms_text`` becomes the first integer found in the text as a
          float (NaN when the text contains no digit),
        * the ``'t'``/``'f'`` columns become 1/0 (any other value becomes NaN
          and is left for a later imputation step).

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``frame``.
    """
    encoded = frame.copy()

    # Host Total Listings Count: 1 for hosts with at most 7 listings.
    encoded['host_total_listings_count'] = encoded['host_total_listings_count'].le(7).astype(int)

    # Simple "value is in this set" flags. Missing values match nothing -> 0.
    membership_flags = {
        'host_response_time': ['within an hour', 'within a few hours'],
        'room_type': ['Private room', 'Entire home/apt', 'Hotel room'],
        'property_type': ['Shared room in home', 'Shared room in bungalow', 'Shared room in hostel',
                          'Shared room in rental unit', 'Shared room in condo'],
        'host_neighbourhood': ['Lakeview', 'Mount Greenwood', 'Gold Coast', 'Chicago Loop', 'Edison Park'],
        'neighbourhood_cleansed': neighborhoods_to_map_to_1,
    }
    for column, accepted_values in membership_flags.items():
        encoded[column] = encoded[column].isin(accepted_values).astype(int)

    # Bathrooms Text: keep the first run of digits.
    # NOTE(review): a value such as "1.5 baths" is reduced to 1.0 by this
    # pattern - confirm that dropping the fractional part is intended.
    encoded['bathrooms_text'] = encoded['bathrooms_text'].str.extract(r'(\d+)', expand=False).astype(float)

    # Host location: 1 when the text mentions any entry of `city_list`, else 0.
    # Casting to str turns a missing value into the text 'nan', which matches
    # nothing and therefore also maps to 0.
    encoded['host_location'] = encoded['host_location'].astype(str).apply(
        lambda location: int(any(city in location for city in city_list))
    )

    # Map 't' to 1 and 'f' to 0 for binary columns.
    true_false = {'t': 1, 'f': 0}
    for column in ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
                   'has_availability', 'instant_bookable']:
        encoded[column] = encoded[column].map(true_false)

    # Host verifications: 1 when email or phone verification is listed.
    # A missing value is treated as "not verified" instead of raising.
    encoded['host_verifications'] = (
        encoded['host_verifications']
        .fillna('')
        .astype(str)
        .str.contains('email|phone', regex=True)
        .astype(int)
    )

    return encoded


data_train = encode_features(data_train)
data_test = encode_features(data_test)

# Impute missing bathroom counts. The median is taken from the training data
# and reused for the test data so both sets are filled with the same value.
bathrooms_median = data_train['bathrooms_text'].median()
data_train['bathrooms_text'] = data_train['bathrooms_text'].fillna(bathrooms_median)
data_test['bathrooms_text'] = data_test['bathrooms_text'].fillna(bathrooms_median)

# ---------------------------------------------------------------------------
# Final clean-up: drop unused columns, impute what is still missing, and split
# the data into predictors and response.
# ---------------------------------------------------------------------------

# Identifier / date columns that are not used as predictors.
# errors='ignore' keeps this cell re-runnable once the columns are gone.
columns_to_exclude = ['host_id', 'host_since', 'first_review', 'last_review']
data_train = data_train.drop(columns=columns_to_exclude, errors='ignore')
data_test = data_test.drop(columns=columns_to_exclude, errors='ignore')

# Impute remaining gaps with the *training* medians for both frames, so the test
# set is filled with the same statistics the model was trained on.
# `numeric_only=True` keeps median() from raising on any remaining text column.
final_train_medians = data_train.median(numeric_only=True)
data_train = data_train.fillna(final_train_medians)
data_test = data_test.fillna(final_train_medians)

# Select predictors and response.
X_train = data_train.drop(columns='price')
y_train = data_train['price']

# Use exactly the training feature columns, in the same order. A column missing
# from the test data fails here (KeyError) rather than after model fitting.
X_test = data_test[X_train.columns]

  data_train.price = data_train.price.str.replace('$', '').str.replace(',','').astype(float)
  data_train = data_train.fillna(data_train.median())
  data_test = data_test.fillna(data_train.median())


In [None]:
# Base learner: XGBoost regressor with a squared-error objective and a fixed seed.
model = XGBRegressor(objective='reg:squarederror', random_state=12)

# Search space: 3 values for each of 6 hyperparameters -> 3**6 = 729 candidates,
# each evaluated on 3 folds (2187 fits before the final refit).
grid = dict(
    # Generic boosting hyperparameters
    n_estimators=[100, 500, 1000],
    max_depth=[4, 6, 8],
    learning_rate=[0.001, 0.01, 0.1],   # spans several orders of magnitude
    subsample=[0.5, 0.75, 1.0],         # fractions between 0.5 and 1
    # XGBoost-specific hyperparameters
    reg_lambda=[0.01, 0.1, 1],          # orders of magnitude - 0.001 is another candidate
    gamma=[0.01, 0.1, 1],
)

# Exhaustive 3-fold search scored by (negated) root mean squared error.
gscv = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
)

gscv.fit(X_train, y_train)

In [None]:
# Display the hyperparameter combination that achieved the best mean
# cross-validated score in the grid search fitted above.
gscv.best_params_

In [None]:
# Predict prices for the test listings with the fitted grid-search object.
preds = gscv.predict(X_test)

# Pair each listing id with its prediction and write the submission file
# (two columns, `id` and `predicted`, without the index).
output = data_test[['id']].assign(predicted=preds)
output.to_csv('XGBoost_regression_submission.csv', index=False)