In [67]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
#set max displayed columns to none
pd.set_option('display.max_columns', None)

In [68]:
short_df = pd.read_csv('../data/training_set.csv')
test_set = pd.read_csv('../data/test_set.csv')

In [69]:
# Remove the dollar sign and convert the price column to float
short_df['price'] = short_df['price'].replace('[\$,]', '', regex=True).astype(float)
test_set['price'] = test_set['price'].replace('[\$,]', '', regex=True).astype(float)

In [70]:
short_df.loc[(short_df['availability_30'] == 30) | (short_df['availability_30'] == 0)].count()

id                            13253
name                          13253
host_id                       13253
host_since                    13253
host_is_superhost             13253
                              ...  
has_years                     13253
has_babysitter                13253
has_recommendations           13253
imputed_review_score          13253
distance_to_closest_subway    13253
Length: 219, dtype: int64

In [71]:
short_df.loc[(short_df['availability_365'] == 365) | (short_df['availability_365'] == 0)].count()

id                            10262
name                          10262
host_id                       10262
host_since                    10262
host_is_superhost             10262
                              ...  
has_years                     10262
has_babysitter                10262
has_recommendations           10262
imputed_review_score          10262
distance_to_closest_subway    10262
Length: 219, dtype: int64

In [72]:
property_distribution = test_set['room_type'].value_counts()


property_distribution


room_type
Entire home/apt    6655
Private room       5194
Shared room         156
Name: count, dtype: int64

In [73]:
property_distribution_train = short_df['room_type'].value_counts()

property_distribution_train

room_type
Entire home/apt    10238
Private room        7926
Shared room          259
Hotel room            24
Name: count, dtype: int64

In [74]:
# Step 1: Categorize 'accommodates'
bins_accommodates = [0, 2, 5, float('inf')]  # Define the bins for 1-2, 3-5, 6+
labels_accommodates = ['1-2', '3-5', '6+']
short_df['accommodates_group'] = pd.cut(short_df['accommodates'], bins=bins_accommodates, labels=labels_accommodates, right=False)

# Create a list to store outliers
outlier_rows = []

# Step 2 & 3: Create the boxplots and find outliers
# Get unique combinations of 'neighbourhood_group_cleansed', 'room_type', and 'accommodates_group'
for name, group in short_df.groupby(['neighbourhood_group_cleansed', 'room_type', 'accommodates_group']):
    
    # Plot
    #plt.figure()
    #boxplot = sns.boxplot(data=group, x='price')
    #plt.title(f'Neighbourhood: {name[0]}, Room Type: {name[1]}, Accommodates: {name[2]}')
    #plt.xlabel('Price')
    #plt.show()
    
    # Find outliers
    Q1 = group['price'].quantile(0.25)
    Q3 = group['price'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - (1.5 * IQR)
    upper_bound = Q3 + (1.5 * IQR)
    
    # Filter the group for outliers
    outliers = group[(group['price'] < lower_bound) | (group['price'] > upper_bound)]
    outlier_rows.append(outliers)
    
# Step 4: Concatenate the list of outlier dataframes
outlier_df = pd.concat(outlier_rows, ignore_index=True)

# Print the outlier data
#outlier_df

  for name, group in short_df.groupby(['neighbourhood_group_cleansed', 'room_type', 'accommodates_group']):


In [75]:
outlier_ids = outlier_df["id"].unique().tolist()

In [76]:
# Remove the outliers from the dataframe short_df, using outlier_ids
short_df = short_df[~short_df["id"].isin(outlier_ids)]


In [77]:
short_df.shape

(17251, 220)

In [78]:
#removing rows from short_df where 'room_type' is Hotel room 
short_df = short_df[short_df.room_type != 'Hotel room']


In [79]:
short_df.shape

(17227, 220)

In [80]:
# one hot encoding for room_type column, for both short_df and test_set
short_df = pd.get_dummies(short_df, columns=['room_type'])
test_set = pd.get_dummies(test_set, columns=['room_type'])

# drop room_type_Shared room column in both short_df and test_set
short_df.drop(['room_type_Shared room'], axis=1, inplace=True)
test_set.drop(['room_type_Shared room'], axis=1, inplace=True)


In [81]:
# Dropping name column from both sets
short_df.drop(['name'], axis=1, inplace=True)
test_set.drop(['name'], axis=1, inplace=True)


# dropping these columns has_availability	availability_30	availability_60	availability_90	availability_365	from both sets
short_df.drop(['has_availability', 'availability_30', 'availability_60', 'availability_90', 'availability_365'], axis=1, inplace=True)
test_set.drop(['has_availability', 'availability_30', 'availability_60', 'availability_90', 'availability_365'], axis=1, inplace=True)

In [82]:
# Remove ID, longitude and latitude columns
short_df.drop(['id', 'longitude', 'latitude'], axis=1, inplace=True)
test_set.drop(['id', 'longitude', 'latitude'], axis=1, inplace=True)

In [83]:
# Drop all columns in listings which starts with "is_", "has_", "type_"
short_df.drop(short_df.filter(regex='^is_'), axis=1, inplace=True)
short_df.drop(short_df.filter(regex='^has_'), axis=1, inplace=True)
short_df.drop(short_df.filter(regex='^type_'), axis=1, inplace=True)

test_set.drop(test_set.filter(regex='^is_'), axis=1, inplace=True)
test_set.drop(test_set.filter(regex='^has_'), axis=1, inplace=True)
test_set.drop(test_set.filter(regex='^type_'), axis=1, inplace=True)


In [84]:
# Drop accomodate_group column
short_df.drop(['accommodates_group'], axis=1, inplace=True)

In [85]:
# Convert instant bookable to binary
short_df['instant_bookable'] = short_df['instant_bookable'].replace(['t', 'f'], [1, 0])
test_set['instant_bookable'] = test_set['instant_bookable'].replace(['t', 'f'], [1, 0])

In [86]:
# Remove first_review and last_review columns
short_df.drop(['first_review', 'last_review'], axis=1, inplace=True)
test_set.drop(['first_review', 'last_review'], axis=1, inplace=True)


In [87]:
# Drop host_position and host_id
short_df.drop(['host_position'], axis=1, inplace=True)
short_df.drop(['host_id'], axis=1, inplace=True)
test_set.drop(['host_position'], axis=1, inplace=True)
test_set.drop(['host_id'], axis=1, inplace=True)

In [88]:
# Convert host_since from datetime to just year as a float
short_df['host_since'] = pd.to_datetime(short_df['host_since']).dt.year
test_set['host_since'] = pd.to_datetime(test_set['host_since']).dt.year


In [89]:
# remove neighbourhood_cleansed
short_df.drop(['neighbourhood_cleansed'], axis=1, inplace=True)
test_set.drop(['neighbourhood_cleansed'], axis=1, inplace=True)

In [90]:
# One hot encode neighbourhood_group_cleansed
short_df = pd.get_dummies(short_df, columns=['neighbourhood_group_cleansed'])
test_set = pd.get_dummies(test_set, columns=['neighbourhood_group_cleansed'])

In [91]:
# Drop neighbourhood_group_cleansed_Staten Island
short_df.drop(['neighbourhood_group_cleansed_Staten Island'], axis=1, inplace=True)
test_set.drop(['neighbourhood_group_cleansed_Staten Island'], axis=1, inplace=True)

In [92]:
# Drop minimum_nights
short_df.drop(['minimum_nights'], axis=1, inplace=True)
test_set.drop(['minimum_nights'], axis=1, inplace=True)

In [93]:
# log2 transform price
short_df['price'] = short_df['price'].apply(lambda x: math.log2(x))

In [94]:
property_types_prices = {
    "entire bed and breakfast": 8.810572,
    "entire cottage": 7.965784,
    "entire serviced apartment": 7.912496,
    "cave": 7.781360,
    "entire vacation home": 7.650113,
    "entire townhouse": 7.570194,
    "entire loft": 7.569125,
    "entire condo": 7.474456,
    "entire home": 7.419369,
    "shared room in guesthouse": 7.321928,
    "entire rental unit": 7.280364,
    "shared room in bed and breakfast": 7.228819,
    "entire place": 7.201999,
    "entire homeapt": 6.942807,
    "floor": 6.891919,
    "room in hotel": 6.805065,
    "private room in vacation home": 6.800189,
    "entire guest suite": 6.738924,
    "private room in hostel": 6.735012,
    "entire guesthouse": 6.623971,
    "tiny home": 6.554589,
    "camperrv": 6.414861,
    "private room in guest suite": 6.344440,
    "private room": 6.182770,
    "private room in houseboat": 6.169925,
    "private room in condo": 6.158579,
    "room in boutique hotel": 6.152101,
    "private room in bed and breakfast": 6.130867,
    "private room in farm stay": 6.129283,
    "private room in guesthouse": 6.039025,
    "private room in rental unit": 6.018433,
    "private room in earthen home": 6.005089,
    "private room in loft": 5.972396,
    "shared room in condo": 5.964646,
    "private room in casa particular": 5.898668,
    "shared room in townhouse": 5.880752,
    "private room in home": 5.814843,
    "shared room in rental unit": 5.812383,
    "private room in bungalow": 5.700440,
    "shared room in loft": 5.653483,
    "private room in villa": 5.632846,
    "private room in townhouse": 5.512646,
    "private room in serviced apartment": 5.491653,
    "shared room in home": 5.466952,
    "shared room in serviced apartment": 5.413908,
    "private room in inlaw": 5.321928,
    "private room in floor": 5.321928,
    "private room in dorm": 5.247928,
    "private room in train": 4.906891,
    "shared room in bungalow": 4.807355,
}

# Sort the property types by their mean price in ascending order
sorted_property_types = sorted(property_types_prices, key=property_types_prices.get)

# Encode the property types with integers starting from 1
property_type_encoding = {property_type: index + 1 for index, property_type in enumerate(sorted_property_types)}

property_type_encoding

{'shared room in bungalow': 1,
 'private room in train': 2,
 'private room in dorm': 3,
 'private room in inlaw': 4,
 'private room in floor': 5,
 'shared room in serviced apartment': 6,
 'shared room in home': 7,
 'private room in serviced apartment': 8,
 'private room in townhouse': 9,
 'private room in villa': 10,
 'shared room in loft': 11,
 'private room in bungalow': 12,
 'shared room in rental unit': 13,
 'private room in home': 14,
 'shared room in townhouse': 15,
 'private room in casa particular': 16,
 'shared room in condo': 17,
 'private room in loft': 18,
 'private room in earthen home': 19,
 'private room in rental unit': 20,
 'private room in guesthouse': 21,
 'private room in farm stay': 22,
 'private room in bed and breakfast': 23,
 'room in boutique hotel': 24,
 'private room in condo': 25,
 'private room in houseboat': 26,
 'private room': 27,
 'private room in guest suite': 28,
 'camperrv': 29,
 'tiny home': 30,
 'entire guesthouse': 31,
 'private room in hostel': 3

In [95]:
encoding_series = pd.Series(property_type_encoding)

# Replace the 'property_type' in both DataFrames with the encoding
short_df['property_type'] = short_df['property_type'].map(encoding_series)
test_set['property_type'] = test_set['property_type'].map(encoding_series).fillna(0)


In [96]:
short_df

Unnamed: 0,host_since,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,property_type,accommodates,bedrooms,beds,price,maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,shared_bathroom,num_baths,imputed_review_score,distance_to_closest_subway,room_type_Entire home/apt,room_type_Private room,neighbourhood_group_cleansed_Bronx,neighbourhood_group_cleansed_Brooklyn,neighbourhood_group_cleansed_Manhattan,neighbourhood_group_cleansed_Queens
0,2013,0,1.0,1.0,1,1,42,3,1.0,1.0,7.000000,1125,3,1,0,5.00,0,1,1,0,0,0.16,0,1.5,0,1.159658,True,False,False,False,False,True
1,2023,0,1.0,1.0,1,1,40,2,1.0,1.0,8.228819,365,1,1,1,5.00,0,1,1,0,0,1.00,0,1.0,0,0.073081,True,False,False,False,True,False
2,2013,0,1.0,4.0,1,0,20,2,1.0,1.0,6.129283,1125,37,0,0,4.53,0,1,0,1,0,0.43,1,1.0,0,1.039192,False,True,False,True,False,False
3,2014,0,304.0,336.0,1,1,9,2,1.0,1.0,5.700440,1125,1,0,0,5.00,0,247,9,238,0,0.02,0,1.0,0,0.400792,False,True,False,False,False,True
4,2014,0,304.0,336.0,1,1,9,2,1.0,1.0,4.700440,1125,1,0,0,1.00,0,247,9,238,0,0.03,1,1.0,0,0.443224,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18442,2020,0,3.0,3.0,1,0,33,7,5.0,7.0,7.076816,365,0,0,0,4.83,0,3,2,1,0,0.00,0,1.5,1,0.331702,True,False,False,True,False,False
18443,2014,0,1.0,2.0,1,1,40,2,1.0,1.0,7.098032,1125,0,0,0,4.83,0,1,1,0,0,0.00,0,1.0,1,0.164493,True,False,False,False,True,False
18444,2012,0,1.0,4.0,1,1,20,2,1.0,1.0,5.781360,1125,17,0,0,4.90,1,1,0,1,0,0.26,1,1.0,0,0.389341,False,True,False,True,False,False
18445,2019,0,1.0,1.0,1,0,20,1,1.0,1.0,6.965784,1125,0,0,0,4.83,0,1,0,1,0,0.00,1,1.0,1,0.275470,False,True,False,False,True,False


In [97]:
short_df = short_df.loc[short_df['host_since'] < 2023]

In [98]:
#Update CSVs
short_df.to_csv('../data/training_set_prepped.csv', index=False)
test_set.to_csv('../data/test_set_prepped.csv', index=False)