In [1]:
import pandas as pd
import geopandas as gpd
import re
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import random 

In [2]:
# listings.csv
listings = pd.read_csv("./data/denver_03_29_2021/listings.csv")
listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,177,https://www.airbnb.com/rooms/177,20210329150448,2021-03-30,Tiny Home in the Heart of the City- ECO FRIENDLY,"160 sq ft + 80 sq ft loft for sleeping, Sleeps...","Quiet neighborhood next to park, creeks and bi...",https://a0.muscache.com/pictures/a1745ab2-b3a0...,615,https://www.airbnb.com/users/show/615,...,10.0,10.0,9.0,2016-BFN-0004968,t,2,2,0,0,1.4
1,360,https://www.airbnb.com/rooms/360,20210329150448,2021-03-30,Sit in the Peaceful Garden of the Chickadee Co...,Enjoy the famous Colorado weather and unplug i...,The cottage is located in the center of Lower ...,https://a0.muscache.com/pictures/monet/Select-...,666,https://www.airbnb.com/users/show/666,...,10.0,10.0,10.0,2017-BFN-0002177,f,2,2,0,0,4.34
2,364,https://www.airbnb.com/rooms/364,20210329150448,2021-03-31,Lodo / RiNo LOFT via airport train,"Modern 1,000 square foot loft in the heart of ...","Ten brewpubs within walking distance, two grea...",https://a0.muscache.com/pictures/11766413/a2c5...,783,https://www.airbnb.com/users/show/783,...,10.0,9.0,9.0,,f,1,1,0,0,0.6
3,590,https://www.airbnb.com/rooms/590,20210329150448,2021-04-02,Comfortable - and a great value!,"Large guest room in my home, where I also live...",I love the diversity of my neighborhood and it...,https://a0.muscache.com/pictures/110931/30991c...,933,https://www.airbnb.com/users/show/933,...,10.0,10.0,10.0,2021-BFN-0000578,f,2,0,2,0,4.14
4,592,https://www.airbnb.com/rooms/592,20210329150448,2021-03-31,private,This room is in the basement. It does not hav...,,https://a0.muscache.com/pictures/110942/171dd0...,933,https://www.airbnb.com/users/show/933,...,10.0,10.0,10.0,2021-BFN-0000578,f,2,0,2,0,1.06


## Data Cleaning

In [3]:
# remove unneeded columns
listings = listings.drop(["picture_url", "host_url", "host_thumbnail_url", "host_picture_url", "license"], axis=1)

# TODO: replace all empty columns with NaN

# change columns with 't' & 'f' to booleans
def convert_to_bool(x):
    if x == "t":
        return True
    elif x == "f":
        return False
    else:
        return np.NaN

listings["host_is_superhost"] = listings["host_is_superhost"].apply(lambda x: convert_to_bool(x))
listings["host_identity_verified"] = listings["host_identity_verified"].apply(lambda x: convert_to_bool(x))
listings["host_has_profile_pic"] = listings["host_has_profile_pic"].apply(lambda x: convert_to_bool(x))
listings["has_availability"] = listings["has_availability"].apply(lambda x: convert_to_bool(x))
listings["instant_bookable"] = listings["instant_bookable"].apply(lambda x: convert_to_bool(x))

# Clean up the "bathrooms_text" column to make two new columns - number of bathrooms, and a boolean column to indicate if bthrooms are private
def get_bathrooms_num(x):
    if type(x) != str:
        return 0.0
    elif x == "Shared half-bath" or x == "Half-bath":
        return 0.5
    elif "shared bath" in x:
        return float(x.strip("s").strip("shared bath"))
    elif "private bath" in x:
        return float(x.strip("s").strip("private bath"))
    elif "bath" in x:
        return float(x.strip("s").strip("bath"))
    else:
        return 0.0
    
def get_is_private_bathroom(x):
    if type(x) != str:
        return np.NaN
    elif "shared" in x:
        return False
    else:
        return True

listings["bathrooms"] = listings["bathrooms_text"].apply(lambda x: get_bathrooms_num(x))
listings["private_bathroom"] = listings["bathrooms_text"].apply(lambda x: get_is_private_bathroom(x))
listings = listings.drop("bathrooms_text", axis=1)

# display head to show success
listings.head()


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,host_id,host_name,host_since,...,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,private_bathroom
0,177,https://www.airbnb.com/rooms/177,20210329150448,2021-03-30,Tiny Home in the Heart of the City- ECO FRIENDLY,"160 sq ft + 80 sq ft loft for sleeping, Sleeps...","Quiet neighborhood next to park, creeks and bi...",615,Joe,2008-07-07,...,10.0,10.0,9.0,True,2,2,0,0,1.4,True
1,360,https://www.airbnb.com/rooms/360,20210329150448,2021-03-30,Sit in the Peaceful Garden of the Chickadee Co...,Enjoy the famous Colorado weather and unplug i...,The cottage is located in the center of Lower ...,666,Jennifer & Giovanni,2008-07-08,...,10.0,10.0,10.0,False,2,2,0,0,4.34,True
2,364,https://www.airbnb.com/rooms/364,20210329150448,2021-03-31,Lodo / RiNo LOFT via airport train,"Modern 1,000 square foot loft in the heart of ...","Ten brewpubs within walking distance, two grea...",783,Jason,2008-07-11,...,10.0,9.0,9.0,False,1,1,0,0,0.6,True
3,590,https://www.airbnb.com/rooms/590,20210329150448,2021-04-02,Comfortable - and a great value!,"Large guest room in my home, where I also live...",I love the diversity of my neighborhood and it...,933,Jill,2008-07-21,...,10.0,10.0,10.0,False,2,0,2,0,4.14,False
4,592,https://www.airbnb.com/rooms/592,20210329150448,2021-03-31,private,This room is in the basement. It does not hav...,,933,Jill,2008-07-21,...,10.0,10.0,10.0,False,2,0,2,0,1.06,False


In [4]:
hosts = pd.DataFrame(columns = ["listing_ids", "host_name", "host_since", "host_location", "host_about", "host_response_time", "host_response_rate", "host_acceptance_rate", "host_is_superhost", "host_neighbourhood", "host_listings_count", "host_total_listings_count", "host_verifications", "host_has_profile_pic", "host_identity_verified", "listing_neighbourhood"])

for index, row in listings.iterrows():
    if row['host_id'] not in hosts.index:
        hosts.loc[row['host_id']] = [[row['id']], row['host_name'], row['host_since'], row['host_location'], row['host_about'], row['host_response_time'], row['host_response_rate'], row['host_acceptance_rate'], row['host_is_superhost'], row['host_neighbourhood'], row['host_listings_count'], row['host_total_listings_count'], row['host_verifications'], row['host_has_profile_pic'], row['host_identity_verified'], row["neighbourhood_cleansed"]]
    else: # is host_id already in array, append listing_id to array
        hosts.loc[row['host_id']]['listing_ids'] = hosts.loc[row['host_id']]['listing_ids'].append(row['id'])
        #hosts.loc[row['host_id']]['listing_neighbourhoods'] = hosts.loc[row['host_id']]['listing_neighbourhoods'].append(row['neighbourhood'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [5]:
hosts.head()

Unnamed: 0,listing_ids,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,listing_neighbourhood
615,"[177, 29856627]",Joe,2008-07-07,"Denver, Colorado, United States","I'm originally from Oklahoma, but have lived i...",within an hour,100%,100%,True,Virginia Village,2.0,2.0,"['email', 'phone', 'google', 'reviews', 'kba']",True,False,Virginia Village
666,"[360, 39405]",Jennifer & Giovanni,2008-07-08,"Denver, Colorado, United States",We are artists and tinkerers.\r\n \r\nWe enjoy...,within an hour,100%,91%,True,Highland,2.0,2.0,"['email', 'phone', 'facebook', 'reviews', 'jum...",True,True,Highland
783,[364],Jason,2008-07-11,"Denver, Colorado, United States","I travel pretty frequently, I spend a lot of t...",,,,False,Five Points,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",True,True,Five Points
933,"[590, 592]",Jill,2008-07-21,"Denver, Colorado, United States",I am friendly and I love meeting people from a...,within an hour,100%,83%,True,North Park Hill,2.0,2.0,"['email', 'phone', 'facebook', 'reviews', 'jum...",True,True,North Park Hill
990,[686],Alexandra,2008-07-23,"Denver, Colorado, United States","Denver native, former teacher, musician, chapl...",within a few hours,100%,40%,False,North Capitol Hill,1.0,1.0,"['email', 'phone', 'reviews', 'kba']",True,True,North Capitol Hill


In [6]:
hosts.dtypes

listing_ids                   object
host_name                     object
host_since                    object
host_location                 object
host_about                    object
host_response_time            object
host_response_rate            object
host_acceptance_rate          object
host_is_superhost             object
host_neighbourhood            object
host_listings_count          float64
host_total_listings_count    float64
host_verifications            object
host_has_profile_pic          object
host_identity_verified        object
listing_neighbourhood         object
dtype: object

In [7]:
# drop any rows with na
#hosts = hosts.dropna()
hosts.isnull().sum()

listing_ids                    0
host_name                    116
host_since                   116
host_location                119
host_about                   886
host_response_time           407
host_response_rate           407
host_acceptance_rate         330
host_is_superhost            116
host_neighbourhood           421
host_listings_count          116
host_total_listings_count    116
host_verifications             0
host_has_profile_pic         116
host_identity_verified       116
listing_neighbourhood          0
dtype: int64

In [8]:
# Fill columns with generic values
hosts["host_name"] = hosts["host_name"].fillna("Bob")
hosts["host_since"] = hosts["host_since"].fillna(random.randint(2004, 2021))
hosts["host_location"] = hosts["host_location"].fillna("United States")
hosts["host_about"] = hosts["host_about"].fillna("I am a host on AirBnb") # not using the column anyway so doesnt matter
hosts["host_response_time"] = hosts["host_response_time"].fillna("within a day")
hosts["host_response_rate"] = hosts["host_response_rate"].fillna("80%") # TODO: Use average
hosts["host_acceptance_rate"] = hosts["host_acceptance_rate"].fillna("80%") # TODO: Use average
hosts["host_neighbourhood"] = hosts["host_neighbourhood"].fillna(hosts["listing_neighbourhood"]) 
hosts["host_listings_count"] = hosts["host_listings_count"].fillna(len(hosts["listing_ids"])) # TODO: Change to use length of linsting_ids array
hosts["host_total_listings_count"] = hosts["host_total_listings_count"].fillna(hosts["host_listings_count"]) # TODO: Change to use length of linsting_ids array
hosts["host_has_profile_pic"] = hosts["host_has_profile_pic"].fillna(False)
hosts["host_identity_verified"] = hosts["host_identity_verified"].fillna(False)

In [9]:
hosts.isnull().sum()

listing_ids                    0
host_name                      0
host_since                     0
host_location                  0
host_about                     0
host_response_time             0
host_response_rate             0
host_acceptance_rate           0
host_is_superhost            116
host_neighbourhood             0
host_listings_count            0
host_total_listings_count      0
host_verifications             0
host_has_profile_pic           0
host_identity_verified         0
listing_neighbourhood          0
dtype: int64

In [10]:
# drop any rows with na, only ones left should be "is_super_host". Since that's our dependent variable we don't want to mess with that
hosts = hosts.dropna()

In [11]:
# Convery dependent variable to binary
hosts["host_is_superhost"] = hosts["host_is_superhost"].apply(lambda x: 1 if True else 0)

# Convert host_since field to just the year
hosts["host_since"] = hosts["host_since"].apply(lambda x: int(str(x).split("-")[0]))

# Convert host_location to "host_is_in_denver" boolean field. Since every host will either be in Denver, or not in Denver
hosts["host_is_in_denver"] = hosts["host_location"].apply(lambda x: True if "Denver" in x else False)
#hosts.drop("host_location")

# Convert percentages to floats
def p2f(x):
    return float(x.strip('%'))/100

hosts["host_response_rate"] = hosts["host_response_rate"].apply(lambda x: p2f(x))
hosts["host_acceptance_rate"] = hosts["host_acceptance_rate"].apply(lambda x: p2f(x))

numFeatures = ["host_since", "host_response_rate", "host_acceptance_rate", "host_listings_count", "host_total_listings_count"]
numTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Pipeline for converting categorical columns
catFeatures = ["host_response_time", "host_neighbourhood", "host_has_profile_pic", "host_identity_verified", "host_is_in_denver"]
catTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [12]:
hosts.head()

Unnamed: 0,listing_ids,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,listing_neighbourhood,host_is_in_denver
615,"[177, 29856627]",Joe,2008,"Denver, Colorado, United States","I'm originally from Oklahoma, but have lived i...",within an hour,1.0,1.0,1,Virginia Village,2.0,2.0,"['email', 'phone', 'google', 'reviews', 'kba']",True,False,Virginia Village,True
666,"[360, 39405]",Jennifer & Giovanni,2008,"Denver, Colorado, United States",We are artists and tinkerers.\r\n \r\nWe enjoy...,within an hour,1.0,0.91,1,Highland,2.0,2.0,"['email', 'phone', 'facebook', 'reviews', 'jum...",True,True,Highland,True
783,[364],Jason,2008,"Denver, Colorado, United States","I travel pretty frequently, I spend a lot of t...",within a day,0.8,0.8,1,Five Points,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",True,True,Five Points,True
933,"[590, 592]",Jill,2008,"Denver, Colorado, United States",I am friendly and I love meeting people from a...,within an hour,1.0,0.83,1,North Park Hill,2.0,2.0,"['email', 'phone', 'facebook', 'reviews', 'jum...",True,True,North Park Hill,True
990,[686],Alexandra,2008,"Denver, Colorado, United States","Denver native, former teacher, musician, chapl...",within a few hours,1.0,0.4,1,North Capitol Hill,1.0,1.0,"['email', 'phone', 'reviews', 'kba']",True,True,North Capitol Hill,True


In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numTransformer, numFeatures),
        ('cat', catTransformer, catFeatures)])

In [14]:
preprocessor.fit_transform(hosts)

<2156x143 sparse matrix of type '<class 'numpy.float64'>'
	with 21560 stored elements in Compressed Sparse Row format>

In [16]:
# Split data into test_train

from sklearn.model_selection import train_test_split

# Dependent variable
y = hosts['host_is_superhost']

# Remaining features are our independent variables :
# TODO: Convert host_verifications into some expanded columns
X = hosts.drop(['host_is_superhost', "host_location", "listing_ids", "host_name", "host_about", "host_verifications", "listing_neighbourhood"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.45, random_state=0)

In [18]:
#print(X_train)

In [20]:
from sklearn.ensemble import RandomForestClassifier

# set up pipeline with proprocessor & RandomForestClassifier
rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

In [21]:
rf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['host_since',
                                                   'host_response_rate',
                                                   'host_acceptance_rate',
                                                   'host_listings_count',
                                                   'host_total_listings_count']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                             

## Model Evaluation

In [22]:
y_pred = rf.predict(X_test)
#y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

classifiers = [
    KNeighborsClassifier(3),
    #SVC(kernel="rbf", C=0.025, probability=True),
    #NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    # GradientBoostingClassifier()
    ]

for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test), "\n\n")

KNeighborsClassifier(n_neighbors=3)
model score: 1.000 


DecisionTreeClassifier()
model score: 1.000 


RandomForestClassifier()
model score: 1.000 


AdaBoostClassifier()
model score: 1.000 


