In [1]:
#FUNCTIONS

#Creating a function to extract and clean a given dataset
def cleanData(data, features=None):
    """Extract the modelling features from a raw listings frame and fill gaps.

    Every selected column has its missing values replaced by that column's
    most frequent value (the mode). The mode is used for every column, not
    just the categorical ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw listings; must contain every column named in ``features``.
    features : list of str, optional
        Columns to keep. Defaults to the notebook-level ``extracted_features``
        list, looked up when the function is called (it is defined in a later
        cell than this function).

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the requested columns, sharing ``data``'s
        index. ``data`` itself is not modified.

    Raises
    ------
    KeyError
        If a requested column is missing from ``data``.
    """
    if features is None:
        features = extracted_features

    cleaned = pd.DataFrame(index=data.index)
    for column in features:
        values = data[column].copy()
        modes = values.mode()
        # mode() drops NaN, so a column with no observed values has no mode;
        # leave such a column untouched instead of failing on modes[0].
        if not modes.empty:
            values = values.fillna(modes.iloc[0])
        cleaned[column] = values
    return cleaned

#Prep the cleaned data for the KNN Algorithm (Label Encoding, Standardizing Features, turning all values into numerical)
def prepData(data):
    """Turn a cleaned listings frame into numeric, scaled features for KNN.

    Works on a copy, so ``data`` is left untouched. Steps, in order:

    1. Label-encode every column in ``categorical_features``.
    2. Strip '$', '%' and thousands separators from every column in
       ``numerical_features`` and convert it to float.
    3. Replace 'amenities' with the number of comma-separated entries.
    4. Replace 'bathrooms_text' with the first number found in the text.
    5. Standardize every column in ``standardize_features``.

    Parameters
    ----------
    data : pandas.DataFrame
        Cleaned listings (see ``cleanData``) with no missing values in the
        processed columns. Must contain 'amenities', 'bathrooms_text' and
        every column named in the three notebook-level feature lists.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``data``.
    """
    prepped = data.copy()
    le = pp.LabelEncoder()
    sc = pp.StandardScaler()

    # fit_transform refits the encoder on each column, so codes are per-column.
    # NOTE(review): codes are also per-call, so the same category may get a
    # different integer in each city's file - confirm that is acceptable.
    for col in categorical_features:
        prepped[col] = le.fit_transform(prepped[col])

    # "$1,250.00" -> 1250.0 and "95%" -> 95.0. astype(str) lets values that
    # are already numeric pass through instead of failing on the string ops.
    for col in numerical_features:
        as_text = prepped[col].astype(str).str.strip()
        as_text = as_text.str.lstrip('$').str.rstrip('%')
        prepped[col] = as_text.str.replace(',', '', regex=False).astype(float)

    # Amenity count = number of comma-separated pieces in the raw string.
    prepped['amenities'] = prepped['amenities'].map(lambda text: len(text.split(',')))

    # Compiled here rather than at module level because `re` is imported in a
    # later cell than this definition.
    number_pattern = re.compile(r"\d+\.\d+|\d+")

    def _bathroom_count(text):
        # First integer or decimal in the text; text without any number
        # (presumably entries such as "Half-bath" - TODO confirm) counts as 1.
        match = number_pattern.search(text)
        return float(match.group()) if match else 1.0

    prepped['bathrooms_text'] = prepped['bathrooms_text'].map(_bathroom_count)

    # The scaler expects a 2-D (n_samples, 1) array; flatten the result so a
    # plain 1-D column is written back.
    for col in standardize_features:
        column_values = prepped[col].astype(float).values.reshape(-1, 1)
        prepped[col] = sc.fit_transform(column_values).ravel()

    return prepped

In [2]:
#Importing Pandas
import pandas as pd
import numpy as np
import re
from sklearn import preprocessing as pp
#Loading the raw Airbnb listings of each city into its own dataframe
amst_raw, la_raw, mel_raw = map(
    pd.read_csv,
    ("Amsterdam_Raw.csv", "LosAngeles_Raw.csv", "Melbourne_Raw.csv"),
)

In [3]:
#Displaying the first Melbourne listing as a Series to inspect the features (columns) present in the raw Airbnb dataset
mel_raw.iloc[0]

id                                                                                           9835
listing_url                                                     https://www.airbnb.com/rooms/9835
scrape_id                                                                          20201011211031
last_scraped                                                                           2020-10-14
name                                                                       Beautiful Room & House
description                                     <b>The space</b><br />House: Clean, New, Moder...
neighborhood_overview                                Very safe! Family oriented. Older age group.
picture_url                                     https://a0.muscache.com/pictures/44620/5a5815c...
host_id                                                                                     33057
host_url                                                  https://www.airbnb.com/users/show/33057
host_name           

In [4]:
#Columns pulled out of the raw data ('review_scores_rating' is the target variable)
extracted_features = [
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_is_superhost',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms_text',
    'bedrooms',
    'beds',
    'amenities',
    'price',
    'number_of_reviews',
    'review_scores_rating',
]

#Categorical columns that get label encoded
categorical_features = [
    'host_response_time',
    'host_is_superhost',
    'property_type',
    'room_type',
]

#Columns stored as strings (percentages / dollar amounts) that must become numbers
numerical_features = [
    'host_response_rate',
    'host_acceptance_rate',
    'price',
]

#Columns that get standardized
standardize_features = [
    'host_response_rate',
    'host_acceptance_rate',
    'price',
    'beds',
    'amenities',
    'number_of_reviews',
]



In [5]:
#Running every raw city dataset through cleanData
amst_cleaned, la_cleaned, mel_cleaned = map(cleanData, (amst_raw, la_raw, mel_raw))

#Writing each cleaned dataset to its own .csv file
for cleaned_frame, cleaned_path in (
    (amst_cleaned, "Amsterdam_Cleaned.csv"),
    (la_cleaned, "LosAngeles_Cleaned.csv"),
    (mel_cleaned, "Melbourne_Cleaned.csv"),
):
    cleaned_frame.to_csv(cleaned_path)

In [6]:
#Running every cleaned city dataset through prepData to get it ready for the KNN algorithm
amst_prepped, la_prepped, mel_prepped = map(prepData, (amst_cleaned, la_cleaned, mel_cleaned))

#Writing each prepped dataset to its own .csv file
for prepped_frame, prepped_path in (
    (amst_prepped, "Amsterdam_Prepped.csv"),
    (la_prepped, "LosAngeles_Prepped.csv"),
    (mel_prepped, "Melbourne_Prepped.csv"),
):
    prepped_frame.to_csv(prepped_path)

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,property_type,room_type,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,review_scores_rating
0,3,0.333718,-3.828231,0,12,0,6,2,2.0,0.697939,-0.119936,-0.176421,-0.503328,80.0
1,2,0.333718,0.543463,1,30,2,1,1.5,1.0,-0.583721,1.499665,-0.244335,-0.124552,97.0
2,3,-0.111806,-1.642384,0,30,2,1,1,1.0,-0.583721,-0.828512,-0.108507,-0.215458,94.0
3,3,0.333718,0.543463,0,8,0,2,1,1.0,-0.583721,-0.322387,-0.286781,-0.533630,100.0
4,2,-0.278878,0.281161,1,46,2,3,1,1.0,-0.583721,-0.524837,-0.252824,2.951109,95.0
5,2,-0.278878,0.281161,1,18,0,4,1,2.0,0.057109,0.588639,-0.142464,1.405703,93.0
6,2,0.333718,0.543463,1,30,2,1,1.5,1.0,0.057109,1.195989,-0.247731,-0.200307,97.0
7,3,0.333718,0.543463,0,48,2,2,1,1.0,-0.583721,-0.524837,-0.278292,-0.351817,98.0
8,3,0.333718,0.456029,1,16,0,4,1,1.0,0.057109,2.208240,-0.220565,3.117771,99.0
9,3,0.333718,-0.330876,1,16,0,2,1,1.0,0.057109,-0.221162,-0.164536,5.541937,94.0
