In [2]:
import pandas as pd
import ast
import requests

In [3]:
# Lift the row limit so displayed frames and series are never truncated
pd.options.display.max_rows = None

In [4]:
# Load the four listings snapshots (March, June, September, December)
df_march, df_june, df_september, df_december = map(
    pd.read_csv,
    [
        'data/input/listings_march.csv',
        'data/input/listings_june.csv',
        'data/input/listings_september.csv',
        'data/input/listings_december.csv',
    ],
)

In [5]:
# Stack the four snapshots into a single frame.
# ignore_index=True renumbers the rows 0..n-1; without it every snapshot keeps
# its own row labels, so the combined frame would carry duplicate index labels.
df = pd.concat(
    [df_march, df_june, df_september, df_december],
    ignore_index=True,
)

In [7]:
# A listing id can appear in several snapshots: order rows by id, newest
# `last_scraped` first within each id, and keep only that first row per id.
# NOTE(review): assumes `last_scraped` values sort chronologically (e.g. ISO dates) — confirm.
df = (
    df
    .sort_values(by=['id', 'last_scraped'], ascending=[True, False])
    .drop_duplicates(subset='id', keep='first')
)

In [None]:
# Keep only the columns used in the rest of the analysis
df = df[[
    # identifiers and scrape date
    "id",
    "last_scraped",
    # host attributes
    "host_id",
    "host_since",
    "host_is_superhost",
    "neighbourhood_cleansed",
    "host_listings_count",
    # location
    "latitude",
    "longitude",
    # property description
    "property_type",
    "room_type",
    "accommodates",
    "bathrooms",
    "bathrooms_text",
    "bedrooms",
    "beds",
    "amenities",
    "price",
    # reviews
    "number_of_reviews",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "instant_bookable",
]]

In [8]:
# Fill in null or empty values where a sensible default exists.
# Fill values are numbers (not strings such as '1.0' or '0') so that numeric
# columns stay numeric instead of becoming mixed-type object columns.
# NOTE(review): assumes bedrooms, beds and the review score columns were parsed
# as numeric by read_csv — confirm.
# Properties without reviews get 0 for every rating column; this is a sentinel
# value, not a real score.
df = df.fillna({
    'host_is_superhost': 'f',
    'bedrooms': 1.0,
    'beds': 1.0,
    'review_scores_rating': 0.0,
    'review_scores_accuracy': 0.0,
    'review_scores_checkin': 0.0,
    'review_scores_cleanliness': 0.0,
    'review_scores_communication': 0.0,
    'review_scores_location': 0.0,
})

# Convert price to float: remove the dollar sign and the thousands separators first.
# The pattern is a raw string so that "\$" reaches the regex engine as written
# rather than being an invalid escape sequence in a normal string literal.
df['price'] = df['price'].replace({r'\$': '', ',': ''}, regex=True).astype(float)

In [9]:
# These fields cannot be filled with a default: discard any row missing one of them
df = df.dropna(
    axis='index',
    how='any',
    subset=['id', 'host_id', 'price'],
)

In [10]:
# Show the (rows, columns) size of the cleaned frame.
# We end up with quite a limited dataset.
df.shape

(7210, 26)

In [11]:
# For a set of meaningful amenities we add one-hot-encoded columns
feature_list = [
    "Dishwasher",
    "Washer",
    "Dryer",
    "Microwave",
    "Freezer",
    "Private entrance",
    "Coffee maker",
    "Oven",
    "Outdoor dining area",
    "Private patio or balcony",
    "Luggage dropoff allowed",
    "Bathtub",
    "Blender",
    "Paid parking on premises",
    "Air conditioning",
    "Waterfront",
    "Pets allowed",
    "Canal view",
    "Free parking on premises",
    "Bikes",
    "Safe",
]

# The `amenities` column holds a list literal as text; parse it once into a
# standalone Series so no temporary column has to be added to (and dropped from) df.
amenity_lists = df['amenities'].map(ast.literal_eval)

# One new 0/1 column per amenity of interest: 1 when the listing offers it
for feature in feature_list:
    df[feature] = amenity_lists.map(lambda offered: int(feature in offered))

In [13]:
# Unfortunately, the neighbourhood names in this dataset do not match the
# neighbourhoods in the gemeente (municipality) dataset,
# so we will join on postcode instead.
# List the distinct neighbourhood names present in the listings:
df['neighbourhood_cleansed'].unique()

array(['Oostelijk Havengebied - Indische Buurt', 'Centrum-Oost',
       'Centrum-West', 'Bos en Lommer', 'Zuid', 'Oud-Oost',
       'De Pijp - Rivierenbuurt', 'Slotervaart', 'Noord-Oost',
       'De Baarsjes - Oud-West', 'Westerpark', 'Buitenveldert - Zuidas',
       'Watergraafsmeer', 'Oud-Noord', 'Noord-West',
       'Geuzenveld - Slotermeer', 'IJburg - Zeeburgereiland',
       'De Aker - Nieuw Sloten', 'Osdorp', 'Bijlmer-Centrum',
       'Gaasperdam - Driemond', 'Bijlmer-Oost'], dtype=object)

In [51]:
# Convert lat long to postcode
def get_geocode_result(lat, lon, api_key="<api_key>", timeout=10):
    """Reverse-geocode a coordinate pair to a postal code with the Google Geocoding API.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the point to look up.
    api_key : str, optional
        Google Maps API key. The default is only a placeholder; pass a real key
        (preferably read from an environment variable, never committed to the notebook).
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the whole run.

    Returns
    -------
    str or None
        The ``long_name`` of the first ``postal_code`` address component found
        in the returned results, or ``None`` when the request fails, the HTTP
        status is not 200, the API status is not ``'OK'``, or no result carries
        a postal code.
    """
    try:
        # Let requests build and encode the query string instead of formatting it by hand
        response = requests.get(
            "https://maps.googleapis.com/maps/api/geocode/json",
            params={"latlng": f"{lat},{lon}", "key": api_key},
            timeout=timeout,
        )
        if response.status_code != 200:
            return None
        result = response.json()
    except (requests.RequestException, ValueError):
        # Network failure, timeout or an unparseable body: report "no postcode"
        # like every other failure path, rather than aborting a long batch run.
        return None

    if result.get('status') != 'OK':
        return None

    # Walk the results in the order returned and stop at the first postal code;
    # the first result does not always include one.
    for candidate in result.get('results', []):
        for component in candidate.get('address_components', []):
            if 'postal_code' in component.get('types', []):
                return component.get('long_name')
    return None

In [None]:
# Look up the postal code for every listing.
# NOTE: get_geocode_result is called once per row, so this cell is slow on the
# full dataset; consider caching the result before re-running.
# Iterating over the two coordinate columns avoids building a Series per row,
# which is what df.apply(axis=1) does.
df['geocode_result'] = [
    get_geocode_result(lat, lon)
    for lat, lon in zip(df['latitude'], df['longitude'])
]

# Keep only the first run of digits of the postal code, stored as a string.
# The pattern is a raw string ("\d" is an invalid escape in a normal literal)
# and expand=False makes extract return a Series rather than a one-column frame.
df['postcode'] = (
    df['geocode_result']
    .str.extract(r'(\d+)', expand=False)
    .astype('string')
)

In [None]:
# Load the gemeente (municipality) postcode data and cast its postcode column
# to pandas' string dtype
postcodes = (
    pd.read_csv('data/input/postcodes.csv')
    .assign(postcode=lambda frame: frame['postcode'].astype('string'))
)

In [None]:
# Inner-join the Airbnb listings with the gemeente data on postcode;
# rows whose postcode is absent from either side are dropped
result = df.merge(
    postcodes,
    how='inner',
    on='postcode',
)

In [None]:
# Write the joined dataset to disk as CSV, leaving out the row index
result.to_csv(
    path_or_buf='./data/output/result.csv',
    index=False,
)