# Scraper

The whole 'tool' consists of two parts: a Python script for data retrieval and an HTML page to display the data on a map.<br>
Run this notebook to scrape apartments. In the HTML document, replace `:::API-KEY:::` with your MapTiler API key (https://cloud.maptiler.com/maps/).



In [1]:
# Import packages / functions, set path

import sys, os

import scrapeApartments as sap
from scrapeApartments import Parameters
import geopandas as gpd
from shapely.geometry import Point

# Output folder for the files generated below (destination.geojson,
# wohnungen.geojson, centerpoint.txt). With the empty default, os.path.join()
# yields bare file names, i.e. paths relative to the current working directory.
PATH = '' # path to folder with the HTML file

In [2]:
# Define parameters for the scraping. Run the scraper and omit entries having certain keywords

class MyParameters(Parameters):
    """Search settings for this run, supplied by subclassing ``Parameters``.

    The inherited ``scrape()`` method is what is called on an instance of this
    class. How each attribute is interpreted is decided by
    ``scrapeApartments.Parameters``, which is not shown in this notebook; the
    notes below are assumptions based on the attribute names - TODO confirm.
    """
    # presumably 'all' means every result page is scraped - verify in Parameters
    PAGE = 'all'
    # presumably the accepted range for the number of rooms
    ROOMS_MIN = 1.5
    ROOMS_MAX = 50
    # presumably the accepted range for the living area (unit not visible here)
    SIZE_MIN = 45
    SIZE_MAX = 1000
    # presumably the accepted range for the rent (currency/period not visible here)
    PRICE_MIN = 800
    PRICE_MAX = 1700
    # presumably a search radius around LOCATION (unit not visible here)
    RADIUS = 0
    LOCATION = 'Zurich'
    
# Keywords handed to cleanDescription() so that matching entries are omitted
# (German terms for temporary rentals, flat shares and sublets).
excludedKeywords = [' befristet', 'Befristet', 'WG', 'Mitbewohner', 'BEFRISTET', 'untermiete', 'Untermiete']

# Scrape with the parameters defined above, then filter the results.
myprmtrs = MyParameters()
results = myprmtrs.scrape()
resultsCleaned = sap.cleanDescription(results, excludedKeywords)

relevantCount = len(resultsCleaned)
print('===> scraped a total of {} relevant entries.\n\n'.format(relevantCount))

# Run locate() on the filtered entries; the check below relies on the
# resulting 'lat' column.
resultsGeo = sap.locate(resultsCleaned)

# List every entry without a positive latitude. The negated comparison also
# selects rows whose latitude is NaN, which a plain `lat <= 0` would miss.
print("Following addresses must be corrected:\n")
needsCorrection = resultsGeo.loc[~(resultsGeo.lat > 0)]
for idx, entry in needsCorrection.iterrows():
    print("Index: {}, Address: ".format(idx) + entry.address)


Scraping: 'comparis'
Comparis accessed, max no. of pages: 8

Scraping: 'homegate'
Homegate accessed, max no. of pages: 3

Scraping: 'immoscout'
Immoscout accessed, max no. of pages: 2
===> scraped a total of 66 relevant entries.


Following addresses must be corrected:

Index: 11, Address: nahe Goldbrunnenplatz, 8003 Zürich
Index: 54, Address: nähe Schmiede Wiedikon, 8003 Zürich


Correct the addresses as shown in the example below and re-run locate() (the indices differ from run to run, so use the ones printed above):<br><br>
Index: 4, Address: nahe Goldbrunnenplatz, 8003 Zürich<br>
Index: 16, Address: Kaeferholzstrasse 42, 8057 Zürich<br>
Index: 80, Address: nahe Schmiede Wiedikon, 8003 Zürich<br>
<br><br>
resultsGeo.loc[4, 'address'] = "Goldbrunnenplatz 1, 8003 Zürich"<br>
resultsGeo.loc[16, 'address'] = "Käferholzstrasse 42, 8057 Zürich"<br>
resultsGeo.loc[80, 'address'] = "Schlossgasse 10, 8003 Zürich"


In [3]:
# Manual address corrections, keyed by the index printed in the
# "Following addresses must be corrected" list. The indices belong to one
# particular scraping run and must be updated whenever the scrape is repeated.
addressCorrections = {
    11: "Goldbrunnenplatz 1, 8003 Zürich",
    54: "Schlossgasse 10, 8003 Zürich",
}

for idx, fixedAddress in addressCorrections.items():
    # Assigning through .loc with a label that is missing from the index would
    # silently append a new, otherwise empty row instead of fixing a listing.
    if idx not in resultsGeo.index:
        print("Index {} not found in resultsGeo - correction skipped.".format(idx))
        continue
    resultsGeo.loc[idx, 'address'] = fixedAddress

In [4]:
# Re-run locate() on the frame that now carries the manually corrected addresses.
resultsGeo = sap.locate(resultsGeo)

# Anything printed here still has no positive latitude (NaN included) and
# needs another manual correction before continuing.
stillUnlocated = resultsGeo.loc[~(resultsGeo.lat > 0)]
for idx, entry in stillUnlocated.iterrows():
    print("Index: {}, Address: ".format(idx) + entry.address)

In [5]:
# Define the commute destination, fetch commute times for every entry and
# build the GeoDataFrame. The frame is only created here; it is written to
# gdfPath in the next cell.

commuteAddress = "Zürich Hauptbahnhof, Zürich"

# Both output files are placed in the folder configured as PATH.
outpath = os.path.join(PATH, 'destination.geojson')
gdfPath = os.path.join(PATH, 'wohnungen.geojson')

# locateAddress() returns the pair in (lat, lon) order, as unpacked here.
commuteLat, commuteLon = sap.locateAddress(commuteAddress)
resultsCommute = sap.getCommuteTimes(resultsGeo, commuteLat, commuteLon)
sap.createTargetGEOJSON(outpath, commuteAddress, title='Zürich HB')

# Point geometries take x = longitude, y = latitude.
pointGeometry = gpd.points_from_xy(resultsCommute.lon, resultsCommute.lat)
gdf = gpd.GeoDataFrame(resultsCommute, geometry=pointGeometry)


Retrieving commuting times
Commuting time retrieved for 10 of 66 entries.
Commuting time retrieved for 20 of 66 entries.
Commuting time retrieved for 30 of 66 entries.
Commuting time retrieved for 40 of 66 entries.
Commuting time retrieved for 50 of 66 entries.
Commuting time retrieved for 60 of 66 entries.
Commuting time retrieved for 66 of 66 entries.


In [6]:
# Map centre: mean position of the entries that were actually located.
# Rows failing the notebook's own "located" check (lat > 0) or lacking a
# coordinate are left out, so a single unresolved address can no longer turn
# the centre into "nan", which would not be valid JavaScript.
located = resultsGeo.loc[resultsGeo.lat > 0, ['lon', 'lat']].dropna()
if located.empty:
    raise ValueError("No located entries available to compute the map centre.")

lons = located.lon.tolist()
lats = located.lat.tolist()
centroid = [sum(lons) / len(lons), sum(lats) / len(lats)]

# Written as a JavaScript statement with the coordinates in [lat, lon] order.
centroidString = "var centerP = ["+str(centroid[1])+", "+str(centroid[0])+"]"
with open(os.path.join(PATH, "centerpoint.txt"), 'w', encoding='utf-8') as file:
    file.write(centroidString)

# The apartments are exported as a JavaScript assignment wrapping the GeoJSON
# text, so the file is not plain GeoJSON despite its extension.
geojsoned = gdf.to_json()
gdfOut = 'var dataset='+geojsoned
with open(gdfPath, 'w', encoding='utf-8') as file:
    file.write(gdfOut)