# Crawler to retrieve houses from Funda.nl

<h1 align='center'>=========================================================</h1>

# INTEGRATION WITH GOOGLE DRIVE SPREADSHEET (into df_current)

In [149]:
google_json_key = 'buying-house-283614-38d7fde831c8.json'
filename = "buy house notes"
worksheet_name = "houses"
worksheet_name = "houses_hengelo"

In [150]:
import gspread 
from gspread_dataframe import get_as_dataframe, set_with_dataframe 
from oauth2client.service_account import ServiceAccountCredentials
from gspread_dataframe import get_as_dataframe, set_with_dataframe 

scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name(google_json_key, scope) 
gc = gspread.authorize(credentials)

worksheet = gc.open(filename).worksheet(worksheet_name)
df_current = get_as_dataframe(worksheet,index_column=0)

<h1 align='center'>=========================================================</h1>

# PART II: defining a function to automatic crawler all houses to buy from funda.nl! retrieve only houses that are not in a oldcsv converted in a df

Crawl only URLs that are not in the CSV

In [151]:
import pandas as pd 

import cfscrape
from lxml import etree
header = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
              'Accept-Encoding': 'gzip, deflate, sdch',
              'Accept-Language' : 'en-US,en;q=0.9,nl;q=0.8',
              'Cache-Control' : 'max-age=0',
              'Connection': 'keep-alive',
              'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
scraper = cfscrape.create_scraper()


def funda_automatic_crawler (city, price_max, df_oldfile):
    
    page = 1
    links = []
    
    while True:
        url="https://www.funda.nl/koop/"+str(city)+"/0-"+str(price_max)+"/woonhuis/p"+str(page)
#         print(url)
        scraped_html=scraper.get(url,headers=header).content
#         print(scraped_html.decode("utf-8"))
        html = etree.HTML(scraped_html)
#         print(html)
        
        df_links = pd.DataFrame(html.xpath("//a/@href"))
#         print(df_links)
        
        if len(df_links)==0:
            print("Funda may BLOCKED US!")
            break

        
#         print(df_links_unique)
        
        links.extend(df_links_unique[0].tolist())
        
        print("Links from page",page,"retrieved!")
        
        total_entries = int(html.xpath("//button[@class='mobile-search-sticky-button fd-btn fd-btn--primary']/span/text()")[0].replace("koopwoningen",'').strip())
#         print(total_entries)
        
        if page * 10 > total_entries:
            break
        else:
            page = page + 1
#   -------------------------------------------
#   -------------------------------------------
    links_unique = list(set(links))
    df = pd.DataFrame()
    for url in links_unique:
        if len(df_oldfile[df_oldfile['link'].str.contains(url)])>0:
            print(url,":alredy in the oldcsv")
            continue    
        
        scraped_html=scraper.get(url,headers=header).content.decode("utf-8") 
#         print(scraped_html.decode("utf-8"))
        html = etree.HTML(scraped_html)             
        
        try:
            address = html.xpath("//title/text()")[0].replace('Huis te koop: ','').replace('[funda]','')
        except:
            address = ""
            
        try:
            price = html.xpath("//strong[@class='object-header__price']/text()")[0].replace('€ ','').replace(' /mnd','').replace('.','').replace(' kk','').replace('von', '').replace(' ','')
        except:
            price = ""
            
        try:
            bouwjaar = html.xpath("//dl[@class='object-kenmerken-list']")[1].xpath("dd/text()")[2].replace('\r\n','').replace(' ','')
        except:
            bouwjaar =""
            
        try:
            kamers = html.xpath("//dl[@class='object-kenmerken-list']")[3].xpath("dd/text()")[0].replace('\r\n','')
        except:
            kamers = ""
        
        try:
            woonen_m2 = html.xpath("//dd[@class='object-kenmerken-group-list']/dl/dd[1]/text()")[0].replace('\r\n','').replace(' m²','').replace('.0','').replace('.','')
        except:
            woonen_m2 = ""
            
        try:
            perceel_m2 = html.xpath("//dd[@class='object-kenmerken-group-list']/dl/dd[1]/text()")[1].replace('\r\n','').replace(' m²','').replace('.0','').replace('.','').replace(' (deelperceel)','')
        except:
            perceel_m2 =""
        
        try:
            makelaars = html.xpath("//a[@class='object-contact-aanbieder-link']/text()")[0]
        except:
            makelaars = ""
        
        try:
            makelaars_phone = html.xpath("//span[@class='fd-completely-hidden fd-display-inline-block--bp-m']/text()")[0].replace('\r\n','').replace('-','').replace('(+31) ','').replace(' ','')
        except:
            makelaars_phone = ""
            
        try:
            energy = html.xpath("//span[contains(@class, 'energielabel')]/text()")[0].replace('\r','').replace('\n','').replace(' ','')
        except:
            energy = ""
        
        print("Retrieved:", address)
        df_temp = pd.DataFrame({'link':url,
                                'address':[address],
                                'city':[city],
                                'price':[price],
                                'bouwjaar': [bouwjaar],
                                'kamers': [kamers],
                                'woonen_m2':[woonen_m2],
                                'perceel_m2':[perceel_m2],
                                'makelaars': [makelaars],
                                'makelaars_phone':[makelaars_phone],
                               'energy':[energy]})
        
        df = pd.concat([df,df_temp], axis=0, ignore_index=True)
    
    try:
        df['slaapkamers'] = df['kamers'].apply(lambda x: str(x.split(' slaapkamers')[0])[-1] if 'slaapkamers' in str(x)  else str(x).replace(' kamer','').replace('s',''))
    except:
        df['slaapkamers'] = ""

    try:
        df['bouwjaar'] = df['bouwjaar'].apply(lambda x: x.split('-')[0])
    except:
        df['slaapkamers'] = ""
    return df

<h1 align='center'>=========================================================</h1>

## Using the function

**Note:** we use the df_current (coming from Google Drive Spreadsheet)

In [152]:
df_newentries = funda_automatic_crawler('hengelo-ov',550000,df_current)

Links from page 1 retrieved!
Links from page 2 retrieved!
Links from page 3 retrieved!
Links from page 4 retrieved!
Links from page 5 retrieved!
Links from page 6 retrieved!
Links from page 7 retrieved!
Links from page 8 retrieved!
Links from page 9 retrieved!
Links from page 10 retrieved!
Links from page 11 retrieved!
Links from page 12 retrieved!
Links from page 13 retrieved!
Links from page 14 retrieved!
Links from page 15 retrieved!
Links from page 16 retrieved!
Links from page 17 retrieved!
Links from page 18 retrieved!
Links from page 19 retrieved!
Links from page 20 retrieved!
Links from page 21 retrieved!
Links from page 22 retrieved!
Links from page 23 retrieved!
Links from page 24 retrieved!
Links from page 25 retrieved!
Links from page 26 retrieved!
Links from page 27 retrieved!
Links from page 28 retrieved!
Links from page 29 retrieved!
Links from page 30 retrieved!
Links from page 31 retrieved!
Links from page 32 retrieved!
Links from page 33 retrieved!
Links from page 34 

Retrieved: Paul van Kempenstraat 24 7558 HA Hengelo (OV) 
Retrieved: Albert Sommerstraat 17 7558 DW Hengelo (OV) 
Retrieved: Bouwnummer (Bouwnr. 6) 7556 PN Hengelo (OV) 
Retrieved: Athenestraat (Bouwnummer 10) 7552 BR Hengelo (OV) 
Retrieved: Industriestraat 208 7553 CW Hengelo (OV) 
Retrieved: Woolderesweg 38 7555 LA Hengelo (OV) 
Retrieved: 't Loo 1 7553 DE Hengelo (OV) 
Retrieved: Lindenweg 1 7556 HH Hengelo (OV) 
Retrieved: Frederikstraat 32 7553 KE Hengelo (OV) 
Retrieved: Thorbeckestraat 14 7553 AT Hengelo (OV) 
Retrieved: Bouwnummer (Bouwnr. 80) 7556 Hengelo (OV) 
Retrieved: C.T. Storkstraat 12 7553 AR Hengelo (OV) 
Retrieved: Deldenerstraat 171 7555 AB Hengelo (OV) 
Retrieved: Bouwnummer (Bouwnr. 46) 7556 Hengelo (OV) 
Retrieved: Josef Haydnlaan 18 7557 CS Hengelo (OV) 
Retrieved: Plein 1918 1 7553 BJ Hengelo (OV) 
Retrieved: Wethouder Kampstraat 177 7553 ZK Hengelo (OV) 
Retrieved: Wolter ten Catestraat 43 7551 HX Hengelo (OV) 
Retrieved: Arturo Toscaninistraat 18 7558 DX Heng

Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda
Retrieved: Robot - funda


In [154]:
df_newentries['id'] = ""
df_newentries['bike_time'] = ""
df_newentries['status'] = "new"
df_newentries['wos_waarde'] = ""
df_newentries['huispedia'] = ""

<h1 align='center'>=========================================================</h1> 

# PART III: enriching the data

# Bike Distance from/to SOMEWHERE

In [155]:
%run gmaps_bike_distance_duration.ipynb

In [156]:
# point_b = 'Enschede Railway Station, Enschede'
point_b = 'Stationsplein, Hengelo'
 
df_newentries['bike_time'] = df_newentries['address'].apply(lambda x: gmaps_bike_time(gmaps_key,x,point_b))

Paul van Kempenstraat 24 7558 HA Hengelo (OV)  -> Stationsplein, Hengelo : 14 mins
Albert Sommerstraat 17 7558 DW Hengelo (OV)  -> Stationsplein, Hengelo : 14 mins
Bouwnummer (Bouwnr. 6) 7556 PN Hengelo (OV)  -> Stationsplein, Hengelo : 11 mins
Athenestraat (Bouwnummer 10) 7552 BR Hengelo (OV)  -> Stationsplein, Hengelo : 5 mins
Industriestraat 208 7553 CW Hengelo (OV)  -> Stationsplein, Hengelo : 7 mins
Woolderesweg 38 7555 LA Hengelo (OV)  -> Stationsplein, Hengelo : 8 mins
't Loo 1 7553 DE Hengelo (OV)  -> Stationsplein, Hengelo : 5 mins
Lindenweg 1 7556 HH Hengelo (OV)  -> Stationsplein, Hengelo : 7 mins
Frederikstraat 32 7553 KE Hengelo (OV)  -> Stationsplein, Hengelo : 4 mins
Thorbeckestraat 14 7553 AT Hengelo (OV)  -> Stationsplein, Hengelo : 6 mins
Bouwnummer (Bouwnr. 80) 7556 Hengelo (OV)  -> Stationsplein, Hengelo : 8 mins
C.T. Storkstraat 12 7553 AR Hengelo (OV)  -> Stationsplein, Hengelo : 5 mins
Deldenerstraat 171 7555 AB Hengelo (OV)  -> Stationsplein, Hengelo : 6 mins
Bo

Willem Kloosstraat 47 7552 LW Hengelo (OV)  -> Stationsplein, Hengelo : 10 mins
Jan Steenstraat 24 7556 GC Hengelo (OV)  -> Stationsplein, Hengelo : 8 mins
Louis van Tulderstraat 59 7558 JM Hengelo (OV)  -> Stationsplein, Hengelo : 16 mins
Industriestraat 100 7553 CS Hengelo (OV)  -> Stationsplein, Hengelo : 5 mins
Oude Postweg 66 7557 DE Hengelo (OV)  -> Stationsplein, Hengelo : 6 mins
Rozenstraat 29 7555 CG Hengelo (OV)  -> Stationsplein, Hengelo : 5 mins
Bouwnummer (Bouwnr. 77) 7556 Hengelo (OV)  -> Stationsplein, Hengelo : 8 mins
Geerdinksweg 76 a 7555 DP Hengelo (OV)  -> Stationsplein, Hengelo : 6 mins
Eduard Flipsestraat 39 7558 DT Hengelo (OV)  -> Stationsplein, Hengelo : 14 mins
Bouwnummer (Bouwnr. 80) 7555 DL Hengelo (OV)  -> Stationsplein, Hengelo : 6 mins
Else Mauhsstraat 21 7558 RE Hengelo (OV)  -> Stationsplein, Hengelo : 17 mins
Bornsestraat 290 7556 BM Hengelo (OV)  -> Stationsplein, Hengelo : 8 mins
Oelerweg 93 7555 GL Hengelo (OV)  -> Stationsplein, Hengelo : 7 mins
At

<h1 align='center'>=========================================================</h1> 

## Merging the new entries with a Google drive workbook

In [157]:
df_merged = df_current.append(df_newentries).reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


## Saving merged df into a Google drive workbook

In [158]:
set_with_dataframe(worksheet, df_merged[['id','status','link','address','bike_time','makelaars','makelaars_phone','price','huispedia','wos_waarde','bouwjaar','perceel_m2','woonen_m2','slaapkamers', 'energy']])