# Crawler for Rebohuurwoning.nl and notification via Telegram bot

## LIBRARIES

In [93]:
import pandas as pd
import cfscrape
from lxml import etree
from datetime import datetime
# Show full cell contents when frames are displayed (the listing URLs are long).
# `None` means "no limit"; the older `-1` spelling was deprecated in pandas 1.0
# and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

## FUNCTION TO SEND TELEGRAM MESSAGE!

In [177]:
import requests
def telegram_bot_sendtext(bot_message):
    """Send ``bot_message`` to a Telegram chat through the Bot API.

    The bot token and chat id are read from the ``TELEGRAM_BOT_TOKEN`` and
    ``TELEGRAM_CHAT_ID`` environment variables when they are set; otherwise
    the placeholders below are used and have to be edited by hand.

    Parameters
    ----------
    bot_message : str
        Text of the message; it is sent with ``parse_mode=Markdown``.

    Returns
    -------
    The decoded JSON body of the ``sendMessage`` response.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or does not complete within the timeout.
    """
    import os  # local import: keeps this cell runnable on its own

    bot_token = os.environ.get('TELEGRAM_BOT_TOKEN', '<YOUR_TOKEN>')
    bot_chatID = os.environ.get('TELEGRAM_CHAT_ID', '<YOUR_CHATID>')
    endpoint = 'https://api.telegram.org/bot' + bot_token + '/sendMessage'

    # Hand the fields to requests as `params` so they are percent-encoded.
    # Concatenating the raw text into the URL breaks as soon as the message
    # contains characters such as '&', '#' or '+'.
    query = {
        'chat_id': bot_chatID,
        'parse_mode': 'Markdown',
        'text': bot_message,
    }
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(endpoint, params=query, timeout=30)
    return response.json()

# for emoji: https://apps.timwhitlock.info/emoji/tables/unicode

## VALID HEADER FOR THE HTTP REQUEST

In [None]:
# Browser-like HTTP headers passed along with every scraper request.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'nl-NL,nl;q=0.8,en-US;q=0.6,en;q=0.4',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36'
    ),
}

## URL EXAMPLE FOR REBOHUURWONING

In [2]:
# Search-results page to crawl. The query string is spelled out one group per
# line; the pieces are adjacent literals, so they join into a single URL.
# NOTE(review): -1 presumably means "no filter" for the price/type fields - confirm.
url = (
    "https://www.rebohuurwoning.nl/nl/aanbod/"
    "?location=Utrecht"
    "&location_params=street%3DAmerikalaan%26city%3DUtrecht%26country%3DNederland%26type%3Dcity"
    "&price_min=-1&price_max=-1"
    "&property_type=-1&property_subtype=-1"
    "&sorting=date_desc"
)

## INSTANTIATING SCRAPER 

In [3]:
# One cfscrape session is created here and reused for every later request.
scraper = cfscrape.create_scraper()

# Download the search-results page and keep the raw bytes around for debugging.
listing_response = scraper.get(url, headers=header)
scraped_html = listing_response.content
# print(scraped_html.decode("utf-8"))

# Parse the bytes into an lxml tree that the XPath queries below run against.
html = etree.HTML(scraped_html)

<!DOCTYPE html>
<html lang="nl">
<head>
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-K2S2VJV');</script>
<title>Woningen te huur in Utrecht, Zwolle Amersfoort, Deventer, Dieren</title>
<meta charset="utf-8">
<meta name="description" content="Op zoek naar een huurwoning? ☑️ REBO heeft het grootste aanbod woningen in de regio Utrecht, Zwolle, Amersfoort, Deventer en Dieren ▶️ ">
<meta name="keywords" content="">
<meta name="viewport" content="width=device-width, initial-scale=1.0">

<script src="/lib/js/jquery/jquery.min.js"></script><link href="https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,200,200italic,300,300italic,400italic,600,600italic,700italic,700,900,900italic" rel=

## GETTING ALL LINKS ON THE FIRST PAGE

In [41]:
# Every anchor target on the results page, as a one-column frame (column 0).
df_all_links = pd.DataFrame(html.xpath("//a/@href"))

# Keep the hrefs that mention 'aanbod', skip the first hit, and prefix the site
# root so the paths become absolute URLs. `links` stays a DataFrame whose
# column 0 holds the URLs.
mentions_aanbod = df_all_links[0].str.contains('aanbod')
listing_paths = df_all_links[mentions_aanbod].iloc[1:]
links = listing_paths.apply(lambda col: 'https://www.rebohuurwoning.nl' + col).reset_index(drop=True)
links.head()

## LOOP FOR SCRAPING THE CONTENT OF EACH LINK

In [179]:
# df = pd.DataFrame(columns=['link',
#                            'address',
#                            'price',
#                            'slaapkamers',
#                            'kamers',
#                            'woonoppervlakte',
#                            'oplevering',
#                            'first_seen',
#                            'last_seen',
#                           'type_woning'])

In [192]:
# Load the listings recorded by previous runs. On the very first run the CSV
# does not exist yet, so start from an empty table with the expected columns
# instead of failing with FileNotFoundError.
try:
    # 'Unnamed: 0' is the index column written by an earlier `to_csv`; it is
    # dropped only when present so a file saved without an index loads as well.
    df = pd.read_csv('finding_house.csv').drop(columns=['Unnamed: 0'], errors='ignore')
except FileNotFoundError:
    df = pd.DataFrame(columns=['link',
                               'address',
                               'price',
                               'slaapkamers',
                               'kamers',
                               'woonoppervlakte',
                               'oplevering',
                               'first_seen',
                               'last_seen',
                               'type_woning'])

In [194]:
# XPath for a value cell: the <div> that follows the <div> holding the label.
_DETAIL_XPATH = "//div[text()='{}']/following-sibling::div/text()"


def _timestamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM'."""
    return datetime.now().strftime('%Y-%m-%d %H:%M')


def _first_text(tree, xpath, default=""):
    """Return the first result of ``xpath`` on ``tree``, or ``default`` if there is none."""
    try:
        return tree.xpath(xpath)[0]
    except (IndexError, AttributeError):
        # IndexError: nothing matched. AttributeError: the page did not parse
        # into a tree (``tree`` is None).
        return default


def _detail(tree, label):
    """Return the text shown next to ``label`` on a listing page, or ''."""
    return _first_text(tree, _DETAIL_XPATH.format(label))


def _format_price(raw):
    """Turn the scraped price text into '€1,234'; leave it as text if it is not a number."""
    cleaned = raw.replace('€ ', '').replace(',- \xa0', '').replace('.', '')
    try:
        return '€{:,.0f}'.format(float(cleaned))
    except ValueError:
        # A missing price ('') or non-numeric text must not abort the whole run.
        return cleaned


# Records for links seen for the first time in this run, keyed by link so that
# a URL appearing twice on the results page is fetched and stored only once.
new_rows = {}

for link in links[0]:
    # Link already stored by a previous run: just refresh its last-seen date.
    known = df['link'] == link
    if known.any():
        df.loc[known, 'last_seen'] = _timestamp()
        continue

    # Link already handled earlier in this same run.
    if link in new_rows:
        new_rows[link]['last_seen'] = _timestamp()
        continue

    # Fetch and parse the detail page of the new link. A dedicated name is used
    # so the notebook-level `html` (the results page) is not overwritten.
    detail_page = etree.HTML(scraper.get(link, headers=header).content)

    first_seen = _timestamp()

    # NOTE(review): the saved table shows 'kamers' empty for every row, so the
    # 'Aantal kamers' label may not match the site's markup - verify.
    new_rows[link] = {
        'link': link,
        'address': _first_text(detail_page, "//title/text()").replace('Te huur: ', ''),
        'price': _format_price(_detail(detail_page, 'Huurprijs')),
        'slaapkamers': _detail(detail_page, 'Aantal Slaapkamers'),
        'kamers': _detail(detail_page, 'Aantal kamers'),
        'woonoppervlakte': _detail(detail_page, 'Woonoppervlakte').replace(' m2', ''),
        'oplevering': _detail(detail_page, 'Oplevering'),
        'first_seen': first_seen,
        'last_seen': first_seen,
        'type_woning': _detail(detail_page, 'Soort woning'),
    }

# Build the "new this run" table once, then add it to the stored listings in a
# single concat (DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every call).
df_notify = pd.DataFrame(list(new_rows.values()))
if len(df_notify) > 0:
    df = pd.concat([df, df_notify], ignore_index=True)

# Report the outcome of this run on Telegram: either the list of newly found
# links (one per line) or a note that nothing new showed up.
new_link_count = len(df_notify)
if new_link_count == 0:
    telegram_bot_sendtext(u'\U0001F613' + "THERE ARE NO NEW ENTRIES AT REBO'S WEBSITE!!")
else:
    link_lines = "\n".join(df_notify['link'])
    telegram_bot_sendtext(u'\U0001F61C' + "NEW LINKS ON REBO'S WEBSITE:\n" + link_lines)

In [196]:
# Persist the updated table for the next run. The row index is written too; it
# comes back as the 'Unnamed: 0' column that the load cell drops.
df.to_csv(path_or_buf='finding_house.csv', index=True)

In [197]:
# Display the full listings table (stored rows plus any added in this run).
df

Unnamed: 0,link,address,price,slaapkamers,kamers,woonoppervlakte,oplevering,first_seen,last_seen,type_woning
0,https://www.rebohuurwoning.nl/nl/aanbod/utrecht-van-der-marckstraat-31-1888,"Van der Marckhof 31, Utrecht","€1,315",4.0,,130.0,01-12-2019,2019-09-22 00:27,2019-09-22 00:38,Eengezinswoning Tussenwoning
1,https://www.rebohuurwoning.nl/nl/aanbod/utrecht-jongeneelstraat-32-1885,Utrecht Jongeneelstraat 32,"€1,965",3.0,,133.0,,2019-09-22 00:27,2019-09-22 00:38,Appartement
2,https://www.rebohuurwoning.nl/nl/aanbod/utrecht-laan-van-nieuw-guinea-91-bis-1862,"Laan van Nieuw Guinea 91 BIS, Utrecht",€950,3.0,,,Direct,2019-09-22 00:27,2019-09-22 00:38,Bovenwoning 4 kamer woning
3,https://www.rebohuurwoning.nl/nl/aanbod/utrecht-rolderdiephof-122-1854,Utrecht Rolderdiephof 122,"€1,565",3.0,,117.0,,2019-09-22 00:27,2019-09-22 00:38,Massionette
4,https://www.rebohuurwoning.nl/nl/aanbod/utrecht-john-coltranestraat-132-1851,"John Coltranestraat 132, Utrecht","€1,410",3.0,,162.0,,2019-09-22 00:27,2019-09-22 00:38,Appartement 4 kamer appartement
5,https://www.rebohuurwoning.nl/nl/aanbod/utrecht-purpervlinderstraat-7-1850,Purpervlinderstraat 7,"€1,176",4.0,,130.0,,2019-09-22 00:27,2019-09-22 00:38,Eengezinswoning 6 kamer woning
6,https://www.rebohuurwoning.nl/nl/aanbod/utrecht-appaloosastraat-5-1791,Utrecht Appaloosastraat 5,"€1,410",4.0,,126.0,01-09-2019,2019-09-22 00:27,2019-09-22 00:38,Tussenwoning
7,https://www.rebohuurwoning.nl/nl/aanbod/utrecht-rolderdiephof-76-1788,Utrecht Rolderdiephof 76,"€1,685",2.0,,120.0,15/10/2019,2019-09-22 00:27,2019-09-22 00:38,Appartement
8,https://www.rebohuurwoning.nl/nl/aanbod/utrecht-musicallaan-175-1637,"Musicallaan 175, Utrecht","€1,070",2.0,,135.0,01-11-2019,2019-09-22 00:27,2019-09-22 00:38,Appartement
9,https://www.rebohuurwoning.nl/nl/aanbod/utrecht-hollandse-toren-52-1636,Utrecht Hollandse Toren 52,"€1,807",3.0,,146.0,Per direct,2019-09-22 00:27,2019-09-22 00:38,Appartement
