# Crawler for Rebohuurwoning.nl with notifications via a Telegram bot

## LIBRARIES

In [None]:
import pandas as pd
import cfscrape
from lxml import etree
from datetime import datetime
import requests
# Show full cell contents (e.g. long listing URLs) instead of truncating them.
# None is the supported "no limit" value; the legacy -1 is rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

## FUNCTION TO SEND TELEGRAM MESSAGE!

In [None]:
def telegram_bot_sendtext(bot_message):
    """Send *bot_message* to a Telegram chat through the Bot API ``sendMessage`` method.

    The bot token and chat id are read from the ``TELEGRAM_BOT_TOKEN`` and
    ``TELEGRAM_BOT_CHAT_ID`` environment variables so that no secret has to be
    stored in the notebook.

    Args:
        bot_message: Text to send; it is submitted with ``parse_mode=Markdown``.

    Returns:
        The decoded JSON body of the HTTP response.

    Raises:
        RuntimeError: If either environment variable is missing or empty.
    """
    import os

    bot_token = os.environ.get('TELEGRAM_BOT_TOKEN', '')
    bot_chatID = os.environ.get('TELEGRAM_BOT_CHAT_ID', '')
    if not bot_token or not bot_chatID:
        raise RuntimeError(
            'Set TELEGRAM_BOT_TOKEN and TELEGRAM_BOT_CHAT_ID before sending messages.'
        )

    send_text = 'https://api.telegram.org/bot' + bot_token + '/sendMessage'
    # Passing the values as query parameters lets requests URL-encode them, so
    # characters such as '&', '#', newlines or the euro sign cannot corrupt the query.
    query = {
        'chat_id': bot_chatID,
        'parse_mode': 'Markdown',
        'text': bot_message,
    }
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(send_text, params=query, timeout=30)
    return response.json()

# for emoji: https://apps.timwhitlock.info/emoji/tables/unicode

## VALID HEADER FOR THE HTTP REQUEST

In [None]:
# Browser-like request headers that are sent along with every page request.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'nl-NL,nl;q=0.8,en-US;q=0.6,en;q=0.4',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36'
    ),
}

## URL EXAMPLE FOR REBOHUURWONING

In [None]:
# Search-results URL, written one query parameter per line for readability.
# Inside location_params, "%3D" is an encoded "=" and "%26" an encoded "&".
url = (
    "https://www.rebohuurwoning.nl/nl/aanbod/"
    "?location=Utrecht"
    "&location_params=street%3DAmerikalaan%26city%3DUtrecht%26country%3DNederland%26type%3Dcity"
    "&price_min=-1"
    "&price_max=-1"
    "&property_type=-1"
    "&property_subtype=-1"
    "&sorting=date_desc"
)

## INSTANTIATING SCRAPER 

In [None]:
# Create the cfscrape session; the same session is reused for the per-listing requests.
scraper = cfscrape.create_scraper()

# Download the search-results page as raw bytes, sending the browser-like headers.
overview_response = scraper.get(url, headers=header)
scraped_html = overview_response.content
# Debugging aid: print(scraped_html.decode("utf-8"))

# Parse the bytes into an lxml element tree so XPath queries can be run against it.
html = etree.HTML(scraped_html)

## GETTING ALL LINKS ON THE FIRST PAGE

In [None]:
# Every anchor href found on the overview page, as a one-column frame (column 0).
df_all_links = pd.DataFrame(html.xpath("//a/@href"))

# Keep only hrefs that mention 'aanbod' and drop the first match
# (presumably a navigation link rather than a listing - TODO confirm).
is_offer_href = df_all_links[0].str.contains('aanbod')
offer_hrefs = df_all_links[is_offer_href].iloc[1:]

# Prefix the site root to each href and renumber the rows from zero.
links = offer_hrefs.apply(lambda column: 'https://www.rebohuurwoning.nl' + column)
links = links.reset_index(drop=True)
# Quick look at the result: links.head()

## LOOP FOR SCRAPING THE CONTENT OF EACH LINK

In [None]:
# df = pd.DataFrame(columns=['link',
#                            'address',
#                            'price',
#                            'slaapkamers',
#                            'kamers',
#                            'woonoppervlakte',
#                            'oplevering',
#                            'first_seen',
#                            'last_seen',
#                           'type_woning'])

In [None]:
# Load the listings recorded by earlier runs. On the very first run the file does
# not exist yet, so start from an empty table with the expected columns instead of
# crashing with FileNotFoundError.
try:
    df = pd.read_csv('finding_house.csv')
except FileNotFoundError:
    df = pd.DataFrame(columns=['link',
                               'address',
                               'price',
                               'slaapkamers',
                               'kamers',
                               'woonoppervlakte',
                               'oplevering',
                               'first_seen',
                               'last_seen',
                               'type_woning'])
else:
    # The file is saved together with its index, which comes back as 'Unnamed: 0';
    # errors='ignore' also accepts a file that was written without that column.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Crawl every listing link found on the overview page:
#   * links already present in `df` only get their 'last_seen' timestamp refreshed;
#   * unseen links are downloaded, parsed and collected as new rows.
# Afterwards the new rows are added to `df` and announced through Telegram.


def _first_text(tree, query):
    """Return the first result of XPath *query* on *tree*, or '' when there is none."""
    if tree is None:
        return ""
    matches = tree.xpath(query)
    return matches[0] if matches else ""


def _detail(tree, label):
    """Return the text of the <div> that follows the <div> whose text is *label*, or ''."""
    return _first_text(tree, "//div[text()='" + label + "']/following-sibling::div/text()")


def _format_price(raw_price):
    """Format a numeric price string as e.g. '€1,250'; return other input unchanged."""
    try:
        return '€{:,.0f}'.format(float(raw_price))
    except ValueError:
        # Missing or non-numeric price: keep the raw text instead of aborting the crawl.
        return raw_price


new_rows = []
# Links known so far; extended during the loop so that a link occurring more than
# once on the page is only scraped once.
known_links = set(df['link'])

for link in links[0]:
    # If the link already exists, only update its last-seen date.
    if link in known_links:
        existing = df.index[df['link'] == link]
        if len(existing) > 0:
            df.at[existing[0], 'last_seen'] = datetime.now().strftime('%Y-%m-%d %H:%M')
        continue
    known_links.add(link)

    # Download and parse the detail page of the new listing.
    scraped_html = scraper.get(link, headers=header).content
    html = etree.HTML(scraped_html)

    first_seen = datetime.now().strftime('%Y-%m-%d %H:%M')
    last_seen = first_seen

    address = _first_text(html, "//title/text()").replace('Te huur: ', '')
    # Strip the currency sign, the ",- " suffix and the thousands separators.
    price = (_detail(html, 'Huurprijs')
             .replace('€ ', '')
             .replace(',- \xa0', '')
             .replace('.', ''))
    slaapkamers = _detail(html, 'Aantal Slaapkamers')
    kamers = _detail(html, 'Aantal kamers')
    woonoppervlakte = _detail(html, 'Woonoppervlakte').replace(' m2', '')
    oplevering = _detail(html, 'Oplevering')
    type_woning = _detail(html, 'Soort woning')

    new_rows.append({'link': link,
                     'address': address,
                     'price': _format_price(price),
                     'slaapkamers': slaapkamers,
                     'kamers': kamers,
                     'woonoppervlakte': woonoppervlakte,
                     'oplevering': oplevering,
                     'first_seen': first_seen,
                     'last_seen': last_seen,
                     'type_woning': type_woning})

# DataFrame.append no longer exists in current pandas; build the frames in one go.
df_notify = pd.DataFrame(new_rows)
if new_rows:
    df = pd.concat([df, df_notify], ignore_index=True)

if len(df_notify) > 0:
    telegram_bot_sendtext(u'\U0001F61C' + "NEW LINKS ON REBO'S WEBSITE:\n")
    # One message per new listing.
    for index, row in df_notify.iterrows():
        telegram_bot_sendtext('Price:' + str(row['price']) + '\n' +
                              'Woonoppervlakte: ' + str(row['woonoppervlakte']).replace('nan', 'NOT SPECIFIED').replace('.0', 'm2') + '\n' +
                              'Slaapkamers: ' + str(row['slaapkamers']).replace('nan', 'NOT SPECIFIED').replace('.0', '') + '\n' +
                              str(row['link']))
else:
    telegram_bot_sendtext(u'\U0001F613' + "THERE ARE NO NEW ENTRIES AT REBO'S WEBSITE!!")

In [None]:
# Persist the updated table for the next run. The index is written as well; it is
# the column that shows up as 'Unnamed: 0' when the file is read back.
df.to_csv('finding_house.csv', index=True)