In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup 
import pandas as pd
import numpy as np
import re
import requests
from random import *
from tqdm import tqdm #progress bar
from datetime import datetime
import os
import glob
import pickle #for saving data

In [2]:
def create_links_table(base_link = 'https://www.funda.nl/en/koop/heel-nederland/p', sleep_time = 3):

    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2} # this is to not load images
    chrome_options.add_experimental_option("prefs", prefs)

    #start a driver
    driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)
    driver.get(base_link + '1/')
    time.sleep(sleep_time)

    one_page_source = BeautifulSoup(driver.page_source, 'html.parser')

    try:
        pagination_list = one_page_source.find_all('div', {'class': 'pagination-pages'})
        pagination_list_a = pagination_list[0].find_all('a')
        
        pagination_list_a_len = len(pagination_list_a)
        paginattion_list_a_text = pagination_list_a[pagination_list_a_len-1].text
        
        #replace string
        paginattion_list_a_text = paginattion_list_a_text.replace(',', '')
        paginattion_list_a_text = paginattion_list_a_text.replace('.', '')
        
        #extract only numbers from the text
        paginattion_list_a_text_num = re.findall(r'\d+', paginattion_list_a_text)
        print('last page: ', paginattion_list_a_text_num[0])
        paginattion_list_a_text_num = int(paginattion_list_a_text_num[0])
    except:
        paginattion_list_a_text_num = 1
        

    links_table = pd.DataFrame(columns = ['link'])

    for i in range(1, paginattion_list_a_text_num + 1):
        one_link =  base_link+ str(i) + '/'
        links_table = links_table.append({'link': one_link}, ignore_index = True)
    
    driver.close()

    return(links_table)

In [3]:
base_link_table_all = create_links_table()



Current google-chrome version is 97.0.4692
Get LATEST driver version for 97.0.4692
Driver [C:\Users\menyh\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)


last page:  2340


In [4]:
base_link_table_all

Unnamed: 0,link
0,https://www.funda.nl/en/koop/heel-nederland/p1/
1,https://www.funda.nl/en/koop/heel-nederland/p2/
2,https://www.funda.nl/en/koop/heel-nederland/p3/
3,https://www.funda.nl/en/koop/heel-nederland/p4/
4,https://www.funda.nl/en/koop/heel-nederland/p5/
...,...
2335,https://www.funda.nl/en/koop/heel-nederland/p2...
2336,https://www.funda.nl/en/koop/heel-nederland/p2...
2337,https://www.funda.nl/en/koop/heel-nederland/p2...
2338,https://www.funda.nl/en/koop/heel-nederland/p2...


In [5]:
rotterdam_link_table_all = create_links_table(base_link = 'https://www.funda.nl/en/koop/rotterdam/p', sleep_time = 3)



Current google-chrome version is 97.0.4692
Get LATEST driver version for 97.0.4692
Driver [C:\Users\menyh\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)


last page:  84


In [6]:
def get_links_one_page(one_link_page = 'https://www.funda.nl/en/koop/heel-nederland/p1/', sleep_time = 1):
    
    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2} # this is to not load images
    chrome_options.add_experimental_option("prefs", prefs)

    #start a driver
    driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)
    driver.get(one_link_page)
    time.sleep(sleep_time)

    one_page_source = BeautifulSoup(driver.page_source, 'html.parser')
    
    search_result_containers = one_page_source.find_all('div', {'class': 'search-result-content-inner'})
    
    links_one_page = []

    i = 1

    try:
        for search_result_container in search_result_containers:
            resultlist = search_result_container.find_all('a', {'data-object-url-tracking': 'resultlist'})
            resultlist_link = resultlist[0]['href']
            links_one_page.append(resultlist_link)
            i = i + 1
    except:
        pass

    driver.close()

    return(links_one_page)

In [7]:
def get_links_multiple_page(base_link_list = base_link_table_all['link'][:3], sleep_time_per_page = 1, save_to_csv = True):

    df = pd.DataFrame()

    for link in tqdm(base_link_list):

        links_one_page = get_links_one_page(one_link_page = link, sleep_time = sleep_time_per_page)

        #create a dataframe with a column
        df_links = pd.DataFrame(links_one_page, columns = ['ad_link'])
        df_links['page_link'] = link #via this we can see which make and model the links belong to
        #datetime string
        now = datetime.now() 
        datetime_string = str(now.strftime("%Y%m%d_%H%M%S"))
        df_links['link_download_date_time'] = datetime_string

        #save the dataframe if save_to_csv is True
        if save_to_csv:
            #check if folder exists and if not create it
            if not os.path.exists('data/ad_links'):
                os.makedirs('data/ad_links')

            df_links.to_csv(str('data/ad_links/links_on_one_page_df' + datetime_string + '.csv'), index = False)

        df = pd.concat([df, df_links], ignore_index=True)
    
     #drop duplicates
    df = df.drop_duplicates()

    return(df)


In [8]:
def get_buy_data(base_link = 'https://www.funda.nl/en/koop/rotterdam/p', sleep_time = 3, save_to_csv = True, max_len = 10):

    base_table = create_links_table(base_link = base_link, sleep_time = 3)

    # check if max_len is integer
    if isinstance(max_len, int):
        ads_link_data = get_links_multiple_page(base_link_list = base_table['link'][:max_len], save_to_csv=save_to_csv)
    elif max_len == 'all':
        ads_link_data = get_links_multiple_page(base_link_list = base_table['link'], save_to_csv=save_to_csv)
    else:
        add_link_data = "max_len is not an integer or 'all'"

    return(ads_link_data)   
    

In [9]:
rotterdam_link_data = get_buy_data(max_len = 'all')



Current google-chrome version is 97.0.4692
Get LATEST driver version for 97.0.4692
Driver [C:\Users\menyh\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)


last page:  84


  0%|          | 0/84 [00:00<?, ?it/s]

Current google-chrome version is 97.0.4692
Get LATEST driver version for 97.0.4692
Driver [C:\Users\menyh\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
  1%|          | 1/84 [00:06<08:52,  6.42s/it]

Current google-chrome version is 97.0.4692
Get LATEST driver version for 97.0.4692
Driver [C:\Users\menyh\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
  2%|▏         | 2/84 [00:20<15:10, 11.10s/it]

Current google-chrome version is 97.0.4692
Get LATEST driver version for 97.0.4692
Driver [C:\Users\menyh\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
  4%|▎         | 3/84 [00:26<11:42,  8.68s/it]

Current google-chrome version is 97.0.4692
Get LATEST driver version for 97.0.4692
Driver [C:\Users\menyh\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
  5%|▍         | 4/84 [00:36<12:13,  9.16s/it]

Current google-chrome version is

In [10]:
rotterdam_link_data

Unnamed: 0,ad_link,page_link,link_download_date_time
0,https://www.funda.nl/en/koop/rotterdam/huis-42...,https://www.funda.nl/en/koop/rotterdam/p1/,20220124_234315
1,https://www.funda.nl/en/koop/rotterdam/huis-42...,https://www.funda.nl/en/koop/rotterdam/p1/,20220124_234315
2,https://www.funda.nl/en/koop/rotterdam/huis-42...,https://www.funda.nl/en/koop/rotterdam/p1/,20220124_234315
3,https://www.funda.nl/en/koop/rotterdam/huis-42...,https://www.funda.nl/en/koop/rotterdam/p1/,20220124_234315
4,https://www.funda.nl/en/koop/rotterdam/huis-42...,https://www.funda.nl/en/koop/rotterdam/p1/,20220124_234315
...,...,...,...
1244,https://www.funda.nl/en/koop/rotterdam/huis-87...,https://www.funda.nl/en/koop/rotterdam/p83/,20220124_235855
1245,https://www.funda.nl/en/koop/rotterdam/apparte...,https://www.funda.nl/en/koop/rotterdam/p84/,20220124_235904
1246,https://www.funda.nl/en/koop/rotterdam/apparte...,https://www.funda.nl/en/koop/rotterdam/p84/,20220124_235904
1247,https://www.funda.nl/en/koop/rotterdam/apparte...,https://www.funda.nl/en/koop/rotterdam/p84/,20220124_235904
