# BoligPortal Scraper (with Selenium)

### Relevant Libraries

In [None]:
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup as bs

from datetime import datetime
import os

import pandas as pd
import numpy as np

### ProgressBar Function to follow the scraper's progress

In [None]:
from functions import printProgressBar

### Selenium start-up

In [None]:
# <<< Insert your starting URL in the line below >>>

# First page of ads in Kbh Kommune and Frederiksberg Kommune.
# The URL-generation cell appends '&startRecord=N' to this value, so it must already carry a query string.
start_url='https://www.boligportal.dk/find?placeIds=15%2C365&minRentalPeriod=2' 

# Start Chrome through a chromedriver binary at a machine-specific path, then open the first results page.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a Service object -
# confirm the installed Selenium version still accepts it.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(executable_path="/Users/Francesco/Documents/Learn/chromedriver", options=options)
driver.get(start_url)
# Short fixed pause after navigation, before a later cell reads driver.page_source
sleep(1)

### Creating a results folder

In [None]:
# Create a folder named after today's date (ddmmyy) under the working directory for the .csv results

wd = "/Users/Francesco/Documents/Learn/Other/Franz/Boligportal"

now = datetime.now()
current_time = now.strftime("%d%m%y")  # date stamp, despite the variable name
dirName = current_time

try:
    # makedirs (unlike mkdir) also creates `wd` itself when it is missing,
    # instead of failing with an uncaught FileNotFoundError
    os.makedirs(os.path.join(wd, dirName))
    print("Directory " , dirName ,  " Created ") 
except FileExistsError:
    # Re-running on the same day reuses the existing folder
    print("Directory " , dirName ,  " already exists")

### URLs to scrape

In [None]:
# Extract the number of pages from the last Pagination button at the bottom of the page

soup = bs(driver.page_source, 'html')
page_links = soup.find_all('a', {'class': 'PaginationControls__page'})

# Without any pagination button there is nothing to index (previously an IndexError);
# fall back to a single page so the first page of results is still scraped
num_pages = int(page_links[-1].getText()) if page_links else 1

# Extract number of ads (upper bound: assumes 18 ads on every page, including the last)
num_ads = num_pages * 18

In [None]:
# Offset of the first ad on each results page: 0, 18, 36, ... (one entry per page)
start_record = np.arange(0, num_ads, step=18)

In [None]:
# Build one URL per results page by appending that page's start offset to the start URL

urls = [start_url + '&startRecord={}'.format(offset) for offset in start_record]

### Scraping

In [None]:
# Selenium will go through each page (from the URLs list) and extract the AdCardWrappers
# Then, it will extract each AdCard's information


def _ad_text(ad, css_class, default=-1):
    ''' Returns the text of the first <div> with the given class inside an ad card, or default when the card has no such element '''
    node = ad.find('div', {'class': css_class})
    return default if node is None else node.getText()


def _ad_link(ad, default=-1):
    ''' Returns 'boligportal.dk' + href of the card's itemprop="url" anchor, or default when the anchor or its href is missing '''
    anchor = ad.find('a', {'itemprop' : 'url'})
    href = None if anchor is None else anchor.get('href')
    return default if href is None else 'boligportal.dk' + href


data = []

# Count pages from 1 so the last page reports len(urls) of len(urls) instead of stopping one short.
# This also avoids urls.index(url), which rescans the list on every page and reports the wrong
# position when the same URL appears twice.
for page_number, url in enumerate(urls, start=1):

    driver.get(url)
    sleep(1)  # give the page a moment to render before reading its source
    soup = bs(driver.page_source, 'html')

    for ad in soup.find_all('div', {'class': 'AdCardWrapper'}):

        # Missing fields are stored as -1 so one incomplete card does not abort the run
        data.append({"Title": _ad_text(ad, 'AdCard__title'),
                     "Price": _ad_text(ad, 'AdCard__price'),
                     "Location": _ad_text(ad, 'AdCard__location'),
                     "When": _ad_text(ad, 'AdCard__date'),
                     "Description": _ad_text(ad, 'AdCard__description'),
                     "URL": _ad_link(ad)})

    printProgressBar(page_number, len(urls), prefix = 'Scraping...')

In [None]:
# Scraped data preview: the bare expression is shown as the notebook cell's output and is not stored
pd.DataFrame(data)

In [None]:
# Create a dataframe with the data: one row per scraped ad, one column per key of the ad dictionaries

df = pd.DataFrame(data)

In [None]:
#df = df.drop_duplicates(subset=None, keep='first', inplace=False)

### Functions to clean/manipulate the dataframe

In [None]:
def get_rooms(row):
    
    ''' Function to use with df.apply. Creates a Rooms column by extracting the number of rooms from the Ad title.

    Returns 0 when the title contains 'Værelse', 1 when the title holds exactly one number,
    the first number when it holds several, and -1 when no number can be extracted
    or the title is not text (e.g. the -1 placeholder stored for a failed scrape). '''

    title = row.Title

    # The substring test below raises TypeError on a non-text title, so reject it up front
    if not isinstance(title, str):
        return -1

    if 'Værelse' in title:
        return 0

    try:
        numbers = [int(token) for token in title.split() if token.isdigit()]
    except ValueError:
        # isdigit() accepts characters such as '²' that int() cannot convert
        return -1

    if not numbers:
        return -1

    # A lone number is presumably the size in m2 (get_m2 reads it that way), so count one room
    return 1 if len(numbers) == 1 else numbers[0]

In [None]:
def get_m2(row):
    
    ''' Function to use with df.apply. Creates a m2 column by extracting the square meters from the Ad title.

    Returns the second number in the title, or the only number when there is just one.
    Returns -1 when the title holds no usable number or is not text. '''

    try:
        numbers = [int(token) for token in row.Title.split() if token.isdigit()]
    except (AttributeError, ValueError):
        # AttributeError: the title is not text (e.g. the -1 placeholder of a failed scrape)
        # ValueError: isdigit() accepted a character such as '²' that int() cannot convert
        return -1

    if not numbers:
        return -1

    return numbers[1] if len(numbers) > 1 else numbers[0]

In [None]:
def en_et_to_one(column):
    
    ''' For the column holding the age of the Ad: turns 'Fremhævet' into 0, drops 'siden', then swaps the first 'en' and the first 'et' of each value for 1 '''

    # (text to find, replacement, max replacements per value; -1 means every occurrence).
    # Order matters: 'Fremhævet' and 'siden' themselves contain 'et' / 'en'.
    substitutions = (
        ('Fremhævet', '0', -1),
        ('siden', '', -1),
        ('en', '1', 1),
        ('et', '1', 1),
    )

    for old, new, limit in substitutions:
        column = column.str.replace(old, new, n=limit)

    return column

In [None]:
def posted_x_days_ago(row):
    
    ''' Extracts age of post in terms of days.

    Reads row.When, takes its first whole number and scales it by the unit found in the text:
    'dag' x1, 'uge' x7, 'måned' x30, 'år' x365. Text without any of these units gives 0.
    Returns -1 when the value is not text or names a unit without a number. '''

    when = row.When

    # A failed scrape leaves a non-text value here; the substring tests below would raise TypeError on it
    if not isinstance(when, str):
        return -1

    # Units are tested as substrings, in this order; months and years use 30 / 365 day approximations
    days_per_unit = (('dag', 1), ('uge', 7), ('måned', 30), ('år', 365))

    for unit, days in days_per_unit:
        if unit in when:
            try:
                numbers = [int(token) for token in when.split() if token.isdigit()]
            except ValueError:
                # isdigit() accepts characters such as '²' that int() cannot convert
                return -1
            # A unit without a number used to raise IndexError; report it as unknown instead
            return numbers[0] * days if numbers else -1

    return 0

### Dataframe cleaning/manipulation

In [None]:
# Row-wise: derive the room count from each ad title
df['Rooms'] = df.apply(get_rooms, axis=1)

In [None]:
# Row-wise: derive the size in square meters from each ad title
df['m2'] = df.apply(get_m2, axis=1)

In [None]:
# Remove the ",-" marker from the price text. Non-text values (the -1 placeholder of a failed
# scrape) pass through unchanged instead of raising AttributeError on .replace
df['Price'] = df['Price'].apply(lambda price: price.replace(",-", "") if isinstance(price, str) else price)

In [None]:
# Split the location on the first ", " only, so text with a second ", " cannot produce a third
# column and break the two-column assignment. reindex pins exactly two result columns even when
# no location contains the separator (the missing part is then NaN).
df[['Neighbourhood','Street']] = df.Location.str.split(", ", n=1, expand=True).reindex(columns=[0, 1])

In [None]:
# Normalise the ad-age text, derive the age in days from it, then discard the raw text column
df['When'] = en_et_to_one(df['When'])
df['Posted days ago'] = df.apply(posted_x_days_ago, axis=1)
df = df.drop(columns='When')

In [None]:
# Keep only the output columns, in their final order
output_columns = ['Title', 'Price', 'Rooms', 'm2', 'Neighbourhood', 'Street', 'Posted days ago', 'Description', 'URL']
df = df[output_columns]

In [None]:
# Lowest 'Posted days ago' first (ascending is the default order)
df = df.sort_values('Posted days ago')

### Export

In [None]:
# Write the table to <wd>/<dirName>/BoligPortal_KPH_<ddmmyy>.csv, keeping the index column and the header row
csv_name = 'BoligPortal_KPH_{}.csv'.format(datetime.now().strftime("%d%m%y"))
df.to_csv(os.path.join(wd, dirName, csv_name), index = True, header = True)