In [1]:
# Toolbox
import tools as tools

# Standard imports 
import pandas as pd


# OS and time packages 
import time
from pathlib import Path
import tqdm
import concurrent.futures

# HTML and text processing 
import json

Header (state non-commercial/academic intentions)

In [2]:
# Request header identifying who is scraping and why (academic use only).
header = {
    'name': 'Jørgen Baun Høst',
    'email': 'pjz633@econ.ku.dk',
    'intention': 'Scrape Boliga for academic purposes',
}

Meta data

In [3]:
# Example BBR lookup URL for a single property id; it is not requested in this cell.
bbr_test_url = 'https://api.boliga.dk/api/v2/bbrinfo/bbr?id=69cd6d3d-e858-43aa-b530-bd20f132e3b8'

# First page of the sold-homes search. The 'meta' entry of the response
# reports the total number of result pages for this query.
url = 'https://api.boliga.dk/api/v2/sold/search/results?pageSize=2000&page=1&salesDateMin=2000&salesDateMax=2008&propertytype=1&saleType=1&sort=date-d&buildYearMax=2005'
output = tools.get_json(header=header, url=url)

output['meta']

{'pageIndex': 1,
 'pageSize': 2000,
 'totalCount': 441678,
 'totalPages': 221,
 'minPage': 1,
 'maxPage': 6,
 'countFrom': 1,
 'countTo': 2000}

In [4]:
# Peek at the first two sales records only; a full page (pageSize=2000)
# would flood the notebook output.
output['results'][:2]

[{'estateId': 0,
  'address': 'Herningvej 15',
  'zipCode': 6880,
  'price': 650000,
  'soldDate': '2008-12-30T23:00:00.000Z',
  'propertyType': 1,
  'saleType': 'Alm. Salg',
  'sqmPrice': 6435.6436,
  'rooms': 3.0,
  'size': 101,
  'buildYear': 1946,
  'change': 0.0,
  'guid': '7213AA8E-A339-4363-B0AE-4EA6B7622E3E',
  'latitude': 55.88893,
  'longitude': 8.790987,
  'municipalityCode': 760,
  'estateCode': 6686,
  'city': 'Tarm',
  'groupKey': None,
  'canGetVR': True,
  'bfEnr': 8300899},
 {'estateId': 0,
  'address': 'Platanvej 1',
  'zipCode': 4760,
  'price': 1300000,
  'soldDate': '2008-12-30T23:00:00.000Z',
  'propertyType': 1,
  'saleType': 'Alm. Salg',
  'sqmPrice': 10000.0,
  'rooms': 6.0,
  'size': 130,
  'buildYear': 1950,
  'change': 0.0,
  'guid': 'B80FC300-0FCC-41AE-A400-2CABC103A3EC',
  'latitude': 55.016163,
  'longitude': 11.897488,
  'municipalityCode': 390,
  'estateCode': 20585,
  'city': 'Vordingborg',
  'groupKey': None,
  'canGetVR': True,
  'bfEnr': 5393724},
 

Make list of URLs

In [5]:
# Number of result pages, taken from the metadata of the first response.
total_pages = output['meta']['totalPages']

# One search URL per result page (pages are numbered from 1). A comprehension
# keeps the notebook-level `url` from the first request untouched.
list_of_url = [
    f'https://api.boliga.dk/api/v2/sold/search/results?pageSize=2000&page={page}&salesDateMin=2000&salesDateMax=2008&propertytype=1&saleType=1&sort=date-d&buildYearMax=2005'
    for page in range(1, total_pages + 1)
]

In [None]:
# URLs whose download or write failed; inspect or retry these after the run.
errors = []

def process_url(id_url_pair):
    """Download one Boliga result page and save its sales records as JSON.

    Parameters
    ----------
    id_url_pair : tuple
        ``(id_, url)``. ``url`` is the page to fetch and ``id_`` names the
        output file ``data/boliga/boliga_{id_}.json``.

    Returns
    -------
    None or tuple
        ``None`` when the page was saved, ``(id_, None)`` when it failed.
        On failure the URL is also appended to the module-level ``errors``
        list and a message including the exception is printed.
    """
    id_, url = id_url_pair
    out_path = Path(f'data/boliga/boliga_{id_}.json')
    try:
        # Create the output folder up front so a missing directory does not
        # turn every download into a failure.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        time.sleep(0.5)  # pause before each request to limit load on the server
        out = tools.get_json(url, header)
        with open(out_path, "w") as fp:
            json.dump(out['results'], fp)
    except Exception as exc:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate; report the cause so it can be diagnosed.
        print(f'Error encountered on url {url}: {exc!r}')
        errors.append(url)
        return id_, None

# Pair each page URL with its position in the list; the position becomes the file id.
id_url_pairs = list(enumerate(list_of_url))

## NB! Do not overload Boliga's servers - scrape at odd hours and/or adjust max_workers accordingly!
with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    # list() drains the mapped results inside the `with`, and tqdm shows progress.
    results = list(
        tqdm.tqdm(
            executor.map(process_url, id_url_pairs),
            total=len(id_url_pairs),
        )
    )

100%|██████████| 396/396 [04:28<00:00,  1.48it/s]


## Merge and clean data from Boliga

In [None]:
# Combine every scraped Boliga page (one JSON file per page) into one frame.
# This cell must run: the BBR URL cell below reads `df`.
data_dir = Path('data/boliga')
full_df1 = pd.concat(
    # read_text() closes each file after reading; sorted() makes the row
    # order independent of the file system's listing order.
    pd.DataFrame(json.loads(json_file.read_text()))
    for json_file in sorted(data_dir.glob('*.json'))
)

df = full_df1.reset_index(drop=True)
df['soldDate'] = pd.to_datetime(df['soldDate'])
# NOTE(review): soldDate strings look like '2008-12-30T23:00:00.000Z', i.e. UTC.
# If these are local (Danish) midnights, year/month/quarter below are taken
# one day early; consider converting the timezone first - TODO confirm.
df['year'] = df.soldDate.dt.year
df['month'] = df.soldDate.dt.month
# NOTE(review): despite the column name this is the day of the week
# (Monday=0), not the week number - confirm which one is intended.
df['week'] = df.soldDate.dt.weekday
df['time_q'] = pd.PeriodIndex(df['soldDate'], freq='Q')

df = df.drop(columns=['change'])

## Read BBR urls to scrape

In [None]:
# Each distinct property guid in the sales data gets one BBR lookup URL.
list_of_guids = list(df['guid'].unique())
list_of_bbr_urls = [
    f'https://api.boliga.dk/api/v2/bbrinfo/bbr?id={guid}'
    for guid in list_of_guids
]

# Save the URL list to disk; a later cell reads it back from this file.
pd.DataFrame(list_of_bbr_urls, columns=['bbr_url']).to_parquet('data/bbr_ids_scraper.pq')

In [3]:

# Reload the saved BBR URL list from disk and show how many URLs it holds.
temp_df = pd.read_parquet('data/bbr_ids_scraper.pq')
list_of_bbr_urls = temp_df['bbr_url'].tolist()
len(list_of_bbr_urls)

519604

In [4]:
# URLs whose download or write failed; inspect or retry these after the run.
# Re-assigning the list here discards any errors collected by an earlier cell.
errors = []

def process_url(id_url_pair):
    """Download one BBR record and save the full JSON response.

    This redefines ``process_url`` from the Boliga scrape cell; from here on
    the name refers to this BBR version.

    Parameters
    ----------
    id_url_pair : tuple
        ``(id_, url)``. ``url`` is the BBR endpoint to fetch and ``id_``
        names the output file ``data/bbr/bbr{id_}.json``.

    Returns
    -------
    None or tuple
        ``None`` when the record was saved, ``(id_, None)`` when it failed.
        On failure the URL is also appended to the module-level ``errors``
        list and a message including the exception is printed.
    """
    id_, url = id_url_pair
    out_path = Path(f'data/bbr/bbr{id_}.json')
    try:
        # Create the output folder up front so a missing directory does not
        # turn every download into a failure.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        time.sleep(0.5)  # pause before each request to limit load on the server
        out = tools.get_json(url, header)
        with open(out_path, "w") as fp:
            json.dump(out, fp)
    except Exception as exc:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate; report the cause so it can be diagnosed.
        print(f'Error encountered on url {url}: {exc!r}')
        errors.append(url)
        return id_, None

# Only the first 100 BBR URLs are scraped here; each is paired with its
# position in the list, which becomes the output file id.
id_url_pairs = list(enumerate(list_of_bbr_urls[:100]))

with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    # list() drains the mapped results inside the `with`, and tqdm shows progress.
    results = list(
        tqdm.tqdm(
            executor.map(process_url, id_url_pairs),
            total=len(id_url_pairs),
        )
    )

100%|██████████| 100/100 [00:10<00:00,  9.65it/s]
