In [2]:
import requests
from bs4 import BeautifulSoup
import json
from tqdm import tqdm

# Data fetch 

In [3]:
'''
Get the links to the objects
'''
links = []

for page_n in tqdm(range(50)):
    http = requests.get("https://www.hemnet.se/salda/bostader?page={}".format(page_n))
    soup = BeautifulSoup(http.content)

    for a in soup.find_all('a', href=True):
        res = a['href']
        if "https" in res and "salda" in res:
            links.append(res)

len(links)

100%|██████████| 50/50 [00:39<00:00,  1.25it/s]


In [5]:
'''
Save the result to a text file
'''
with open("data/links.txt", "w") as fn:
    fn.writelines(links)

In [383]:
for link in tqdm(links):
    data = requests.get(link)
    soup = BeautifulSoup(data.text)
    mydivs = soup.findAll("div", {"class": "sold-property__map js-listing-map-sold"})
    try:
        jdata = json.loads(mydivs[0]["data-initial-data"])
        with open("data/objects/{}.txt".format(jdata['listing']['id']), "w") as fn:
            fn.writelines(data.text)
    except Exception as e:
        print(e)

 10%|█         | 261/2500 [02:11<18:11,  2.05it/s]

list index out of range


 12%|█▏        | 290/2500 [02:35<50:06,  1.36s/it]

list index out of range


 18%|█▊        | 443/2500 [03:57<12:49,  2.67it/s]

list index out of range


 37%|███▋      | 919/2500 [08:12<16:09,  1.63it/s]

list index out of range


100%|██████████| 2500/2500 [22:14<00:00,  1.87it/s]


# Data processing 

In [384]:
'''
Parse files into Json
'''

from os import listdir
from os.path import isfile, join

objects = []

for f in tqdm(listdir("data/objects")):
    with open("data/objects/{}".format(f), "r") as f:
        data = f.read()
    soup = BeautifulSoup(data)
    mydivs = soup.findAll("div", {"class": "sold-property__map js-listing-map-sold"})
    jdata = json.loads(mydivs[0]["data-initial-data"])
    objects.append(jdata)

objects[2]

100%|██████████| 2446/2446 [01:41<00:00, 24.07it/s]


{'status': 'sold',
 'listing': {'id': 1204486,
  'coordinate': [63.1817274331694, 14.639372293351753],
  'address': 'Rådhusgatan 28 A',
  'typeSummary': 'Bostadsrättslägenhet',
  'iconName': 'bostadsratt',
  'price': "<span class='sold-for'>Slutpris 1\xa0900\xa0000 kr</span>",
  'price_per_area': '20\xa0541 kr/m²',
  'fee': '4\xa0948 kr/mån',
  'sale_date': 'Såld 2020-06-15',
  'asked_price': 'Begärt pris 1\xa0950\xa0000 kr',
  'rooms': '3 rum',
  'living_space': '92,5 m²',
  'supplemental_area': None,
  'url': '/salda/lagenhet-3rum-centralt-ostersunds-kommun-radhusgatan-28-a-1204486',
  'land_area': None},
 'assets_prefix': '',
 'map_url': 'https://maps.googleapis.com/maps/api/js?libraries=geometry,places&v=3.34&client=gme-hemnetservicehns&region=SE&language=sv'}

In [385]:
import pandas as pd

df = pd.DataFrame.from_dict([t['listing'] for t in objects])
df.head()

Unnamed: 0,id,coordinate,address,typeSummary,iconName,price,price_per_area,fee,sale_date,asked_price,rooms,living_space,supplemental_area,url,land_area
0,1202014,"[59.6238363159342, 16.570181550355525]",Haga parkgata 9B,Bostadsrättslägenhet,bostadsratt,<span class='sold-for'>Slutpris 1 000 000 kr</...,20 000 kr/m²,3 833 kr/mån,Såld 2020-06-11,Begärt pris 995 000 kr,2 rum,50 m²,,/salda/lagenhet-2rum-haga-vasteras-kommun-haga...,
1,1203316,"[59.3191730260472, 18.03829415619227]",Heleneborgsgatan 5C,Bostadsrättslägenhet,bostadsratt,<span class='sold-for'>Slutpris 6 350 000 kr</...,96 212 kr/m²,3 944 kr/mån,Såld 2020-06-12,Begärt pris 5 790 000 kr,3 rum,66 m²,,/salda/lagenhet-3rum-sodermalm-hogalid-stockho...,
2,1204486,"[63.1817274331694, 14.639372293351753]",Rådhusgatan 28 A,Bostadsrättslägenhet,bostadsratt,<span class='sold-for'>Slutpris 1 900 000 kr</...,20 541 kr/m²,4 948 kr/mån,Såld 2020-06-15,Begärt pris 1 950 000 kr,3 rum,"92,5 m²",,/salda/lagenhet-3rum-centralt-ostersunds-kommu...,
3,1201379,"[59.43984696283136, 18.07367567305071]","Kometvägen 35, vån 3",Bostadsrättslägenhet,bostadsratt,<span class='sold-for'>Slutpris 2 800 000 kr</...,34 568 kr/m²,4 663 kr/mån,Såld 2020-06-10,Begärt pris 2 995 000 kr,3 rum,81 m²,,/salda/lagenhet-3rum-grindtorp-taby-kommun-kom...,
4,1201494,"[59.29511755942469, 18.10640325344144]",Ulricehamnsvägen 4,Bostadsrättslägenhet,bostadsratt,<span class='sold-for'>Slutpris 2 940 000 kr</...,56 538 kr/m²,2 904 kr/mån,Såld 2020-06-10,Begärt pris 2 495 000 kr,2 rum,52 m²,,/salda/lagenhet-2rum-stockholms-kommun-ulriceh...,


In [386]:
def parse_num(x, n_remove_suff=0):
    if x == None: 
        return None
    t = "".join([v for v in x if v.isnumeric() or v==","])
    t = t.replace(",", ".")
    if n_remove_suff:
        t = t[:-n_remove_suff]
    return float(t)

In [387]:
df_parsed = pd.DataFrame()

In [388]:
df_parsed["price_per_area"] = df.price_per_area.apply(lambda x: parse_num(x, 1))

In [389]:
df_parsed["rooms"] = df.rooms.apply(parse_num)

In [390]:
df_parsed["fee"] = df.fee.apply(parse_num)

In [391]:
df_parsed["living_space"] = df.living_space.apply(lambda x: parse_num(x, 1))

In [392]:
df_parsed["supplemental_area"] = df.supplemental_area.apply(lambda x: parse_num(x, 1))

In [393]:
df_parsed["price"] = df.price.apply(parse_num)

In [394]:
df_parsed["asked_price"] = df.asked_price.apply(parse_num)

In [395]:
df_parsed["land_area"] = df.land_area.apply(lambda x: parse_num(x, 1))

In [396]:
df_parsed["longitude"] = df.coordinate.apply(lambda x: x[0])
df_parsed["latitude"] = df.coordinate.apply(lambda x: x[1])

In [409]:
df_parsed["typeSummary"] = df.typeSummary

In [410]:
'''
Date related
'''

df_parsed["year"] = df.sale_date.apply(lambda x: x[5:].split("-")[0])
df_parsed["month"] = df.sale_date.apply(lambda x: x[5:].split("-")[1])
df_parsed["day"] = df.sale_date.apply(lambda x: x[5:].split("-")[2])

In [411]:
def contains_num(s):
    return any(filter(lambda x: x.isnumeric(), s))

In [412]:
def floor(x):
    try:
        x = x.lower()
        if "vån " in x:
            last = x.split("vån")[-1]
            return parse_num(last)
        if x[-2:] == "tr":
            return parse_num(x[-4:-2])
    except:
        return 0
    return 0

df_parsed["floor"] = df.address.apply(floor)

In [414]:
df_parsed.describe()

Unnamed: 0,price_per_area,rooms,fee,living_space,supplemental_area,price,asked_price,land_area,longitude,latitude,floor
count,2390.0,2375.0,1433.0,2392.0,710.0,2446.0,2430.0,1000.0,2446.0,2446.0,2446.0
mean,36985.372385,3.403368,3728.35799,87.042559,41.252113,2864402.0,2703596.0,4757.62,58.889279,15.90827,0.386263
std,25616.539904,1.720102,1322.843515,47.306871,39.878895,2184230.0,2091508.0,40386.77,1.86817,2.524556,1.845948
min,982.0,1.0,0.0,0.0,0.0,55000.0,75000.0,103.0,55.346335,11.176771,0.0
25%,19672.25,2.0,2795.0,55.0,9.0,1495000.0,1375000.0,651.5,57.731481,13.224915,0.0
50%,30909.0,3.0,3655.0,77.75,30.0,2420000.0,2250000.0,1092.5,59.280953,16.582154,0.0
75%,47673.75,4.0,4580.0,110.0,65.0,3700000.0,3495000.0,1836.25,59.435408,18.004513,0.0
max,418750.0,20.0,9321.0,560.0,335.0,25300000.0,23000000.0,1007140.0,67.854406,24.136828,56.0


# Save dataset 

In [416]:
df_parsed.to_csv("data/dataset.csv")

In [404]:
# from pandas_profiling import ProfileReport

# ProfileReport(df_parsed, title="Pandas Profiling Report")