In [79]:
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
import http.client

In [80]:
class Crawler:
    def __init__(self, listings_to_fetch=50):
        self.links = None
        self.run(listings_to_fetch)
    
    def run(self, listings_to_fetch=50):
        print("Crawling Hemnet")
        self.links = []
        print("Fetching listings...[1/2]")
        self.fetch_links(min(50, listings_to_fetch))
        self.save_links()
        print("Collecting objects...[2/2]")
        self.fetch_objects_from_links()

    def fetch_links(self, n_calls=50):
        """
        Get links to Hemnet listings through recurrent GET
        Multiple calls are needed since only a limited number of objects are visible in every request
        """
                
        links = []
        for page_n in tqdm(range(n_calls)):
            conn = http.client.HTTPSConnection("www.hemnet.se")
            conn.request("GET", "/salda/bostader?page=".format(page_n), '', {})
            res = conn.getresponse()
            response = res.read().decode("utf-8")
            soup = BeautifulSoup(response, features="html.parser")
            for a in soup.find_all('a', href=True):
                res = a['href']
                if "https" in res and "salda" in res:
                    links.append(res)
        self.links = links

    def save_links(self, fname="resources/links.txt"):
        """
        Save the result to a text file
        """
        with open(fname, "w") as fn:
            for line in self.links:
                fn.write(line+"\n")

    def read_links(self, fname="resources/links.txt"):
        """
        Save the result to a text file
        """
        with open(fname, "r") as fn:
            self.links = fn.read().split("\n")
        return self.links

    def fetch_objects_from_links(self):
        """
        Save the result to a text file
        """
        if not self.links:
            self.read_links()
            
        for link in tqdm(self.links):
            conn = http.client.HTTPSConnection("www.hemnet.se")
            conn.request("GET", link, '', {})
            res = conn.getresponse()
            data = res.read().decode("utf-8")
            soup = BeautifulSoup(data, features="html.parser")
            mydivs = soup.findAll("div", {"class": "sold-property__map js-listing-map-sold"})
            try:
                jdata = json.loads(mydivs[0]["data-initial-data"])
                with open("resources/objects/{}.html".format(jdata['listing']['id']), "w") as fn:
                    fn.writelines(data)
            except Exception as e:
                print(e)

In [81]:
c = Crawler(2)

  0%|          | 0/2 [00:00<?, ?it/s]

Crawling Hemnet
Fetching listings...[1/2]


100%|██████████| 2/2 [00:01<00:00,  1.37it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

Collecting objects...[2/2]


  8%|▊         | 8/100 [00:05<00:58,  1.57it/s]


KeyboardInterrupt: 

In [68]:
c.links[1]

'https://www.hemnet.se/salda/villa-5rum-gryt-valdemarsviks-kommun-prastgardsvagen-11-ekhyddan-1346513'

In [75]:
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
from os import listdir
import pandas as pd


class Parser:
    def __init__(self, to_csv="resources/result.csv"):
        print("Parsing HTML...")
        self.df = None
        self.parse_html()
        clean = self.clean_data()
        clean.to_csv(to_csv, index=False)
        print("Saved result to: " + to_csv)

    def parse_html(self):
        """
        Parse files into Json
        """
        objects = []
        for f in tqdm(listdir("resources/objects")):
            if ".html" in f:
                with open("resources/objects/{}".format(f), "r") as f:
                    data = f.read()
                soup = BeautifulSoup(data, features="html.parser")
                mydivs = soup.findAll("div", {"class": "sold-property__map js-listing-map-sold"})
                jdata = json.loads(mydivs[0]["data-initial-data"])
                objects.append(jdata)
        self.df = pd.DataFrame.from_dict([t['listing'] for t in objects])

    @staticmethod
    def parse_num(x, n_remove_suff=0):
        if x == None:
            return None
        t = "".join([v for v in x if v.isnumeric() or v == ","])
        t = t.replace(",", ".")
        if n_remove_suff:
            t = t[:-n_remove_suff]
        return float(t)

    @staticmethod
    def contains_num(s):
        return any(filter(lambda x: x.isnumeric(), s))

    @staticmethod
    def floor(x):
        try:
            x = x.lower()
            if "vån " in x:
                last = x.split("vån")[-1]
                return Parser.parse_num(last)
            if x[-2:] == "tr":
                return Parser.parse_num(x[-4:-2])
        except:
            return 0
        return 0

    def clean_data(self):
        df = self.df
        df_parsed = pd.DataFrame()
        df_parsed["price_per_area"] = df.price_per_area.apply(lambda x: Parser.parse_num(x, 1))
        df_parsed["rooms"] = df.rooms.apply(Parser.parse_num)
        df_parsed["fee"] = df.fee.apply(Parser.parse_num)
        df_parsed["living_space"] = df.living_space.apply(lambda x: Parser.parse_num(x, 1))
        df_parsed["supplemental_area"] = df.supplemental_area.apply(lambda x: Parser.parse_num(x, 1))
        df_parsed["price"] = df.price.apply(Parser.parse_num)
        df_parsed["asked_price"] = df.asked_price.apply(Parser.parse_num)
        df_parsed["land_area"] = df.land_area.apply(lambda x: Parser.parse_num(x, 1))
        df_parsed["longitude"] = df.coordinate.apply(lambda x: x[0])
        df_parsed["latitude"] = df.coordinate.apply(lambda x: x[1])
        df_parsed["typeSummary"] = df.typeSummary
        df_parsed["year"] = df.sale_date.apply(lambda x: x[5:].split("-")[0])
        df_parsed["month"] = df.sale_date.apply(lambda x: x[5:].split("-")[1])
        df_parsed["day"] = df.sale_date.apply(lambda x: x[5:].split("-")[2])
        df_parsed["floor"] = df.address.apply(Parser.floor)
        return df_parsed

In [76]:
p = Parser()

 33%|███▎      | 3/9 [00:00<00:00, 28.62it/s]

Parsing HTML...


100%|██████████| 9/9 [00:00<00:00, 36.49it/s]

Saved result to: resources/result.csv





In [78]:
import pandas
df = pandas.read_csv("resources/result.csv")
df

Unnamed: 0,price_per_area,rooms,fee,living_space,supplemental_area,price,asked_price,land_area,longitude,latitude,typeSummary,year,month,day,floor
0,23397.0,4.0,,156.0,,3650000.0,3495000.0,574.0,56.143922,12.939974,Villa,2021,3,15,0.0
1,49020.0,2.0,3044.0,51.0,,2500000.0,2195000.0,,59.232562,17.989882,Bostadsrättslägenhet,2021,3,15,2.0
2,39873.0,5.0,,118.0,15.0,4705000.0,3495000.0,2153.0,58.183669,16.803308,Villa,2021,3,15,0.0
3,51852.0,2.5,3669.0,67.5,,3500000.0,3090000.0,,57.69547,11.939396,Bostadsrättslägenhet,2021,3,15,0.0
4,23618.0,3.0,4011.0,76.0,,1795000.0,1695000.0,,59.304535,15.236899,Bostadsrättslägenhet,2021,3,15,0.0
5,14205.0,3.0,4732.0,88.0,,1250000.0,1100000.0,,58.423726,15.582805,Bostadsrättslägenhet,2021,3,15,0.0
6,29917.0,2.0,2786.0,60.0,,1795000.0,1795000.0,,59.639302,17.079919,Bostadsrättslägenhet,2021,3,15,0.0
7,36458.0,5.0,,120.0,14.0,4375000.0,3795000.0,363.0,59.496364,18.330264,Parhus,2021,3,15,0.0
