In [10]:
import requests
from bs4 import BeautifulSoup
import os
import time
import logging
import random
from tqdm import tqdm


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [8]:

# Configure logging: records at INFO level and above go to scraping.log,
# formatted as "timestamp:LEVEL:message".
logging.basicConfig(filename='scraping.log', level=logging.INFO, 
                    format='%(asctime)s:%(levelname)s:%(message)s')

# Pool of browser User-Agent strings (desktop and mobile); scrape_listings
# picks one at random for the headers of every request it sends.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.2 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0",
    "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.116 Mobile Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (iPad; CPU OS 14_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
    ]

def scrape_listings(sleep_time=1, timeout=30):
    """Download Hemnet sold-listing result pages and cache them as HTML files.

    For every location ID in range(475078, 476356) and every page number 1-50,
    the result page is requested and, on HTTP 200, written to
    hemnet_listings/street_<id>_page_<n>.html. Pages whose file already exists
    are skipped, so an interrupted run can simply be restarted.

    Args:
        sleep_time: Seconds to pause after each HTTP request.
        timeout: Seconds passed to requests.get as its timeout, so a stalled
            connection raises (and is logged) instead of hanging the run.

    Returns:
        None. Successes and failures are reported through the logging module.
    """
    directory = 'hemnet_listings'
    os.makedirs(directory, exist_ok=True)

    for street_id in tqdm(range(475078, 476356), desc='Processing streets'):
        for page_number in range(1, 51):
            file_path = os.path.join(directory, f'street_{street_id}_page_{page_number}.html')

            # Already cached: no request is made, so there is nothing to
            # throttle -- skip without sleeping.
            if os.path.exists(file_path):
                continue

            url = f"https://www.hemnet.se/salda/bostader?item_types%5B%5D=bostadsratt&location_ids%5B%5D={street_id}&page={page_number}"

            headers = {
                'User-Agent': random.choice(USER_AGENTS)
            }

            try:
                response = requests.get(url, headers=headers, timeout=timeout)

                if response.status_code == 200:
                    with open(file_path, 'w', encoding='utf-8') as file:
                        file.write(response.text)
                    logging.info(f'Successfully wrote data to {file_path}')
                else:
                    logging.error(f'Failed to retrieve data for street ID {street_id} page {page_number}, status code: {response.status_code}')
            except requests.RequestException as e:
                logging.error(f'Request for street ID {street_id} page {page_number} failed due to: {e}')

            # Pause only after an actual request was attempted.
            time.sleep(sleep_time)

In [3]:
scrape_listings()

Processing streets: 100%|██████████| 1278/1278 [28:09:29<00:00, 79.32s/it]  


In [None]:
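# Hemnet location IDs at the two ends of the range scraped by scrape_listings:
# 475078 – Centralplan, Stockholms kommun (first ID)
# 476355 – Palmfeltsvägen, Stockholms kommun (last ID)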

In [13]:
def parse_html(file_path):
    """Parse a saved HTML page and return all of its <a class="hcl-card"> elements.

    Args:
        file_path: Path to a UTF-8 encoded HTML file.

    Returns:
        The result of BeautifulSoup's find_all for 'a' tags with class 'hcl-card'.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')

    return soup.find_all('a', class_='hcl-card')

In [2]:
def extract_apartment_data(apartment):
    """Collect the raw text fields of one listing card into a dict.

    Args:
        apartment: A parsed card element offering find / find_all (as returned
            by parse_html).

    Returns:
        dict with keys location, address, size, rooms, monthly_fee, elevator,
        balcony, selling_price, percentage_increase, square_meter_price and
        date_sale. Text fields hold the stripped element text, or None when the
        element is absent; elevator and balcony are booleans telling whether
        the corresponding label element is present.
    """
    def text_or_none(tag):
        # Stripped text of a tag, or None when the lookup found nothing.
        return tag.get_text(strip=True) if tag else None

    location = text_or_none(apartment.find('div', {'data-testid': 'location-parsed-text'}))
    address = text_or_none(apartment.find('h2', class_='hcl-card__title'))

    # Size, rooms and monthly fee share one flex container.
    size = rooms = monthly_fee = None
    details = apartment.find('div', class_='hcl-flex--container hcl-flex--gap-2 hcl-flex--justify-space-between hcl-flex--md-justify-flex-start')
    if details:
        paragraphs = details.find_all('p', class_='hcl-text hcl-text--medium')
        if len(paragraphs) >= 2:
            # The first paragraph is used as size, the second as rooms.
            size = paragraphs[0].get_text(strip=True)
            rooms = paragraphs[1].get_text(strip=True)
        monthly_fee = text_or_none(details.find('span', class_='hcl-text'))

    # Feature labels: presence of the element is the signal.
    elevator = bool(apartment.find('span', class_='hcl-label hcl-label--feature hcl-label--elevator hcl-label--on-white-background'))
    balcony = bool(apartment.find('span', class_='hcl-label hcl-label--feature hcl-label--balcony hcl-label--on-white-background'))

    # Selling price attributes: first medium span is the price, second the
    # percentage change; the price per square metre sits in a <p>.
    selling_price = percentage_increase = square_meter_price = None
    price_box = apartment.find('div', class_='SellingPriceAttributes_contentWrapper__VaxX9')
    if price_box:
        price_spans = price_box.find_all('span', class_='hcl-text hcl-text--medium')
        if price_spans:
            selling_price = price_spans[0].get_text(strip=True)
        if len(price_spans) > 1:
            percentage_increase = price_spans[1].get_text(strip=True)
        square_meter_price = text_or_none(price_box.find('p', class_='hcl-text'))

    date_sale = text_or_none(apartment.find('span', class_='hcl-label hcl-label--state hcl-label--sold-at'))

    return {
        'location': location,
        'address': address,
        'size': size,
        'rooms': rooms,
        'monthly_fee': monthly_fee,
        'elevator': elevator,
        'balcony': balcony,
        'selling_price': selling_price,
        'percentage_increase': percentage_increase,
        'square_meter_price': square_meter_price,
        'date_sale': date_sale
    }

In [18]:
def html_to_df(file_path):
    """Build a DataFrame with one row per listing card found in a saved HTML page.

    Args:
        file_path: Path to the HTML file, forwarded to parse_html.

    Returns:
        pandas.DataFrame built from the dicts produced by
        extract_apartment_data for each card.
    """
    records = [extract_apartment_data(card) for card in parse_html(file_path)]
    return pd.DataFrame(records)

In [26]:
# Smoke-test the parser on one saved page. The path is relative to the
# notebook's working directory and uses the folder and file naming that
# scrape_listings writes, instead of a machine-specific absolute path.
test_df = html_to_df(os.path.join('hemnet_listings', 'street_475079_page_1.html'))
test_df

Unnamed: 0,location,address,size,rooms,monthly_fee,elevator,balcony,selling_price,percentage_increase,square_meter_price,date_sale
0,,,,,,False,False,,,,
1,,,,,,False,False,,,,
2,"City - Vasastan - Norrmalm, Stockholms kommun",Gamla Brogatan 25,70 m²,2 rum,4 721 kr/mån,True,False,Slutpris7 200 000 kr,+4 %,102 857 kr/m²,Såld 14 nov. 2022
3,"Norrmalm/City, Stockholms kommun",Gamla Brogatan 25,114 m²,"3,5 rum",6 768 kr/mån,False,False,Slutpris11 500 000 kr,-4 %,100 877 kr/m²,Såld 14 aug. 2022
4,"City, Stockholms kommun",Gamla Brogatan 25,71 m²,2 rum,4 696 kr/mån,True,False,Slutpris6 500 000 kr,-7 %,91 549 kr/m²,Såld 30 aug. 2021
5,"Vasastan - City/Norrmalm, Stockholms kommun","Gamla Brogatan 25, 2tr",114 m²,"3,5 rum",6 769 kr/mån,False,False,Slutpris7 600 000 kr,-5 %,66 667 kr/m²,Såld 13 apr. 2018
6,"Vasastan- City/ Norrmalm, Stockholms kommun","Gamla Brogatan 25, 2 tr",71 m²,2 rum,4 696 kr/mån,False,False,Slutpris5 050 000 kr,+4 %,71 127 kr/m²,Såld 23 jun. 2016
7,"Vasastan- City/ Norrmalm, Stockholms kommun",Gamla Brogatan 25,102 m²,4 rum,6 519 kr/mån,False,False,Slutpris6 950 000 kr,-9 %,68 137 kr/m²,Såld 29 apr. 2016
8,"Vasastan - City/Norrmalm, Stockholms kommun","Gamla Brogatan 25, 2tr",107 m²,4 rum,6 713 kr/mån,False,False,Slutpris7 150 000 kr,+2 %,66 822 kr/m²,Såld 26 nov. 2015


In [27]:
def process_all_files(directory):
    """Parse every .html file in a directory and stack the results into one DataFrame.

    Files are processed in sorted filename order so the row order of the result
    is reproducible (os.listdir returns entries in arbitrary order).

    Args:
        directory: Folder containing the saved listing pages.

    Returns:
        pandas.DataFrame with the rows of all files concatenated and a fresh
        0..n-1 index. An empty DataFrame is returned (and a warning logged)
        when the directory contains no .html files.
    """
    html_files = sorted(
        filename for filename in os.listdir(directory)
        if filename.endswith('.html')
    )

    # pd.concat raises ValueError on an empty list, so handle that explicitly.
    if not html_files:
        logging.warning(f'No .html files found in {directory}')
        return pd.DataFrame()

    frames = [html_to_df(os.path.join(directory, filename)) for filename in html_files]
    return pd.concat(frames, ignore_index=True)

In [29]:
# Folder of cached pages, relative to the notebook's working directory -- the
# same folder name scrape_listings writes to -- rather than an absolute path
# that only exists on one machine.
directory = 'hemnet_listings'
final_df = process_all_files(directory)

In [30]:
final_df

Unnamed: 0,location,address,size,rooms,monthly_fee,elevator,balcony,selling_price,percentage_increase,square_meter_price,date_sale
0,,,,,,False,False,,,,
1,,,,,,False,False,,,,
2,,,,,,False,False,,,,
3,,,,,,False,False,,,,
4,,,,,,False,False,,,,
...,...,...,...,...,...,...,...,...,...,...,...
219515,,,,,,False,False,,,,
219516,,,,,,False,False,,,,
219517,,,,,,False,False,,,,
219518,,,,,,False,False,,,,


In [33]:
def _to_number(series, drop=(), decimal_comma=False):
    """Strip spaces, non-breaking spaces and the literal substrings in `drop`
    from a string Series, then convert it to numbers (unparseable -> NaN)."""
    for token in (' ', '\xa0', *drop):
        # regex=False: every token is a literal (some, like '+', are regex
        # metacharacters), independent of the pandas version's default.
        series = series.str.replace(token, '', regex=False)
    if decimal_comma:
        series = series.str.replace(',', '.', regex=False)
    return pd.to_numeric(series.str.strip(), errors='coerce')


def data_cleaner(df):
    """Turn the raw scraped text columns into typed, analysis-ready columns.

    Steps, in order:
      * drop rows without an address;
      * split 'location' on commas into 'neighbourhood' (first part) and
        'kommun' (second part);
      * keep only the part of 'address' before the first comma;
      * convert size, rooms, monthly_fee, selling_price, percentage_increase
        and square_meter_price to numbers (values that cannot be parsed
        become NaN);
      * parse 'date_sale' ("Såld 14 nov. 2022") into datetimes (unparseable
        values become NaT);
      * sort by 'date_sale' ascending and reset the index.

    Args:
        df: DataFrame with the columns produced by extract_apartment_data.

    Returns:
        A new cleaned DataFrame; the input frame is not modified.
    """
    # .copy() after filtering so the assignments below act on an independent
    # frame rather than on a slice of the caller's data.
    df = df[df['address'].notna()].copy()

    # splitting location into neighbourhood and kommun; reindex guarantees
    # both columns exist even if no location contains a comma.
    location_split = df['location'].str.split(',', expand=True).reindex(columns=[0, 1])
    df['neighbourhood'] = location_split[0].str.strip()
    df['kommun'] = location_split[1].str.strip()

    df['address'] = df['address'].str.split(',').str[0].str.strip()

    # Only the part before a '+' is used for size.
    df['size'] = _to_number(df['size'].str.split('+').str[0],
                            drop=('m²',), decimal_comma=True)
    df['rooms'] = _to_number(df['rooms'], drop=('rum',), decimal_comma=True)
    df['monthly_fee'] = _to_number(df['monthly_fee'], drop=('kr/mån',))
    df['selling_price'] = _to_number(df['selling_price'], drop=('Slutpris', 'kr'))
    df['percentage_increase'] = _to_number(df['percentage_increase'], drop=('%', '+', '±'))
    df['square_meter_price'] = _to_number(df['square_meter_price'], drop=('kr/m²',))

    sale_dates = (df['date_sale']
                  .str.replace('Såld', '', regex=False)
                  .str.replace('\xa0', ' ', regex=False)
                  .str.strip())

    # Swedish month token -> English abbreviation understood by '%b'.
    # Each pattern matches the three-letter stem, any remaining letters and an
    # optional *literal* dot, so 'maj', 'maj.', 'mars' and 'jun.' are all
    # handled and the whitespace after the month is never consumed.
    month_mapping = {
        r'\bjan\w*\.?': 'Jan',
        r'\bfeb\w*\.?': 'Feb',
        r'\bmar\w*\.?': 'Mar',
        r'\bapr\w*\.?': 'Apr',
        r'\bmaj\w*\.?': 'May',
        r'\bjun\w*\.?': 'Jun',
        r'\bjul\w*\.?': 'Jul',
        r'\baug\w*\.?': 'Aug',
        r'\bsep\w*\.?': 'Sep',
        r'\bokt\w*\.?': 'Oct',
        r'\bnov\w*\.?': 'Nov',
        r'\bdec\w*\.?': 'Dec'
    }
    for pattern, english_abbreviation in month_mapping.items():
        sale_dates = sale_dates.str.replace(pattern, english_abbreviation, regex=True)

    df['date_sale'] = pd.to_datetime(sale_dates, format='%d %b %Y', errors='coerce')

    return df.sort_values(by='date_sale', ascending=True).reset_index(drop=True)

In [114]:
hemnet_df = data_cleaner(final_df)
hemnet_df

Unnamed: 0,location,address,size,rooms,monthly_fee,elevator,balcony,selling_price,percentage_increase,square_meter_price,date_sale,neighbourhood,kommun
0,"Kungsholmen, Stockholms kommun",Norr Mälarstrand 8,85.0,2.5,3542.0,False,False,9700000,69.0,114118.0,2011-02-09,Kungsholmen,Stockholms kommun
1,"Vasastan, Stockholms kommun",Birger Jarlsgatan 109 A,81.0,3.0,3718.0,False,False,4600000,19.0,56790.0,2011-04-08,Vasastan,Stockholms kommun
2,"Vasastan/Östermalm, Stockholms kommun",Birger Jarlsgatan 103B,30.0,2.0,1637.0,False,False,2300000,15.0,76667.0,2012-06-13,Vasastan/Östermalm,Stockholms kommun
3,"Hjorthagen, Stockholms kommun",Ahlsellvägen 10,,,2707.0,False,False,2250000,0.0,,2012-10-26,Hjorthagen,Stockholms kommun
4,"Gärdet, Stockholms kommun",Smedsbacksgatan 26,,,1862.0,False,False,1900000,3.0,,2012-11-01,Gärdet,Stockholms kommun
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91725,"Södermalm - Mariatorget, Stockholms kommun",Maria Prästgårdsgata 19A,78.0,3.0,5409.0,False,False,9000000,0.0,115385.0,2024-02-07,Södermalm - Mariatorget,Stockholms kommun
91726,"Gröndal, Stockholms kommun",Sjöbjörnsvägen 46,58.0,2.5,4015.0,False,False,4500000,7.0,77586.0,2024-02-07,Gröndal,Stockholms kommun
91727,"Gärdet, Stockholms kommun",Rindögatan 14,27.0,1.0,2195.0,True,True,3210000,9.0,118889.0,2024-02-07,Gärdet,Stockholms kommun
91728,"Södermalm, Stockholms kommun",Hallandsgatan 24,100.0,3.0,4997.0,True,False,8625000,-4.0,86250.0,2024-02-07,Södermalm,Stockholms kommun


In [113]:
unique_neighbourhoods = hemnet_df['neighbourhood'].unique()
print(len(unique_neighbourhoods))

1444


### Creating a mapping dictionary for the different versions of neighbourhoods used

In [112]:
# Canonical neighbourhood name -> list of substrings (alternative names and
# misspellings) that identify it. The loops that consume this dict compare
# case-insensitively by substring and stop at the first key that matches, so
# the order of the keys matters. The special key 'delete' collects values
# whose rows are dropped in the harmonising step.
sthlm_kvarter = {
    'Fredhäll': [],
    'Kristineberg': [],
    'Kungsholmen': ['Kungholmen', 'Thorildsplan', 'Fridhemsplan', 'Karlsviksgatan','Norr Mälarstrand', 'Eriksområdet', 'Kungsholms', 'Rådhuset', 'Rådhusparken', 'Kungsklippan', 'Kungsholm', 'Kunsgholmen', 'Kungshomen', 'Kugsholmen', 'Kronobergsparken'],
    'Sankt Göran': [],
    'Marieberg': [],
    'Lilla Essingen': ['Lilla Essinge'],
    'Stadshagen': ['Hornsberg', 'Hornsbergs', 'Hornbergs', 'Lindhagen', 'Kellgrensgatan'],
    'Stora Essingen': ['Essingeöarna'],
    'Djurgården': [],
    'Hjorthagen': ['Abessinien'],
    'Gärdet': ['Starrängsringen', 'Tessinparken'],
    'Norra Djurgården': ['Norra Djurgårdsstaden', 'Ruddammen', 'Ekhagen', 'Djurgårdsstaden', 'Roslagstull', 'Norrtull'],
    'Norrmalm': ['City'],
    'Skeppsholmen': [],
    'Vasastan': ['Odenplan', 'Röda Bergen', 'Rödabergen', 'Sibirien', 'Vasastaden', 'Atlas', 'Vasatan', 'Vasstan', 'Sankt Eriksplan', 'Eriksplan', 'Birkastan', 'Birkastaden', 'Tegnérlunden', 'Vasaparken', 'Vastastan', 'Torsplan', 'Sankt Erik'],
    'Hagastaden': [],
    'Östermalm': ['Karlaplan', 'Österrmalm', 'Lärkstaden', 'Lärkstan', 'Strandvägen', 'Östermallm', 'Östermam', 'Östermlam', 'Humlegården', 'Stureplan', 'Djurgårdsbron', 'Östremalm', 'Stureparken', 'Villastan', 'Villastaden', 'Gustav Adolfsparken'],
    'Gamla Stan': [],
    'Långholmen': [],
    'Reimersholme': [],
    'Riddarholmen': [],
    'Södermalm': ['Skanstull', 'Eriksdal', 'Medborgarplatsen', 'Söder', 'Björnsträdgård', 'Södra Station', 'Södemalm', 'Helgalunden'],
    'Högalid': ['Zinkensdamm', 'Skinnarviksberget'],
    'Maria': [],
    'Katarina': ['Mosebacke'],
    'Sofia': ['Nytorget', 'SoFo', 'Barnängen', 'Barnängsområdet'],
    'Hornstull': ['Tanto', 'Bergsunds'],
    'Hammarby Sjöstad': ['Hammarbyhamnen', 'Hammarby', 'Luma'],
    'Gröndal': ['Ekensberg'],
    'Årsta': ['Gullmarsplan', 'Sköntorp'],
    'Liljeholmen': ['Liljeholmskajen', 'Liljeholmstorget', 'Liljeholmkajen', 'Liljeolmskajen', 'Lijeholmen', 'Lijeholmskajen', 'Nybohovshöjden', 'Nybohovsbacken', 'Nybodahöjden', 'Nybohov', 'Lilje', 'Sjövikshöjden'],
    'Enskede': [],
    'Hägersten': [],
    'Sickla': ['Henriksdal', 'Danvikstull', 'Danviksklippan'],
    'delete': ['Lägenheten', 'stockholm', 'tullarna', 'Nacka Strand', 'Visning']
}

# Assign every scraped neighbourhood string to the first key of sthlm_kvarter
# whose name or variation occurs in it (case-insensitive substring test), and
# record the raw string under that key. Strings matching no key end up in
# `remaining`.
remaining = []

for neighbourhood in unique_neighbourhoods:
    haystack = neighbourhood.lower()

    for key, variations in sthlm_kvarter.items():
        candidates = [key.lower()]
        candidates.extend(variation.lower() for variation in variations)

        if any(candidate in haystack for candidate in candidates):
            sthlm_kvarter[key].append(neighbourhood)
            break
    else:
        # The inner loop finished without a break: no key matched.
        remaining.append(neighbourhood)

print(len(remaining))

0


### Harmonising neighbourhood names

In [133]:
def _standard_neighbourhood(raw_name):
    """Return the first sthlm_kvarter key whose own name or any listed
    variation occurs (case-insensitively) inside raw_name; return raw_name
    unchanged when nothing matches."""
    lowered = raw_name.lower()
    for standard_neighbourhood, variations in sthlm_kvarter.items():
        if (standard_neighbourhood.lower() in lowered
                or any(var.lower() in lowered for var in variations)):
            return standard_neighbourhood
    return raw_name


# Resolve each distinct spelling once and map the whole column in a single
# step, instead of re-scanning every variation list for every row with
# iterrows(). Missing values are skipped by dropna() and stay missing.
neighbourhood_lookup = {
    raw_name: _standard_neighbourhood(raw_name)
    for raw_name in hemnet_df['neighbourhood'].dropna().unique()
}
hemnet_df['neighbourhood'] = hemnet_df['neighbourhood'].map(neighbourhood_lookup)

# Rows that resolved to the special 'delete' key are removed.
rows_to_delete = hemnet_df.loc[hemnet_df['neighbourhood'].str.lower() == 'delete'].index
hemnet_df = hemnet_df.drop(rows_to_delete)

hemnet_df.head(5)

Unnamed: 0,size,rooms,monthly_fee,elevator,balcony,selling_price,percentage_increase,square_meter_price,date_sale,neighbourhood,kommun,street_name,house_number
0,85.0,2.5,3542.0,False,False,9700000,69.0,114118.0,2011-02-09,Kungsholmen,Stockholms kommun,Norr Mälarstrand,8
1,81.0,3.0,3718.0,False,False,4600000,19.0,56790.0,2011-04-08,Vasastan,Stockholms kommun,Birger Jarlsgatan,109 A
2,30.0,2.0,1637.0,False,False,2300000,15.0,76667.0,2012-06-13,Vasastan,Stockholms kommun,Birger Jarlsgatan,103B
136,111.0,4.0,3190.0,False,False,7450000,0.0,67117.0,2013-01-29,Kungsholmen,Stockholms kommun,Scheelegatan,26
140,84.0,2.5,3732.0,False,False,3775000,2.0,44940.0,2013-01-30,Kungsholmen,Stockholms kommun,Badstrandsvägen,26


### Splitting address into street_name and house_number

(?P<street_name>[^\d]+): Matches and captures one or more characters that are not digits as 'street_name'.

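\s?: Matches zero or one whitespace character (accounts for addresses that might not have a space before the house number). Note that the preceding `[^\d]+` group is greedy and a space is not a digit, so in practice the space before the number is captured as the last character of 'street_name' and `\s?` matches nothing.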

(?P<house_number>\d+.*): Matches and captures a sequence starting with one or more digits followed by any characters, as 'house_number'.

In [None]:
# regex pattern assumes that the street name doesn't contain digits and that the house number starts with digits
pattern = r'(?P<street_name>[^\d]+)\s?(?P<house_number>\d+.*)'

address_parts = hemnet_df['address'].str.extract(pattern)
# The greedy street_name group also captures the space that precedes the house
# number (leaving nothing for \s? to match), so trim it: otherwise every street
# name carries trailing whitespace, e.g. 'Norr Mälarstrand '.
address_parts['street_name'] = address_parts['street_name'].str.strip()
hemnet_df = hemnet_df.join(address_parts)

### Dropping columns that are now redundant

In [None]:
hemnet_df = hemnet_df.drop(['location', 'address'], axis=1)

### Checking for missing values

In [135]:
hemnet_df.isna().sum()

size                   0
rooms                  0
monthly_fee            0
elevator               0
balcony                0
selling_price          0
percentage_increase    0
square_meter_price     0
date_sale              0
neighbourhood          0
kommun                 0
street_name            0
house_number           0
dtype: int64

### Dropping rows with missing values
Due to the relatively low number of missing values, dropping these rows does not significantly impact the dataset.

In [136]:
columns_to_check = ['street_name', 'house_number']
hemnet_df = hemnet_df.dropna(subset=columns_to_check)

hemnet_df.isna().sum()

size                   0
rooms                  0
monthly_fee            0
elevator               0
balcony                0
selling_price          0
percentage_increase    0
square_meter_price     0
date_sale              0
neighbourhood          0
kommun                 0
street_name            0
house_number           0
dtype: int64

In [137]:
hemnet_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89386 entries, 0 to 91729
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   size                 89386 non-null  float64       
 1   rooms                89386 non-null  float64       
 2   monthly_fee          89386 non-null  float64       
 3   elevator             89386 non-null  bool          
 4   balcony              89386 non-null  bool          
 5   selling_price        89386 non-null  int64         
 6   percentage_increase  89386 non-null  float64       
 7   square_meter_price   89386 non-null  float64       
 8   date_sale            89386 non-null  datetime64[ns]
 9   neighbourhood        89386 non-null  object        
 10  kommun               89386 non-null  object        
 11  street_name          89386 non-null  object        
 12  house_number         89386 non-null  object        
dtypes: bool(2), datetime64[ns](1), float

In [138]:
hemnet_df.to_csv('hemnet_df.csv', index=False)

# Success
**Extracted and cleaned a dataset of all available finalised sale listings from Hemnet's website**