# CC5 Scraper Task: Population Density in Swedish Municipalities (kommuner)

This notebook scrapes population density data from [Wikipedia - Lista över Sveriges kommuner](https://sv.wikipedia.org/wiki/Lista_över_Sveriges_kommuner) (List of Swedish municipalities). The page is downloaded with `requests`, and `pandas.read_html()` extracts its HTML tables. The resulting data is then tidied and exported as a CSV for use with a Vega-Lite choropleth map.

In [105]:
# Import packages
import pandas as pd
import numpy as np

In [106]:
from io import StringIO

import requests

# Fetch tables from Wikipedia
url = "https://sv.wikipedia.org/wiki/Lista_över_Sveriges_kommuner"

# Send a browser-like User-Agent header with the request.
# NOTE(review): Wikimedia's User-Agent policy asks scripts to identify
# themselves with a descriptive UA and contact details - consider replacing
# this browser string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Use requests to get the HTML content with the header. The timeout keeps a
# stalled connection from hanging the kernel indefinitely.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()  # Raise an exception for HTTP errors

# pd.read_html() returns a list of all tables found on the page. The HTML is
# wrapped in StringIO because passing a literal HTML string to read_html is
# deprecated in recent pandas releases and emits a FutureWarning.
tables = pd.read_html(StringIO(response.text))
print(f"Found {len(tables)} tables on the page")

Found 2 tables on the page


  tables = pd.read_html(response.text)


In [107]:
# Find the main municipalities table and select it. The table is expected to
# have about 290 rows (plus header rows), so accept anything with 280-300 rows.
df = None
for table in tables:
    if 280 <= len(table) <= 300:
        # Copy so that relabelling the columns below does not modify the
        # DataFrame that is still held in `tables`.
        df = table.copy()
        print(f"Found municipalities table with {len(df)} rows")
        break

# Fail with a clear message instead of an AttributeError on None further down.
if df is None:
    raise ValueError(
        "No table with 280-300 rows found on the page "
        f"(table sizes: {[len(t) for t in tables]})"
    )

df.columns = df.iloc[1]  # Use row 1 as headers
df = df.iloc[2:].reset_index(drop=True)  # Keep only data rows

print(f"\nColumn names: {list(df.columns)}")
print(f"Rows: {len(df)}")


Found municipalities table with 292 rows

Column names: ['Kod', 'Kommun', 'Centralort', 'Län', 'Folkmängd', 'Area', 'Land', 'Sjö1', 'Sjö2', 'Hav', 'Täthet']
Rows: 290


In [108]:
# Build the tidy frame from the scraped table, selecting columns by position:
# first column -> code, second column -> municipality name (both kept as text).
df_clean = pd.DataFrame({
    'code': df.iloc[:, 0].astype(str),
    'municipality': df.iloc[:, 1].astype(str),
})

# Peek at the last column (the density figures) before it is parsed.
density_col = df.iloc[:, -1]
print(f"Density column name: {df.columns[-1]}")
print(f"Sample density values (raw): {density_col.head(5).tolist()}")

df_clean.head()

Density column name: Täthet
Sample density values (raw): ['10282', '9070', '2029', '1324', '4283']


Unnamed: 0,code,municipality
0,1440,Ale kommun
1,1489,Alingsås kommun
2,764,Alvesta kommun
3,604,Aneby kommun
4,1984,Arboga kommun


In [109]:
# Clean density values

density_raw = df['Täthet'].astype(str)

# Strip every separator character (space, non-breaking space, comma, period)
# in a single pass, leaving only the digits of each value.
density_cleaned = density_raw.str.replace('[ \xa0,.]', '', regex=True)

# The raw strings carry no decimal separator (e.g. '10282' ends up as 102.82),
# so dividing by 100 puts the decimal point back before the last two digits.
# NOTE(review): this relies on every value having exactly two decimals - confirm.
df_clean['density'] = pd.to_numeric(density_cleaned, errors='coerce').div(100)

display(df_clean)

Unnamed: 0,code,municipality,density
0,1440,Ale kommun,102.82
1,1489,Alingsås kommun,90.70
2,0764,Alvesta kommun,20.29
3,0604,Aneby kommun,13.24
4,1984,Arboga kommun,42.83
...,...,...,...
285,0117,Österåkers kommun,159.74
286,0382,Östhammars kommun,15.06
287,1256,Östra Göinge kommun,31.87
288,2513,Överkalix kommun,1.16


In [110]:
# Brute-force clean-up of the municipality names.
# First step: remove the literal " kommun" text from every name.
kommun_text = ' kommun'
df_clean['municipality'] = df_clean['municipality'].str.replace(kommun_text, '', regex=False)

# Names that fix_genitive must return untouched. Most of them end in 's', and
# stripping that letter would damage the name (e.g. 'Grums' -> 'Grum').
keep = {
    'Alingsås', 'Bengtsfors', 'Bollnäs', 'Borås',
    'Degerfors', 'Forshaga', 'Grums', 'Hagfors',
    'Hofors', 'Hällefors', 'Höganäs', 'Kramfors',
    'Munkfors', 'Mönsterås', 'Nässjö', 'Robertsfors',
    'Sotenäs', 'Storfors', 'Strängnäs', 'Torsås',
    'Tranås', 'Vilhelmina', 'Vännäs', 'Västerås',
}

def fix_genitive(name):
    """Return `name` with one trailing 's' removed.

    Names listed in `keep` are returned unchanged, as are names that do not
    end in 's'.
    """
    strip_last = name not in keep and name.endswith('s')
    return name[:-1] if strip_last else name

df_clean['municipality'] = df_clean['municipality'].apply(fix_genitive)

# Manual corrections applied after the generic trailing-'s' stripping.
# - The '...for' / '...nä' / 'Grum' entries restore names that lost an 's'
#   belonging to the name itself. They only take effect for a name that was
#   not protected by the `keep` set, so they act as a backstop.
# - 'Falu' is the irregular genitive of Falun ("Falu kommun"). It has no
#   trailing 's' for fix_genitive to strip, so without this entry the
#   municipality would be exported as 'Falu'.
name_fixes = {
    'Bengtsfor': 'Bengtsfors',
    'Degerfor': 'Degerfors',
    'Grum': 'Grums',
    'Hagfor': 'Hagfors',
    'Hofor': 'Hofors',
    'Hällefor': 'Hällefors',
    'Höganä': 'Höganäs',
    'Kramfor': 'Kramfors',
    'Munkfor': 'Munkfors',
    'Robertsfor': 'Robertsfors',
    'Sotenä': 'Sotenäs',
    'Storfor': 'Storfors',
    'Strängnä': 'Strängnäs',
    'Vännä': 'Vännäs',
    'Falu': 'Falun',
}

# Series.replace with a dict swaps whole values that match a key exactly.
df_clean['municipality'] = df_clean['municipality'].replace(name_fixes)

# Ensure code has leading zeros (should be 4 digits)

df_clean['code'] = df_clean['code'].str.zfill(4)

print("Sample municipality names (cleaned):")
print(df_clean['municipality'].head(10).tolist())

# Drop rows whose density could not be parsed (NaN after to_numeric coercion)
# and report how many were removed.
before_count = len(df_clean)
df_clean = df_clean.dropna(subset=['density'])
after_count = len(df_clean)

print(f"Rows before cleaning: {before_count}")
print(f"Rows after cleaning: {after_count}")
print(f"Removed {before_count - after_count} rows with missing density")

Sample municipality names (cleaned):
['Ale', 'Alingsås', 'Alvesta', 'Aneby', 'Arboga', 'Arjeplog', 'Arvidsjaur', 'Arvika', 'Askersund', 'Avesta']
Rows before cleaning: 290
Rows after cleaning: 290
Removed 0 rows with missing density


In [111]:
# Put the columns in their output order; copy so df_final is independent of
# df_clean.
output_columns = ['municipality', 'code', 'density']
df_final = df_clean.loc[:, output_columns].copy()

print(f"\nFinal dataset: {len(df_final)} municipalities")

# Show the ten densest and the ten sparsest municipalities as plain-text tables.
for heading, ranked in (
    ("\nTop 10 by density:", df_final.nlargest(10, 'density')),
    ("\nBottom 10 by density:", df_final.nsmallest(10, 'density')),
):
    print(heading)
    print(ranked.to_string(index=False))


Final dataset: 290 municipalities

Top 10 by density:
municipality code  density
  Sundbyberg 0183  6543.84
   Stockholm 0180  5336.51
       Solna 0184  4481.13
       Malmö 1280  2350.80
    Järfälla 0123  1677.32
     Lidingö 0186  1573.02
  Sollentuna 0163  1483.14
    Göteborg 1480  1369.23
        Täby 0160  1290.06
    Danderyd 0162  1234.20

Bottom 10 by density:
municipality code  density
    Arjeplog 2506     0.20
    Jokkmokk 2510     0.27
     Sorsele 2422     0.32
       Åsele 2463     0.64
      Pajala 2521     0.73
    Storuman 2421     0.76
  Vilhelmina 2462     0.77
     Dorotea 2425     0.81
  Härjedalen 2361     0.89
    Älvdalen 2039     0.99


In [112]:
# Export to CSV and, when running on Google Colab, trigger a browser download
output_path = 'cc5_data2.csv'
df_final.to_csv(output_path, index=False)

print(f"Data exported to {output_path}")
print(f"\nFirst 10 rows:")
print(df_final.head(10).to_string(index=False))

# google.colab only exists inside Colab runtimes. Guard the import so the
# notebook still runs to completion elsewhere; the CSV has already been
# written at this point.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; skipping browser download")
else:
    files.download(output_path)

Data exported to cc5_data2.csv

First 10 rows:
municipality code  density
         Ale 1440   102.82
    Alingsås 1489    90.70
     Alvesta 0764    20.29
       Aneby 0604    13.24
      Arboga 1984    42.83
    Arjeplog 2506     0.20
  Arvidsjaur 2505     1.07
      Arvika 1784    15.43
   Askersund 1882    13.97
      Avesta 2084    36.55


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>