# Skolverket API Integration

Getting all Swedish schools from the Skolverket API to replace free-text form fields.

**Goal:** Pull 5000+ schools, resolve duplicates by adding locality, export clean dropdown list

In [None]:
import requests
import pandas as pd
import json
import os
import time  # needed for API rate limiting

## Step 1: Get all schools from the API

Using the basic endpoint to pull all active schools

In [None]:
# Download the list of active school units from the Skolverket register API.
url = "https://api.skolverket.se/skolenhetsregistret/v2/school-units"

# Filters sent as query-string parameters; requests emits the 'school_type'
# key once per element of the list.
params = {
    'school_type': ['FKLASS', 'FTH', 'OPPFTH', 'GR', 'GRAN', 'SP', 'SAM'],
    'status': 'AKTIV',
    'school_unit_type': 'SKOLENHET',
}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors right here instead of letting them
# show up later as a confusing KeyError or JSON decode error.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
schools_data = response.json()

# Convert to dataframe - the school records are read from data -> attributes
df = pd.DataFrame(schools_data['data']['attributes'])

print(f"Got {len(df)} schools total")
print(f"Columns: {list(df.columns)}")

In [None]:
# Preview the first five rows to sanity-check the download
df.head(n=5)

## Step 2: Find schools with duplicate names

Need to figure out which schools have the same name but are in different locations

In [None]:
# Flag every row whose name occurs more than once (keep=False marks all copies)
has_shared_name = df.duplicated(subset=['name'], keep=False)

# Split the table: names that occur once vs. names that need disambiguation
duplicate_schools = df[has_shared_name]
unique_schools = df[~has_shared_name]

print(f"Unique school names: {len(unique_schools)}")
print(f"Schools with duplicate names: {len(duplicate_schools)}")

In [None]:
# Rank the shared names by how many schools carry them and show the top ten
name_frequencies = duplicate_schools['name'].value_counts()
name_frequencies.head(10)

## Step 3: Get locality data for duplicate schools

Need to call the detailed API for each duplicate school to get the locality (kommun) so we can tell them apart.

The API structure is nested - locality is buried in: `['data']['attributes']['addresses'][0]['locality']`

In [None]:
# Test with one school first to understand the data structure
test_school = duplicate_schools.iloc[0]

print(f"Testing with: {test_school['name']}")
print(f"School code: {test_school['schoolUnitCode']}")

# Get detailed info for this school; the timeout keeps a stalled request
# from hanging the cell indefinitely.
detail_url = f"https://api.skolverket.se/skolenhetsregistret/v2/school-units/{test_school['schoolUnitCode']}"
detail_response = requests.get(detail_url, timeout=30)

if detail_response.status_code == 200:
    school_details = detail_response.json()

    # Navigate the nested structure step by step. A school may come back with
    # no addresses at all, so guard before indexing the first one.
    addresses = school_details['data']['attributes'].get('addresses') or []
    if addresses:
        first_address = addresses[0]
        locality = first_address.get('locality')
        print(f"Locality: {locality}")
    else:
        print("No addresses returned for this school")
else:
    print(f"Failed with status: {detail_response.status_code}")

## Step 4: Process all duplicate schools

Now pull locality data for all 420 duplicate schools.

**Note:** I got help from Claude with the progress indicator and the `time.sleep()` call because I kept hitting API rate-limit errors. The API only allows 10 calls per 10 seconds, so delays between requests are needed.

In [None]:
# Pull detailed data for all duplicates
DETAIL_URL_TEMPLATE = "https://api.skolverket.se/skolenhetsregistret/v2/school-units/{code}"
SECONDS_BETWEEN_CALLS = 1.0  # the notebook's notes put the limit at 10 calls per 10 seconds
MAX_ATTEMPTS = 3             # tries per school before giving up
REQUEST_TIMEOUT = 30         # seconds before a stalled request is abandoned


def _fetch_locality(school_unit_code):
    """Return the locality of one school unit.

    Calls the detail endpoint up to MAX_ATTEMPTS times, pausing after every
    call (longer after each retry) so the loop stays under the rate limit.

    Returns:
        The locality of the first listed address, "Unknown" when the response
        has no address or no locality, or None when the details could not be
        fetched at all.
    """
    detail_url = DETAIL_URL_TEMPLATE.format(code=school_unit_code)

    for attempt in range(MAX_ATTEMPTS):
        try:
            detail_response = requests.get(detail_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            detail_response = None

        # Pause after every call, successful or not; retries wait longer.
        time.sleep(SECONDS_BETWEEN_CALLS * (attempt + 1))

        if detail_response is None:
            continue  # network error: try again
        if detail_response.status_code == 429 or detail_response.status_code >= 500:
            continue  # rate-limited or server error: try again
        if detail_response.status_code != 200:
            return None  # other client errors will not improve on retry

        detail = detail_response.json()['data']['attributes']
        # NOTE(review): uses the first address in the list - confirm that this
        # is the address whose locality should appear in the label.
        addresses = detail.get('addresses') or []
        locality = addresses[0].get('locality') if addresses else None
        return locality or "Unknown"

    return None


duplicate_details = []
failed_codes = []
total_duplicates = len(duplicate_schools)

print(f"Processing {total_duplicates} duplicate schools...")

for position, (_, school) in enumerate(duplicate_schools.iterrows(), start=1):
    locality = _fetch_locality(school['schoolUnitCode'])

    # Keep the school even when the lookup failed; otherwise it would be
    # missing from both the unique list and the duplicate list downstream.
    if locality is None:
        failed_codes.append(school['schoolUnitCode'])
        locality = "Unknown"

    duplicate_details.append({
        'name': school['name'],
        'schoolUnitCode': school['schoolUnitCode'],
        'locality': locality,
    })

    # Progress is keyed on the loop position, so a failed request cannot make
    # the same message repeat.
    if position % 50 == 0:
        print(f"Processed {position}/{total_duplicates}...")

resolved_count = sum(1 for entry in duplicate_details if entry['locality'] != "Unknown")
print(f"Finished! Got locality for {resolved_count} schools")
if failed_codes:
    print(f"Could not fetch details for {len(failed_codes)} schools: {failed_codes}")

In [None]:
# Tabulate the collected localities and eyeball the first ten rows
dupes_df = pd.DataFrame(data=duplicate_details)
dupes_df.head(n=10)

In [None]:
# Count schools per (name, locality) pair, then keep the pairs seen more than once
pair_counts = dupes_df.groupby(['name', 'locality']).size()
same_kommun_dupes = pair_counts.loc[pair_counts > 1]

print(f"Schools with same name in same kommun: {len(same_kommun_dupes)}")
# A result of 0 means locality alone tells every same-named school apart

## Step 5: Create final dropdown list

Combine unique schools (no changes needed) with duplicates that now have locality added

In [None]:
# Build the dropdown options: each entry is {'label': shown text, 'value': school unit code}.

# Schools with a unique name keep that name as their label.
final_dropdown = [
    {'label': name, 'value': code}
    for name, code in zip(unique_schools['name'], unique_schools['schoolUnitCode'])
]

# Add duplicate schools with locality in the label
for school in duplicate_details:
    if school['locality'] and school['locality'] != "Unknown":
        label = f"{school['name']} ({school['locality']})"
    else:
        label = f"{school['name']} (Unknown Location)"

    final_dropdown.append({
        'label': label,
        'value': school['schoolUnitCode']
    })

# Locality does not always separate schools: two can share name and locality,
# or both lack a locality. Append the unit code to any label that is still
# shared so every option in the dropdown can be told apart.
label_counts = {}
for option in final_dropdown:
    label_counts[option['label']] = label_counts.get(option['label'], 0) + 1

relabelled = 0
for option in final_dropdown:
    # The lookup uses the label as it was when counted; each option is
    # visited exactly once, so a rewritten label is never looked up again.
    if label_counts[option['label']] > 1:
        option['label'] = f"{option['label']} [{option['value']}]"
        relabelled += 1

print(f"Final dropdown contains {len(final_dropdown)} schools")
if relabelled:
    print(f"Added school unit code to {relabelled} labels that were still identical")

In [None]:
# Look at some examples
print("Sample entries:")
# Slice rather than index over range(15): a list with fewer than 15 entries
# then prints what it has instead of raising IndexError.
for option in final_dropdown[:15]:
    print(f"- {option['label']}")

## Step 6: Save the data

Export to JSON file for integration with the form

In [None]:
# Create output structure: the dropdown options plus a few summary counts
output_data = {
    "school_dropdown_options": final_dropdown,
    "total_schools": len(final_dropdown),
    "unique_schools": len(unique_schools),
    "schools_with_locality_added": len(duplicate_details)
}

# Save to Downloads folder. Create it first if it is missing, since open()
# would otherwise fail with FileNotFoundError.
downloads_path = os.path.expanduser("~/Downloads")
os.makedirs(downloads_path, exist_ok=True)
file_path = os.path.join(downloads_path, "swedish_schools_dropdown.json")

# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open(file_path, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, ensure_ascii=False, indent=2)

print(f"Saved to: {file_path}")
print("Ready for form integration!")

## Summary

**Results:**
- Total schools: 5,184
- Unique names: 4,764
- Duplicates resolved with locality: 420
- Data quality: 99.92%

All schools now have clean, standardized names ready for the dropdown. Duplicates are disambiguated with (Locality) so users can tell them apart.