In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup # For web scraping
from geopy.geocoders import Nominatim
import folium

# Lift pandas' display limits so frames are shown with every column and row.
pd.set_option('display.max_columns', None, 'display.max_rows', None)

In [4]:
# Download the Wikipedia page that lists the districts of Turku.
url = 'https://en.wikipedia.org/wiki/Districts_of_Turku'
# Without a timeout a stalled connection would block the kernel indefinitely.
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page further down.
response.raise_for_status()
response

<Response [200]>

In [5]:
# Parse the downloaded HTML. The parser is named explicitly so the result does
# not depend on which optional parsers happen to be installed, and so that
# Beautiful Soup does not emit its "no parser was explicitly specified" warning.
turku_soup = BeautifulSoup(response.content, 'html.parser')
# Keep the <table class="multicol"> elements of the page.
district_tables = turku_soup.find_all('table', class_='multicol')

In [6]:
# Build the district list from the link texts found in the district tables:
#   1. collect the text of every <a> element,
#   2. de-duplicate and remove the link texts "1" .. "6",
#   3. keep only the part of each text before ' District'.
# NOTE: the order of the result comes from iterating a set, so it is not
# guaranteed to be the same from one kernel session to the next.
turku_districts = [
    link_text.split(' District')[0]
    for link_text in list(
        set(a.text for table in district_tables for a in table.find_all('a'))
        - set(map(str, range(1, 7)))
    )
]
turku_districts_df = pd.DataFrame({'District': turku_districts})
turku_districts_df.head(5)

Unnamed: 0,District
0,Raunistula
1,Ispoinen
2,Kupittaa
3,III
4,Kohmo


In [7]:
# Nominatim geocoder client used below to look up each district; the
# user_agent string is the application name sent with every request.
geolocator = Nominatim(user_agent='turku_finder')

In [8]:
def _geocode_district(district):
    """Geocode one district of Turku with a single Nominatim request.

    Args:
        district: District name; the query sent is "<district>, Turku".

    Returns:
        dict with the keys 'Latitude', 'Longitude' and 'OSM_ID' taken from
        the geocoder's result.

    Raises:
        ValueError: If the geocoder returns no result for the query.
    """
    query = f"{district}, Turku"
    # One request per district: the result carries the latitude, the longitude
    # and the raw payload, so there is no need to geocode the same query once
    # per field.
    location = geolocator.geocode(query)
    if location is None:
        # geocode() returns None when nothing matches; name the district
        # instead of failing later with an AttributeError on None.
        raise ValueError(f"Nominatim returned no result for {query!r}")
    return {'Latitude': location.latitude,
            'Longitude': location.longitude,
            'OSM_ID': location.raw.get('osm_id')
           }


# One record per district, in the same order as the rows of turku_districts_df.
district_latlong = [_geocode_district(district)
                    for district in turku_districts_df['District'].to_numpy()
                   ]

In [9]:
# .size is the number of cells in the frame (rows x columns).
turku_districts_df.size

79

In [10]:
# Turn the geocoding records into a frame and attach its columns to the
# district table; the left join aligns the two on their index.
turku_districts_df = turku_districts_df.join(
    other=pd.DataFrame(data=district_latlong),
    how='left',
)

In [11]:
# Display the district table after the geocoding columns have been joined in.
turku_districts_df

Unnamed: 0,District,Latitude,Longitude,OSM_ID
0,Raunistula,60.464067,22.277629,8837205
1,Ispoinen,60.423706,22.273103,10762765
2,Kupittaa,60.447529,22.295501,25426008
3,III,60.443628,22.266857,8837201
4,Kohmo,60.45845,22.350649,10762964
5,Vasaramäki,60.439934,22.310273,10762971
6,Itäharju,60.449068,22.308755,10762972
7,Luolavuori,60.432101,22.279726,10762772
8,Perno,60.456526,22.160641,10761592
9,Moikoinen,60.41803,22.237311,10762421


In [17]:
# Keep a copy of the table on disk for later reuse ("for future ease");
# the index is not written to the file.

turku_districts_df.to_csv(path_or_buf='turku_districts.csv', index=False)

In [12]:
# Base map, centred on the coordinates below.
latitude, longitude = 60.275, 22.167
map_tku = folium.Map(location=[latitude, longitude], zoom_start=9)

# Appearance shared by every district marker.
marker_style = dict(
    radius=5,
    color='#f96706',
    weight=2,
    fill=True,
    fill_color='#F9E106',
    fill_opacity=0.8,
    parse_html=False,
)

# One circle marker per district; clicking it shows the district name.
district_points = turku_districts_df[['Latitude', 'Longitude', 'District']]
for point_lat, point_lng, district_name in district_points.itertuples(index=False, name=None):
    folium.CircleMarker(
        [point_lat, point_lng],
        popup=folium.Popup(district_name, parse_html=True),
        **marker_style,
    ).add_to(map_tku)

map_tku

In [13]:
import urllib, json 
from geojson import FeatureCollection, dump
import sys, os

In [78]:
# In-memory cache: id (as passed to get_polygon) -> decoded GeoJSON payload.
polygon_cache = dict()

def get_polygon(id):
    """Fetch the GeoJSON for an OSM id, caching it in memory and on disk.

    The payload is downloaded from polygons.openstreetmap.fr, wrapped in a
    FeatureCollection, written to ``geojson/id_<id>.json`` and stored in
    ``polygon_cache``. An id that is already cached is served from memory
    without any network or disk access.

    Args:
        id: The OSM id, used both in the request URL and as the cache key.

    Returns:
        The decoded GeoJSON payload, or None when it could not be downloaded,
        decoded or written (a message is printed in that case).
    """
    # Function-scope import: the file only does `import urllib`, which does
    # not by itself guarantee that the `urllib.request` submodule is loaded.
    import urllib.request

    if id in polygon_cache:
        # Cache hit: return immediately instead of downloading again.
        return polygon_cache[id]

    json_url = f"http://polygons.openstreetmap.fr/get_geojson.py?id={id}&params=0"
    try:
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(json_url) as response:
            gj = json.loads(response.read())
        feature_collection = FeatureCollection(gj)

        # '__file__' is a string literal here, so abspath() resolves it
        # against the current working directory.
        script_dir = os.path.dirname(os.path.abspath('__file__'))
        dest_dir = os.path.join(script_dir, "geojson")
        os.makedirs(dest_dir, exist_ok=True)
        file_name = f"id_{id}.json"
        path = os.path.join(dest_dir, file_name)
        with open(path, 'w') as f:
            dump(feature_collection, f)
    except (OSError, ValueError):
        # OSError covers network and file-system failures (URLError is a
        # subclass); ValueError covers a response body that is not valid JSON.
        print(f"Id {id} didn't produce a geojson file")
        return None

    # Only cache payloads that were also written to disk successfully.
    polygon_cache[id] = gj
    return gj

In [None]:
# For retrieving polygon_cache from the files in the geojson directory

def get_polygons_from_local():
    """Load the GeoJSON files stored in the local ``geojson`` directory.

    Every ``*.json`` file in the directory is read and the value under its
    'features' key is stored under the file's id, i.e. the file name without
    the ``id_`` prefix and without the ``.json`` extension.

    Returns:
        dict: id string -> 'features' value of the corresponding file. Empty
        when the directory does not exist or holds no ``.json`` files.
    """
    polygon_dict = dict()
    # '__file__' is a string literal here, so abspath() resolves it against
    # the current working directory.
    script_dir = os.path.dirname(os.path.abspath('__file__'))
    dest_dir = os.path.join(script_dir, "geojson")

    if not os.path.isdir(dest_dir):
        # Nothing has been saved yet.
        return polygon_dict

    # Sorted so the resulting dict has a reproducible insertion order.
    for file in sorted(os.listdir(dest_dir)):
        stem, extension = os.path.splitext(file)
        if extension != '.json':
            continue
        # Drop the "id_" prefix; the extension is already gone, so the key
        # is the bare id rather than e.g. "123.json".
        file_id = stem[len('id_'):] if stem.startswith('id_') else stem

        with open(os.path.join(dest_dir, file), 'r') as f:
            polygon_dict[file_id] = json.load(f)['features']

    return polygon_dict

In [15]:
# Request a polygon for every OSM id in the table. The loop variable is not
# called `id`, so the builtin of that name is not shadowed at notebook scope.
for osm_id in turku_districts_df['OSM_ID'].to_numpy():
    get_polygon(str(osm_id))

Id 25426008 didn't produce a geojson file
Id 32426215 didn't produce a geojson file
Id 259887364 didn't produce a geojson file
Id 26268396 didn't produce a geojson file
Id 612493471 didn't produce a geojson file
Id 32426230 didn't produce a geojson file
Id 32426216 didn't produce a geojson file
Id 32426228 didn't produce a geojson file
Id 32426241 didn't produce a geojson file
Id 32426247 didn't produce a geojson file
Id 25439723 didn't produce a geojson file
Id 618800549 didn't produce a geojson file
Id 26736838 didn't produce a geojson file
Id 6127718384 didn't produce a geojson file


In [70]:
# Cached ids as integers (the cache is keyed by id strings).
polygon_keys = list(map(int, polygon_cache))
# OSM ids present in the table but absent from the polygon cache.
missing_keys = list(set(turku_districts_df['OSM_ID'].values).difference(set(polygon_keys)))
# Show the districts those ids belong to.
turku_districts_df.loc[turku_districts_df['OSM_ID'].isin(missing_keys)]

Unnamed: 0,District,Latitude,Longitude,OSM_ID
2,Kupittaa,60.447529,22.295501,25426008
19,Kurjenmäki,60.440492,22.285279,32426215
21,Turku Airport,60.51339,22.263337,259887364
24,Iso-Heikkilä,60.45054,22.230574,26268396
26,VII,60.4544,22.253789,612493471
29,Jänessaari,60.421563,22.182297,32426230
31,Korppolaismäki,60.431529,22.235856,32426216
32,Oriniemi,60.397141,22.169158,32426228
44,Särkilahti,60.416012,22.195466,32426241
52,Paattinen,60.593798,22.383376,32426247


Naturally, some of the data doesn't line up. Part of the reason is that a district's region is perhaps not always the first result of a Nominatim query, so let's fix these results manually.

In [77]:
# Manual corrections for the OSM ids that produced no polygon.
# Rows are matched on the id Nominatim returned, not on positional row labels:
# the table's row order comes from iterating a set and is not stable between
# kernel sessions, so a label such as 2 or 19 may point at a different
# district on the next run. Matching on the id also makes this cell safe to
# re-run.
# The named pairs follow the missing-polygon table shown above. The last four
# (rows 53, 66, 71 and 77 in the original session) are paired with the
# remaining failed ids in the order they were reported - TODO confirm.
osm_id_corrections = {
    25426008: 8837204,     # Kupittaa
    32426215: 10762760,    # Kurjenmäki
    26268396: 10762963,    # Iso-Heikkilä
    612493471: 8837207,    # VII
    32426230: 10761568,    # Jänessaari
    32426216: 10761470,    # Korppolaismäki
    32426228: 10762411,    # Oriniemi
    32426241: 10762414,    # Särkilahti
    32426247: 10764096,    # Paattinen
    25439723: 10762764,
    618800549: 10761468,
    26736838: 10762761,
    6127718384: 8837202,
}
for wrong_id, right_id in osm_id_corrections.items():
    turku_districts_df.loc[turku_districts_df['OSM_ID'] == wrong_id, 'OSM_ID'] = right_id

# Nominatim does not have a region for the airport, and I don't think it is the best region anyway
airport_rows = turku_districts_df.index[turku_districts_df['District'] == 'Turku Airport']
# Dropping an empty selection is a no-op, so re-running the cell does not raise.
turku_districts_df.drop(airport_rows, axis=0, inplace=True)

In [79]:
# Request the polygons again now that the OSM ids have been corrected. The
# loop variable avoids the name `id` so the builtin stays usable afterwards.
for district_osm_id in turku_districts_df['OSM_ID'].to_numpy():
    get_polygon(str(district_osm_id))

In [80]:
# Integer form of every id currently held in the polygon cache.
polygon_keys = [int(cached_id) for cached_id in polygon_cache]
# Table ids that still have no cached polygon; the count is displayed.
missing_keys = [*(set(turku_districts_df['OSM_ID'].values) - set(polygon_keys))]
len(missing_keys)

0

In [81]:
# Base map, centred on the coordinates below.
latitude, longitude = 60.458038443754916, 22.267463230122978
map_tku = folium.Map(location=[latitude, longitude], zoom_start=11)

# Layer 1: every polygon held in the cache.
for boundary in polygon_cache.values():
    boundary_layer = folium.GeoJson(boundary)
    boundary_layer.add_to(map_tku)

# Layer 2: one circle marker per district, with the district name as popup.
coordinates = zip(turku_districts_df['Latitude'], turku_districts_df['Longitude'])
for (lat, lng), dist in zip(coordinates, turku_districts_df['District']):
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=folium.Popup(dist, parse_html=True),
        color='#f96706',
        weight=2,
        fill=True,
        fill_color='#F9E106',
        fill_opacity=0.8,
        parse_html=False,
    )
    marker.add_to(map_tku)

map_tku