In [None]:
!pip install geopy contextily

# Café Pre-Processing
In this notebook, we prepare the café data obtained from OpenStreetMap (OSM) for my café mapping project.

In [87]:
import urllib.request

import geopandas as gpd
import pandas as pd
from geopy.geocoders import Nominatim, ArcGIS, Bing
from tqdm import tqdm

Let's start by loading the data.

In [95]:
# Read the raw OSM export and report its coordinate reference system
# before previewing the first two records.
raw_cafes_path = 'cafes-osm-raw.geojson'
cafes: gpd.GeoDataFrame = gpd.read_file(raw_cafes_path)
print(cafes.crs)
cafes.head(2)

EPSG:4326


Unnamed: 0,id,@id,addr:city,addr:country,addr:housename,addr:housenumber,addr:postcode,addr:street,air_conditioning,alt_name,...,source,takeaway,toilets:wheelchair,website,website:menu,wheelchair,wheelchair:description,wikidata,wikipedia,geometry
0,way/64132690,way/64132690,,,,,,,,,...,,,,,,,,,,"POLYGON ((13.04248 47.79984, 13.04249 47.79971..."
1,way/104973073,way/104973073,Salzburg,AT,,8.0,5020.0,Siebenbürgerstraße,,,...,,,,,,,,,,"POLYGON ((13.02030 47.81832, 13.02028 47.81834..."


There are way too many columns, so let's do some radical filtering.

In [96]:
# Keep only the attributes needed for the map and discard cafés without a name.
kept_columns = ['name', 'opening_hours', 'website', 'geometry', 'addr:street', 'addr:housenumber']
cafes = cafes[kept_columns].dropna(subset='name')

Next, we convert any polygon locations to point locations.

In [97]:
# Replace every area geometry by its centroid so that each café is a single point.
# The centroid is computed in EPSG:3857 (a projected CRS, i.e. planar coordinates)
# and the result is transformed back to EPSG:4326 afterwards.
cafes = cafes.to_crs(epsg=3857)
# MultiPolygon is included as well: only matching 'Polygon' would leave multi-part
# areas untouched, and those have no .x/.y coordinates later on
is_area = cafes['geometry'].geom_type.isin(['Polygon', 'MultiPolygon'])
cafes.loc[is_area, 'geometry'] = cafes.loc[is_area, 'geometry'].centroid
cafes = cafes.to_crs(epsg=4326)
cafes.head()

Unnamed: 0,name,opening_hours,website,geometry,addr:street,addr:housenumber
0,Fabrizi Espresso,,,POINT (13.04251 47.79978),,
1,7-Bürger Café,"Tu,We,Fr,Su 08:00-18:00; Th off; Mo,Sa 08:00-1...",,POINT (13.02033 47.81828),Siebenbürgerstraße,8.0
2,Orangerie,,,POINT (13.06083 47.76207),,
3,Cafe Habakuk,Mo-Th 07:00-19:00; Fr 07:00-20:00; Sa 07:00-18:00,,POINT (13.04645 47.80295),Linzer Gasse,26.0
4,Cafe Classic,Mo-Sa 08:00-19:30,,POINT (13.04387 47.80269),Makartplatz,8.0


In [98]:
# Build an address line for every café: use the street entered on OSM where available,
# otherwise reverse-geocode the café location with the ArcGIS geocoder.
geocoder: ArcGIS = ArcGIS(user_agent='coffee-map-PLUS')


def _raw_address(row):
  """Return the unformatted address of one café row, or None if it cannot be determined."""
  street = row['addr:street']
  # pd.notna treats both None and NaN as missing
  if pd.notna(street):
    housenumber = row['addr:housenumber']
    # a street without a house number must not end up as 'Street None' / 'Street nan'
    street_line = f'{street} {housenumber}' if pd.notna(housenumber) else f'{street}'
    return f'{street_line}, Salzburg, Salzburg 5020, Austria'

  # .centroid leaves points unchanged and keeps the lookup working for other geometry types
  location_point = row['geometry'].centroid
  location = geocoder.reverse((location_point.y, location_point.x), timeout=10)
  # reverse() returns None when no address is found; keep the café, but without an address
  return None if location is None else location.address


def _format_address(raw_address):
  """Reduce a raw address to '<first component>, 5020 Salzburg'; missing values stay missing."""
  if pd.isna(raw_address):
    return raw_address
  return f'{raw_address.split(", ")[0]}, 5020 Salzburg'


# start geocoding
geocoding_addresses: list = [
    _raw_address(row) for _, row in tqdm(cafes.iterrows(), total=cafes.shape[0])
]

# and add the formatted results to the dataframe
cafes['address'] = [_format_address(raw_address) for raw_address in geocoding_addresses]

100%|██████████| 123/123 [00:27<00:00,  4.42it/s]


In [99]:
cafes.sample(5)

Unnamed: 0,name,opening_hours,website,geometry,addr:street,addr:housenumber,address
109,kowalsky,,,POINT (13.07714 47.78794),,,"Guggenbichlerstraße, 5020 Salzburg"
85,Natoo,"Mo-Su,PH 07:00-20:00",,POINT (13.04524 47.81319),,,"Südtiroler Platz 1, 5020 Salzburg"
30,Gugelhupf,,,POINT (13.07473 47.79071),,,"Aigner Straße 53, 5020 Salzburg"
90,Ricky's,,,POINT (13.06557 47.78627),,,"Friedensstraße 4, 5020 Salzburg"
40,Cafe No. 5,"Mo-Sa 09:00-17:00; Su 10:00-17:00; We,Th off",https://www.cafeno5-salzburg.at,POINT (13.04380 47.79976),Universitätsplatz,15.0,"Universitätsplatz 15, 5020 Salzburg"


The next issue is that some websites are not reachable. Let's identify them so that the affected cafés can be checked and, where necessary, removed from the data.

In [100]:
# Best-effort reachability check of every café website. Nothing is modified here;
# the printed indices are only used to decide manually which rows to remove.
WEBSITE_TIMEOUT_SECONDS: int = 10

for index, row in cafes.iterrows():
  website = row['website']
  # pd.notna treats both None and NaN as "no website entered"
  if pd.notna(website):
    # urlopen raises ValueError for scheme-less URLs such as 'www.example.at',
    # so add a default scheme instead of reporting them as unreachable
    url: str = website if '://' in website else f'http://{website}'
    try:
      # the timeout keeps a dead host from hanging the cell and the
      # context manager closes the connection again
      with urllib.request.urlopen(url, timeout=WEBSITE_TIMEOUT_SECONDS):
        pass
    except Exception:
      # deliberately broad: DNS failures, HTTP errors, timeouts and malformed URLs
      # all count as "not reachable"
      print(f'Could not reach {row["website"]} - index {index}')

  # Coffee Press has closed as well and must be removed. Note that this substring
  # match also reports other names containing 'press', e.g. 'Espresso'.
  if 'press' in row['name'].lower():
    print(f'{row["name"]} has index {index}')

Fabrizi Espresso has index 0
Could not reach http://www.cafewernbacher.at - index 11
Could not reach http://www.floeckner.at/ - index 36
Could not reach www.central-salzburg.at - index 37
Could not reach http://www.mensen.at/ - index 56
Could not reach https://www.s5er.at/ - index 65
Could not reach https://rosen.cafe/ - index 99
Coffee Press has index 114
Could not reach https://www.cafemus.at/tagesbar/ - index 115
Could not reach https://ratiocoffee.at/ - index 116


In [102]:
# Row labels picked manually from the report printed by the website check:
# 36, 37, 65 and 99 were listed as unreachable, 114 is the closed Coffee Press.
rows_to_remove: list[int] = [36, 37, 65, 99, 114]
# errors='ignore' keeps this cell re-runnable: without it, a second run raises
# a KeyError because the labels have already been dropped
cafes = cafes.drop(index=rows_to_remove, errors='ignore')
print(cafes.shape)

(118, 7)


Finally, we create three empty columns for my personal rating, recommendation, and comment, and then save the data set.

In [103]:
# Empty placeholder columns for my personal assessment of each café.
for author_column in ('author_rating', 'author_recommendation', 'author_comment'):
  cafes[author_column] = None

# State the output format explicitly instead of relying on the driver being
# inferred from the file extension.
cafes.to_file('cafes-osm-preprocessed.geojson', driver='GeoJSON')