In [365]:
import pandas as pd
import numpy as np
import requests
import json
from bs4 import BeautifulSoup
from pykml import parser
import re
import pgeocode
from geopy.geocoders import Nominatim

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [301]:
with open('Worldwide_Board_Game_Cafe_List.kml','r') as f:
    doc = parser.parse(f).getroot()

In [302]:
url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Europe'
response = requests.get(url)
response

<Response [200]>

In [303]:
soup = BeautifulSoup(response.content, 'lxml')
table = soup.find_all('table', attrs={'class':'wikitable'})[1]
eu_countries = [td.a.text for td in table.find_all('td')[2::7]]

In [304]:
eu_countries.remove('United Kingdom')
eu_countries.extend(['England', 'Scotland', 'Wales', 'Northern Ireland'])
eu_countries.sort()

In [305]:
eu_countries

['Albania',
 'Andorra',
 'Armenia',
 'Austria',
 'Azerbaijan',
 'Belarus',
 'Belgium',
 'Bosnia and Herzegovina',
 'Bulgaria',
 'Croatia',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'England',
 'Estonia',
 'Finland',
 'France',
 'Georgia',
 'Germany',
 'Greece',
 'Hungary',
 'Iceland',
 'Ireland',
 'Italy',
 'Kazakhstan',
 'Latvia',
 'Liechtenstein',
 'Lithuania',
 'Luxembourg',
 'Malta',
 'Moldova',
 'Monaco',
 'Montenegro',
 'Netherlands',
 'North Macedonia',
 'Northern Ireland',
 'Norway',
 'Poland',
 'Portugal',
 'Romania',
 'Russia',
 'San Marino',
 'Scotland',
 'Serbia',
 'Slovakia',
 'Slovenia',
 'Spain',
 'Sweden',
 'Switzerland',
 'Turkey',
 'Ukraine',
 'Vatican City',
 'Wales']

In [693]:
cafe_list = []

In [694]:
for e in doc.Document.findall('.//{http://www.opengis.net/kml/2.2}Placemark'):
    cafe_dict = dict()
    if e.ExtendedData.Data[5].value in eu_countries:
        cafe_dict = { 
            'Name': e.name.text,
            'Address': e.ExtendedData.Data[1].value.text,
            'City': e.ExtendedData.Data[2].value.text,
            'PostalCode': e.ExtendedData.Data[4].value.text,
            'Country': e.ExtendedData.Data[5].value.text
                    }
        cafe_list.append(cafe_dict)

In [695]:
for item in cafe_list:
    if item['Country'] == 'Czech Republic':
        item['Country'] = 'Czechia'

In [696]:
cafe_df = pd.DataFrame(cafe_list)
cafe_df.head(10)

Unnamed: 0,Name,Address,City,PostalCode,Country
0,Brot & Spiele,Mariahilferstraße 17,Graz,8020,Austria
1,Brot und Spiele,Laudongasse 22,Vienna,1080,Austria
2,Café Benno,Alser Str. 67,Vienna,1080,Austria
3,Café Sperlhof,Große Sperlgasse 41,Vienna,1020,Austria
4,SpielBar,Lederergasse 26,Vienna,1080,Austria
5,The Playground (Hoofdkerk),Hoofdkerkstraat 7,Antwerp,2000,Belgium
6,The Playground (Station),Pelikaanstraat 3/1270,Antwerp,2018,Belgium
7,Outpost Antwerpen,Beggaardenstraat 6,Antwerpen,2000,Belgium
8,The Playground,Hoofdkerkstraat 7,Antwerpen,2000,Belgium
9,La Luck Brussels,74 rue Washington,Brussels,1050,Belgium


In [310]:
gn_url = 'http://www.geonames.org/countries/'
gn_response = requests.get(gn_url)
gn_soup = BeautifulSoup(gn_response.content, 'lxml')

In [311]:
gn_table = gn_soup.find_all('tr')[2:]
code_list = []
for tr in gn_table:
    code_dict = dict()
    td = tr.find_all('td')
    if td[-1].text in ['EU', 'AS']: # We include AS because GeoNames includes Turkey within Asia
        code_dict = {
            'Country': td[4].text,
            'Code': td[0].text
                    }
        code_list.append(code_dict)


In [312]:
code_df = pd.DataFrame(code_list)

In [313]:
uk_countries = [{'Country': 'England', 'Code': 'GB'}, 
                {'Country': 'Scotland', 'Code': 'GB'}, 
                {'Country': 'Wales', 'Code': 'GB'}, 
                {'Country': 'Northern Ireland', 'Code': 'GB'}]

In [314]:
code_df = pd.concat([code_df, pd.DataFrame(uk_countries)])
code_df.set_index('Country', inplace=True)
code_df.drop('United Kingdom', axis=0, inplace=True)
code_df.sort_index()

Unnamed: 0_level_0,Code
Country,Unnamed: 1_level_1
Afghanistan,AF
Albania,AL
Andorra,AD
Armenia,AM
Austria,AT
Azerbaijan,AZ
Bahrain,BH
Bangladesh,BD
Belarus,BY
Belgium,BE


In [697]:
cafe_df = cafe_df.merge(code_df, how='left', on='Country')
cafe_df.head(10)

Unnamed: 0,Name,Address,City,PostalCode,Country,Code
0,Brot & Spiele,Mariahilferstraße 17,Graz,8020,Austria,AT
1,Brot und Spiele,Laudongasse 22,Vienna,1080,Austria,AT
2,Café Benno,Alser Str. 67,Vienna,1080,Austria,AT
3,Café Sperlhof,Große Sperlgasse 41,Vienna,1020,Austria,AT
4,SpielBar,Lederergasse 26,Vienna,1080,Austria,AT
5,The Playground (Hoofdkerk),Hoofdkerkstraat 7,Antwerp,2000,Belgium,BE
6,The Playground (Station),Pelikaanstraat 3/1270,Antwerp,2018,Belgium,BE
7,Outpost Antwerpen,Beggaardenstraat 6,Antwerpen,2000,Belgium,BE
8,The Playground,Hoofdkerkstraat 7,Antwerpen,2000,Belgium,BE
9,La Luck Brussels,74 rue Washington,Brussels,1050,Belgium,BE


In [355]:
def get_coords(row):
    geo = pgeocode.Nominatim(row[-1])
    coords = geo.query_postal_code(row[3])
    return [coords.latitude, coords.longitude]

In [356]:
coord_list = []

In [357]:
for row in cafe_df.values:
    coord_dict = dict()
    try:
        ll = get_coords(row)
    except:
        ll = [np.nan, np.nan]
    coord_dict = {
        'Latitude': ll[0], 
        'Longitude': ll[1]         
                 }
    coord_list.append(coord_dict)

In [358]:
coord_df = pd.DataFrame(coord_list)
coord_df.head(10)

Unnamed: 0,Latitude,Longitude
0,47.0232,15.5337
1,48.2167,16.35
2,48.2167,16.35
3,48.2167,16.4
4,48.2167,16.35
5,51.2199,4.4035
6,51.2199,4.4035
7,51.2199,4.4035
8,51.2199,4.4035
9,50.8333,4.3667


Let's check to see if there are any cafés that could not provide coordinates.

In [359]:
null_idx = coord_df.index[coord_df['Latitude'].isnull()].tolist()
null_cafes = cafe_df.iloc[null_idx]
null_cafes

Unnamed: 0,Name,Address,City,PostalCode,Country,Code
23,3 Trolls,"ul. ""Oborishte"" 80",Sofia,1505 Oborishte,Bulgaria,BG
42,The Games Table,"86 Magdalen St, Norwich",Norwich,NR311JF,England,GB
73,Dice Saloon,"Unit 6, Longley Industrial Estate, New England...",Brighton,BN14GY,England,GB
124,Game of trolls,8 rue de Paris,Moulins,3000,France,FR
192,Playhouse,Valtetsiou 49,Athens,106 81,Greece,GR
193,Κάισσα Cafe,"Μεσογείων 12, Αμπελόκηποι",Athens,11527,Greece,GR
194,Playce,Vasili Logothetidi 14,Athina,115 24,Greece,GR
195,Playhouse,Sakellariou 21,Ioannina,453 33,Greece,GR
196,Playhouse,Poulidou 6,Kavala,652 01,Greece,GR
197,Playhouse,Panagouli 16,Larisa,412 22,Greece,GR


Well, that's a number of missing data pieces. On a hunch, let's first count the number of cafés per country in both `cafe_df` and `null_cafes`.

In [323]:
cafe_df.groupby('Country').count()

Unnamed: 0_level_0,Name,City,PostalCode,Code
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Austria,5,5,5,5
Belgium,15,15,15,15
Bulgaria,4,4,4,4
Czechia,3,3,3,3
Denmark,11,11,11,11
England,78,78,78,78
Finland,2,2,2,2
France,57,57,57,57
Germany,17,17,17,17
Greece,11,11,11,11


In [324]:
null_cafes.groupby('Country').count()

Unnamed: 0_level_0,Name,City,PostalCode,Code
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bulgaria,1,1,1,1
England,2,2,2,2
France,1,1,1,1
Greece,11,11,11,11
Ireland,1,1,1,1
Italy,1,1,1,1
Moldova,1,1,1,1
Netherlands,8,8,8,8
Serbia,2,2,2,2
Sweden,1,1,1,1


By comparing these numbers, we can see that every café in Greece, Moldova and Serbia are missing, which suggests that `pgeocode` does not contain information for these countries (and a check of the [github page](https://github.com/symerio/pgeocode) for `pgeocode` confirms this). This will require utilising a different source for the information.
Otherwise, the postal codes for the remaining entries of `null_cafes` must contain errors and need to be manually changed.

So, what is next to sort out is a) making sure we can obtain the coordinates for Greece, Moldova and Serbia (and Netherlands)~~; and b) remove any duplicate entries within `cafe_df`~~.

Starting with the duplicates...

(In addition, we need to change the city names to match---e.g. issues within Belgium such as Antwerpen vs. Antwerp and Bruxelles and Brussels---in order to later group things by city.)

In [414]:
cafe_df

Unnamed: 0,Name,Address,City,PostalCode,Country,Code
0,Brot & Spiele,Mariahilferstraße 17,Graz,8020,Austria,AT
1,Brot und Spiele,Laudongasse 22,Vienna,1080,Austria,AT
2,Café Benno,Alser Str. 67,Vienna,1080,Austria,AT
3,Café Sperlhof,Große Sperlgasse 41,Vienna,1020,Austria,AT
4,SpielBar,Lederergasse 26,Vienna,1080,Austria,AT
5,The Playground (Hoofdkerk),Hoofdkerkstraat 7,Antwerp,2000,Belgium,BE
6,The Playground (Station),Pelikaanstraat 3/1270,Antwerp,2018,Belgium,BE
7,Outpost Antwerpen,Beggaardenstraat 6,Antwerpen,2000,Belgium,BE
8,The Playground,Hoofdkerkstraat 7,Antwerpen,2000,Belgium,BE
9,La Luck Brussels,74 rue Washington,Brussels,1050,Belgium,BE


In [698]:
cafe_df.drop(23, axis=0, inplace=True) # A search reveals that 3 Trolls in Bulgaria is permanently closed
cafe_df.loc[42, 'PostalCode'] = 'NR2 1EL'
cafe_df.loc[73, 'PostalCode'] = 'BN1 4JF'
cafe_df.loc[124, 'PostalCode'] = '03000'
cafe_df.loc[211, 'PostalCode'] = 'P75 XW35'
cafe_df.drop(220, axis=0, inplace=True) # This is in fact related to an event called Counters in Pontypridd, Wales, not Italy
cafe_df.loc[223, 'PostalCode'] = 'MD-2012'
cafe_df.loc[230, 'PostalCode'] = '9712 NP'
cafe_df.loc[231, 'PostalCode'] = '2011 LE'
cafe_df.loc[286, 'PostalCode'] = '411 19'
cafe_df.loc[291, 'PostalCode'] = '06490'

cafe_df.loc[7, 'City'] = 'Antwerp'
cafe_df.drop(8, axis=0, inplace=True)
cafe_df.loc[12, 'City'] = 'Brussels'
cafe_df.loc[16, 'Address'] = 'Rue Hors-Château 43'
cafe_df.loc[17, 'Address'] = 'Place Abbé Joseph André 11'
cafe_df.drop(33, axis=0, inplace=True)
cafe_df.loc[38, 'Address'] = 'Rushden Rd' # This is as accurate as Nominatim can get
cafe_df.drop(43, axis=0, inplace=True)
cafe_df.drop(44, axis=0, inplace=True)
cafe_df.loc[48, 'Address'] = '247 High Street'
cafe_df.drop(50, axis=0, inplace=True)
cafe_df.drop(72, axis=0, inplace=True)
cafe_df.loc[73, 'Address'] = '88 London Rd'
cafe_df.drop(74, axis=0, inplace=True)
cafe_df.loc[75, 'Address'] = '' # Needed in order for geopy to obtain geographical data
cafe_df.drop(79, axis=0, inplace=True)
cafe_df.loc[82, 'Address'] = 'Abinger place'
cafe_df.loc[84, 'Address'] = '207 Queensway'
cafe_df.loc[84, 'PostalCode'] = 'MK2 2EB'
cafe_df.loc[86, 'Address'] = '149 Albert Rd'
cafe_df.loc[89, 'Address'] = 'The Brooks Centre'
cafe_df.loc[103, 'Address'] = '19a Pepper Street'
cafe_df.loc[103, 'PostalCode'] = 'ST5 1PR'
cafe_df.loc[104, 'Name'] = 'Nerdy Coffee Co.'
cafe_df.loc[185, 'Address'] = 'Lehener Straße 15'
cafe_df.loc[194, 'Address'] = 'Λογοθετίδη Βασίλη 14'
cafe_df.loc[194, 'City'] = 'Athens'
cafe_df.loc[196, 'PostalCode'] = '65302'
cafe_df.loc[197, 'PostalCode'] = '41221'
cafe_df.loc[198, 'Address'] = 'Δημητριου Ράλλη 4'
cafe_df.loc[200, 'Address'] = 'Γεωργίου Παπανδρέου 27'
cafe_df.loc[200, 'PostalCode'] = '54645'
cafe_df.loc[202, 'Address'] = 'Βασιλίσσης Σοφίας'
cafe_df.loc[205, 'Address'] = 'Ferenc körút 17'
cafe_df.drop(206, axis=0, inplace=True)
cafe_df.loc[207, 'Name'] = 'Pub Game Up!'
cafe_df.loc[211, 'Address'] = '9 High Street'
cafe_df.drop(212, axis=0, inplace=True)
cafe_df.loc[213, 'Address'] = '51 Wellington Quay'
cafe_df.loc[214, 'PostalCode'] = 'D02 FP40'
cafe_df.loc[215, 'PostalCode'] = 'H91 Y90F'
cafe_df.loc[216, 'Address'] = 'Via Giuseppe Toniolo 12'
cafe_df.loc[223, 'Address'] = 'Strada Alexandr Pușkin 52'
cafe_df.loc[233, 'City'] = 'Skopje'
cafe_df.loc[234, 'Address'] = 'Rosepark, Upper Newtownards Road' # This is as accurate as Nominatim allows
cafe_df.loc[234, 'PostalCode'] = 'BT4 3SB'
cafe_df.loc[235, 'Address'] = 'Holywood Road'
cafe_df.loc[235, 'PostalCode'] = 'BT4 1NT'
cafe_df.loc[238, 'Address'] = 'Dmowskiego 15'
cafe_df.loc[239, 'Address'] = 'Kamienna 7'
cafe_df.loc[248, 'Name'] = 'Ludoclube'
cafe_df.loc[248, 'PostalCode'] = '2720-046'
cafe_df.loc[249, 'Name'] = 'Pow Wow'
cafe_df.loc[249, 'Address'] = 'Rua Professor Fernando da Fonseca 19'
cafe_df.loc[249, 'PostalCode'] = '1600-235'
cafe_df.loc[250, 'Name'] = 'A Jogar é que a gente se entende'
# I don't know how to fix the address for A Jogar; Nominatim can go fuck itself.
cafe_df.loc[252, 'Name'] = 'Snakes & Wizards'
cafe_df.loc[252, 'Address'] = 'Strada Ilarie Chendi 5'
cafe_df.loc[253, 'Address'] = 'Strada Samuil Micu 4'
cafe_df.loc[256, 'Name'] = 'FatCats Board Game Cafe'
cafe_df.loc[256, 'PostalCode'] = '100337'
cafe_df.drop(264, axis=0, inplace=True)
cafe_df.drop(272, axis=0, inplace=True)
cafe_df.loc[280, 'Address'] = "Carrer de l'Alandir 1" # The actual address "Carrer Hospitalers de Sant Joan n.2" is a footpath so doesn't show
cafe_df.loc[282, 'Address'] = 'Av. Manuel Torres 5' # In Nominatim the 'de' given in Google Map returns an error
cafe_df.loc[284, 'Address'] = 'Carrer de Rosselló i Cazador, 7'
cafe_df.loc[292, 'Address'] = 'Nail Bey Sk. No:48/2' # In Nominatim the street name is Nail Bey, not Nailbey as with Google Maps

In checking `cafe_df` we can see that cafés are not yet group by cities, so let's do that and then reset the indices.

In [699]:
cafe_df.sort_values(['Country', 'City'], inplace=True)
cafe_df.reset_index(drop=True, inplace=True)
cafe_df.head()

Unnamed: 0,Name,Address,City,PostalCode,Country,Code
0,Brot & Spiele,Mariahilferstraße 17,Graz,8020,Austria,AT
1,Brot und Spiele,Laudongasse 22,Vienna,1080,Austria,AT
2,Café Benno,Alser Str. 67,Vienna,1080,Austria,AT
3,Café Sperlhof,Große Sperlgasse 41,Vienna,1020,Austria,AT
4,SpielBar,Lederergasse 26,Vienna,1080,Austria,AT


In [364]:
cafe_df.to_csv('cafe_addresses.csv', index=False)

In [366]:
geolocator = Nominatim(user_agent='bgc_finder')

In [402]:
cafe_df.values[0]

array(['Brot & Spiele', 'Mariahilferstraße 17', 'Graz', '8020', 'Austria',
       'AT'], dtype=object)

In [700]:
ll_list = []

In [524]:
location = geolocator.geocode('Nailbey Sk. No. 48, 34714, Turkey') 
location.latitude

AttributeError: 'NoneType' object has no attribute 'latitude'

In [403]:
def get_ll(row):
    location = geolocator.geocode(f"{row[1]}, {row[3]} {row[2]}, {row[4]}")
    return [location.latitude, location.longitude]

In [701]:
for row in cafe_df.values:
    ll_dict = dict()
    try:
        ll = get_ll(row)
    except:
        ll = [np.nan, np.nan]
    ll_dict = {'Latitude': ll[0], 'Longitude': ll[1]}
    ll_list.append(ll_dict)

In [702]:
ll_df = pd.DataFrame(ll_list)
ll_df.head()

Unnamed: 0,Latitude,Longitude
0,47.073272,15.433036
1,48.213407,16.349799
2,48.21505,16.342587
3,48.219658,16.37838
4,48.213688,16.348476


In [703]:
null_idx = ll_df.index[ll_df['Latitude'].isnull()].tolist()
cafe_df.loc[null_idx,:]

Unnamed: 0,Name,Address,City,PostalCode,Country,Code
212,Spellencentrum Agartha,Legmeerdijk 169a,Aalsmeer,1432 KA,Netherlands,NL
213,2 Klaveren,De Clercqstraat 136,Amsterdam,1052 NP,Netherlands,NL
215,Escaping Breda,"Boschstraat 112 -116,",Breda,4811 GK,Netherlands,NL
216,Barbaar,Sint Agathaplein 4,Delft,2611 HR,Netherlands,NL
217,Bordspelers,Lange Groenendaal 82,Gouda,2801 LV,Netherlands,NL
218,Purperen Draak,Nieuwe Ebbingestraat 86a,Groningen,9712 NP,Netherlands,NL
220,Spellenhuis,Torenstraat 142,The Hague,2513 BW,Netherlands,NL
222,Jack Straws,"Rosepark, Upper Newtownards Road",Belfast,BT4 3SB,Northern Ireland,GB
223,Robins Hobby Cafe,Holywood Road,Belfast,BT4 1NT,Northern Ireland,GB
238,A Jogar é que a gente se entende,Rua Joaquim Maria de Melo 244,Vila do Conde,4480-002,Portugal,PT


In [704]:
len(null_idx)

10

In [537]:
cafe_df[cafe_df['Country'] == 'Spain']

Unnamed: 0,Name,Address,City,PostalCode,Country,Code
265,Kaburi Cafe,"Passeig de Sant Joan, 11",Barcelona,8010,Spain,ES
266,Continental Cafe Pub,"C/ Seminario, 11",Granada,18011,Spain,ES
267,La Pifia,C/ Hospitalers de Sant Joan n.2 bajos,Lleida,25005,Spain,ES
268,Epic Board Game Cafe,Calle Los Vascos 3,Madrid,28040,Spain,ES
269,Comejuega,"Av. de Manuel Torres, 5",Málaga,29003,Spain,ES
270,Play Planet Coffe & Shop,Calle San Lorenzo Street 18,Málaga,29001,Spain,ES
271,GTS Mallorca,"Carrer Rosselló i Caçador, 7",Palma De Mallorca,7004,Spain,ES
272,Bruixes i Fades,"Carrer de Rubén Darío, 18",Valencia,46021,Spain,ES
