In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup # For web scraping

# Lift pandas' display limits so wide or long frames are rendered in full.
for _option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option_name, None)

In [2]:
# Read the café address table from disk and preview its first ten rows.
cafe_df = pd.read_csv(filepath_or_buffer='cafe_addresses.csv')
cafe_df.head(n=10)

Unnamed: 0,Name,Address,City,PostalCode,Country,Latitude,Longitude
0,Brot & Spiele,Mariahilferstraße 17,Graz,8020,Austria,47.073272,15.433036
1,Brot und Spiele,Laudongasse 22,Vienna,1080,Austria,48.213407,16.349799
2,Café Benno,Alser Str. 67,Vienna,1080,Austria,48.21505,16.342587
3,Café Sperlhof,Große Sperlgasse 41,Vienna,1020,Austria,48.219658,16.37838
4,SpielBar,Lederergasse 26,Vienna,1080,Austria,48.213688,16.348476
5,The Playground (Hoofdkerk),Hoofdkerkstraat 7,Antwerp,2000,Belgium,51.221243,4.403738
6,The Playground (Station),Pelikaanstraat 3/1270,Antwerp,2018,Belgium,51.216644,4.420792
7,Outpost Antwerpen,Beggaardenstraat 6,Antwerp,2000,Belgium,51.21951,4.404468
8,La Luck Brussels,74 rue Washington,Brussels,1050,Belgium,50.823495,4.363401
9,La Table Food & Games,63 rue de l'enseignement,Brussels,1000,Belgium,50.848845,4.365281


In [3]:
# Scrape the City Mayors European-cities listing, which is spread over five
# numbered pages, and gather every city name into city_list.
city_list = []
for n in range(1, 6):
    url = f'http://www.citymayors.com/features/euro_cities{n}.html'
    # Without a timeout a stalled connection would hang the notebook forever.
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'lxml')
    # The data of interest sits in the second <table> of each page.
    table = soup.find_all('table')[1]
    # Names are read from the cells declared with width="140"; the first such
    # cell is skipped (presumably the column header - confirm against the page).
    name_cells = table.find_all('td', attrs={'width': '140'})[1:]
    # Title-case each name and strip trailing whitespace.
    city_table = [td.text.title().rstrip() for td in name_cells]
    city_list.extend(city_table)

Unfortunately, the data on City Mayors omits Turkey, so we need to collect this information from [Wikipedia](https://en.wikipedia.org/wiki/List_of_largest_cities_and_towns_in_Turkey). The smallest population value given in the City Mayors table is 149,000 people, so we will include all Turkish cities whose population exceeds this value.

In addition, a number of English counties, rather than cities, appear in the data, so we shall remove those. We also need to replace 'Roma' with 'Rome' so that the city is listed under its English name.

In [4]:
# Use the English spelling 'Rome' in place of 'Roma'.
# NOTE(review): str.replace rewrites the substring 'Roma' wherever it occurs
# in a name, not only in an exact match - confirm no other city is affected.
city_list = [name.replace('Roma', 'Rome') for name in city_list]

# Fetch the Wikipedia list of the largest Turkish cities and towns.
turkey_url = 'https://en.wikipedia.org/wiki/List_of_largest_cities_and_towns_in_Turkey'
# Without a timeout a stalled connection would hang the notebook forever.
turkey_resp = requests.get(turkey_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
turkey_resp.raise_for_status()
turkey_soup = BeautifulSoup(turkey_resp.content, 'lxml')
# First table on the page carrying the 'sortable' class.
turkey_table = turkey_soup.find('table', attrs={'class': 'sortable'})
# Every 8th cell starting from the 7th, with thousands separators removed.
# Presumably this is the population column - TODO confirm against the live
# page layout.
city_pop = [td.text.rstrip('\n').replace(',', '') for td in turkey_table.find_all('td')[6::8]]
# Text of every other link in the table, used as the city names.
# NOTE(review): assumes the links alternate city / non-city - confirm.
turkey_cities = [a.text for a in turkey_table.find_all('a')[::2]]

In [5]:
# Add every Turkish city whose population exceeds 149000, the smallest
# population in the City Mayors table.
for city, pop in zip(turkey_cities, city_pop):
    try:
        population = int(pop)
    except ValueError:
        # The scraped cell text is not a plain integer; skip this entry.
        # Only the parse error is caught so that unrelated failures surface.
        continue
    if population > 149000:
        city_list.append(city)
print(f'city_list now contains {len(city_list)} cities')

city_list now contains 543 cities


In [6]:
# Drop every entry whose name contains 'shire' (county names, not cities).
# The list is rebuilt rather than calling remove() while looping over it:
# removing during iteration shifts the remaining items left, so the element
# right after each removal is skipped and adjacent matches would survive.
# Slice assignment keeps the same list object.
# NOTE(review): if this filters out more entries than before, the hard-coded
# row labels used later on cwc_df must be re-checked.
city_list[:] = [city for city in city_list if 'shire' not in city]
print(f'city_list now contains {len(city_list)} cities')

city_list now contains 534 cities


In [7]:
# Distinct values of the café table's 'City' column, in sorted order.
cafe_city_list = sorted(cafe_df['City'].unique().tolist())

In [8]:
# Put the scraped city names in sorted order, keeping the same list object.
city_list[:] = sorted(city_list)

In [9]:
# Accumulator for the cities in which no café is found.
city_without_cafe = list()

In [10]:
def has_a_cafe(city):
    """Return True if any name in ``cafe_city_list`` occurs inside ``city``.

    The comparison is a case-insensitive substring test: each café city name
    is lower-cased and looked for within the lower-cased ``city`` string.
    Returns False when ``cafe_city_list`` is empty.
    """
    matches = []
    for cafe_city in cafe_city_list:
        matches.append(cafe_city.lower() in city.lower())
    return any(matches)

In [11]:
# Append every scraped city for which has_a_cafe() reports no match.
city_without_cafe.extend(
    candidate for candidate in city_list if not has_a_cafe(candidate)
)
print(f'\ncity_without_cafe has {len(city_without_cafe)} cities')


city_without_cafe has 439 cities


In [12]:
# Preview the first ten entries.
city_without_cafe[0:10]

['Aachen',
 'Abakan',
 'Aberdeen',
 'Adana',
 'Adapazarı',
 'Adıyaman',
 'Afyon',
 'Aksaray',
 'Alcalá De Henares',
 'Alicante']

In [13]:
# Wrap the list in a single-column DataFrame so rows can be edited by label.
cwc_df = pd.DataFrame(data=city_without_cafe, columns=['City'])
cwc_df.head(n=5)

Unnamed: 0,City
0,Aachen
1,Abakan
2,Aberdeen
3,Adana
4,Adapazarı


In [14]:
from geopy.geocoders import Nominatim
# Accumulates one {'City', 'Latitude', 'Longitude'} record per geocoded city.
ll_list = list()
# Nominatim geocoder client, identified by a custom user agent.
geolocator = Nominatim(user_agent='bgc_finder')

In [15]:
def get_ll(row):
    """Geocode a place name and return its coordinates.

    Parameters
    ----------
    row : str
        The query passed to ``geolocator.geocode``.

    Returns
    -------
    list
        ``[latitude, longitude]`` of the geocoder's result, or
        ``[np.nan, np.nan]`` when the geocoder returns ``None``.

    Errors raised by the geocoder call itself are not caught here and
    propagate to the caller, as before.
    """
    location = geolocator.geocode(row)
    # Check for "no result" explicitly rather than with a bare ``except``,
    # which would also hide unrelated errors.
    if location is None:
        return [np.nan, np.nan]
    return [location.latitude, location.longitude]

In [16]:
# Hand-made corrections, keyed by row label in cwc_df. Each listed row is
# overwritten with the given name (presumably a spelling the geocoder
# resolves - confirm). The labels depend on the exact contents and order of
# the scraped list, so they must be re-checked whenever the input changes.
renamed_rows = {
    34: 'Bila Tserkva',
    80: 'Chernivtsi',
    90: 'Kamianske',
    100: 'Yekaterinburg',
    135: 'Yoshkar-Ola',
    177: 'Kremenchuk',
    209: 'Makiivka',
    243: 'Nizhnevartovsk',
    259: 'Oldham, Greater Manchester',
    280: 'Piraeus',
    345: 'Stary Oskol',
    356: 'Syktyvkar',
    390: 'Yuzhno-Sakhalinsk',
    393: 'Van, İpekyolu',
    401: 'Vinnytsia',
    421: 'Yaroslavl',
}
for row_label, city_name in renamed_rows.items():
    cwc_df.loc[row_label] = city_name

# Rows removed outright:
#   294 - Rhondda Cynon Taf is a Welsh county whose largest town has less than 31000 people
#   381 - We perhaps don't want Turku within our training data
cwc_df.drop(index=[294, 381], inplace=True)

In [17]:
# Geocode each value of the first column and record it with its coordinates.
for city in cwc_df.iloc[:, 0].to_numpy():
    latitude, longitude = get_ll(city)
    ll_list.append({'City': city, 'Latitude': latitude, 'Longitude': longitude})

# Assemble the collected records into a DataFrame and preview it.
ll_df = pd.DataFrame(data=ll_list)
ll_df.head(n=5)

Unnamed: 0,City,Latitude,Longitude
0,Aachen,50.776351,6.083862
1,Abakan,53.720661,91.440369
2,Aberdeen,57.148243,-2.092809
3,Adana,36.993617,35.325835
4,Adapazarı,40.784799,30.399683


In [18]:
# Index labels of the rows for which no latitude is present.
null_idx = ll_df.index[ll_df['Latitude'].isnull()].tolist()
print(f'There are {len(null_idx)} rows missing data.')

if len(null_idx) > 0:
    # A bare expression nested inside an ``if`` is never rendered by the
    # notebook (only the last top-level expression of a cell is), so the
    # affected rows are printed explicitly.
    print(ll_df.loc[null_idx, :])

There are 0 rows missing data.


Because of the size of Russia, a number of its cities lie within Asia. As such, we will drop any cities that are east of the Ural Mountains (whose longitude, according to [Wikipedia](https://en.wikipedia.org/wiki/Ural_Mountains), is 60°E).

In [19]:
# Rows with a longitude greater than 60 are treated as lying in Asia.
east_of_60 = ll_df['Longitude'] > 60
asia_idx = ll_df.index[east_of_60]
# Remove those rows in place, then renumber the remaining rows from zero.
ll_df.drop(index=asia_idx, inplace=True)
ll_df.reset_index(drop=True, inplace=True)
ll_df.head(n=5)

Unnamed: 0,City,Latitude,Longitude
0,Aachen,50.776351,6.083862
1,Aberdeen,57.148243,-2.092809
2,Adana,36.993617,35.325835
3,Adapazarı,40.784799,30.399683
4,Adıyaman,37.78936,38.31411


In [20]:
# Write the final table to CSV, leaving out the index column.
ll_df.to_csv(path_or_buf='cities_without_cafes.csv', index=False)