In [1]:
# developed in Python 3
!python --version

Python 3.12.4


# Options

In [2]:
# country ISO A2 (two-letter) code
# Used below to list the country's cities, to name the cache folder
# (__CACHE/<COUNTRY>) and to name the output parquet file.
COUNTRY = 'CZ'

In [3]:
# min, max population of country cities to be included in GeoWeb data
# Both bounds are exclusive: list_country_cities keeps min < population < max.
# NOTE(review): the list_country_cities docstring says the minimum city population
# is 15000, so a lower CITY_POP_MIN presumably has no effect - confirm.
CITY_POP_MIN = 10000
CITY_POP_MAX = 100000000 # no practical upper limit; any large city passes

# from,to time range of webpages seen online to be included in GeoWeb data
# (years; passed as the from/to values of the web archive query in get_website_pages)
YEAR_FROM = 2023
YEAR_TO = 2024

In [4]:
# optional cache for functions
# True: joblib.Memory is created with the folder __CACHE/<COUNTRY>
# False: joblib.Memory is created with location None
CACHE_ENABLED = True

# Initialization

In [5]:
# these modules are used in 
# geo-web-method geo-web-demo geo-web-results
!pip install -r requirements.txt --disable-pip-version-check | tail -n 5



In [6]:
import json,copy,random,operator,sys,os,re

import numpy as np
import pandas as pd
import shapely as sh
import humanize as hm
import geopandas as gpd
import matplotlib.pyplot as plt

import geonamescache as gn
gn_cache = gn.GeonamesCache()

import folium
from IPython.display import HTML

import requests
from bs4 import BeautifulSoup

from requests_ratelimiter import LimiterSession
session = LimiterSession(per_second=1)
session.headers.update({'User-Agent': 'geo-web','email':'utko@vut.cz'})

from tqdm import tqdm
tqdm.pandas(file=sys.stdout,ncols=100)

from joblib import Memory
memory = Memory(location=f'__CACHE/{COUNTRY}' if CACHE_ENABLED else None,verbose=0)

In [7]:
# global websites
# https://majestic.com/reports/majestic-million?domain=&majesticMillionType=0&tld=&oq=&canUseDefault=
# Root domains of global (non-local) websites, taken from the third CSV column.
# A header row, if present, only adds its column title to the set, which is harmless.
global_roots = set()

with open('global-websites/global-websites.csv', 'r') as f:
    for line in f:
        columns = line.strip().split(',')
        # skip blank or truncated lines instead of failing with IndexError
        if len(columns) > 2:
            # lower case, because webroots are lower-cased before they are compared with this set
            global_roots.add(columns[2].lower())

# file names that mark technical pages; URLs containing them are dropped later
exclude_pages = {'robots.txt', 'sitemap.xml', '.htaccess','favicon.ico', 'humans.txt'}

# Functions

## REST_API_CALL

In [8]:
# we do not set a timeout for requests as REST API calls take a long time on their own
# we wait for the response or throw an exception if there is a connection error

In [9]:
# NOTE(review): results are memoized by joblib (see `memory`). joblib presumably keys
# the cache on this function's source, so editing the function is expected to
# invalidate previously cached responses - confirm before a long re-run.
# NOTE(review): a response is returned whatever its HTTP status, so failed
# (non-200) responses are presumably cached as well - confirm.
@memory.cache
def REST_API_CALL(url):
    '''
    Sends a GET request through the rate-limited session

    No timeout is set on purpose; the call waits for the response.
    The HTTP status is not checked here; callers inspect status_code.

    :param str url: request URL
    :return: response of the request
    :rtype: requests.Response
    :raises UserWarning: if the request itself fails (e.g. connection error)
    '''

    try:
        return session.get(url)
    except Exception as error:
        # Exception instead of a bare except: KeyboardInterrupt and SystemExit
        # must stop the cell, not be reported as a connection error
        raise UserWarning('REST API connection error.') from error

url = 'https://nominatim.openstreetmap.org/search?city=Paris&country=FR&format=json'
%time REST_API_CALL(url)

CPU times: user 0 ns, sys: 480 µs, total: 480 µs
Wall time: 396 µs


<Response [200]>

## parse_url

In [10]:
def parse_url(url):
    '''
    Parses domain, root domain, and endpath from URL

    Verifies URL to be valid: the domain must contain a dot and consist of
    letters, numbers, dots and hyphens only. Every part that cannot be
    determined is returned as pd.NA.

    The protocol is matched case-insensitively, surrounding whitespace is
    ignored and an explicit port (example.com:80) is removed from the domain.
    The root domain is the last two labels of the domain; multi-part suffixes
    such as co.uk are not recognized.

    :param str url: url to parse; anything that is not a string gives all pd.NA
    :return: domain, path, root domain
    :rtype: dict
    '''

    domain = pd.NA; path = pd.NA; root = pd.NA

    # None, NaN or pd.NA cannot be parsed
    if not isinstance(url, str):
        return {'domain':domain,'path':path,'root':root}

    url = url.strip()

    # remove protocol (HTTP:// and Https:// are valid spellings too)
    for protocol in ('https://','http://'):
        if url[:len(protocol)].lower() == protocol:
            url = url[len(protocol):]

    # domain and path
    parsed = url.split('/',1)
    domain = parsed[0]

    # remove explicit port, it is not part of the domain name
    host, colon, port = domain.rpartition(':')
    if colon and port.isdigit():
        domain = host

    # allowed chars alpha, numbers and .-
    if ('.' not in domain or
        not domain.replace('.','').replace('-','').isalnum()):

        domain = pd.NA

    if pd.notna(domain) and len(parsed) == 2:
        path = parsed[1]
        if path == '': path = pd.NA

    # other domain levels
    if pd.notna(domain):
        root = '.'.join(domain.rsplit('.',2)[-2:])

    return {'domain':domain,'path':path,'root':root}
    
assert parse_url(url= 'https://www.overleaf.com/user/subscription/plans') == {
    'domain': 'www.overleaf.com', 'path': 'user/subscription/plans',
    'root': 'overleaf.com'}

parse_url(url= 'https://www.overleaf.com/user/subscription/plans')

{'domain': 'www.overleaf.com',
 'path': 'user/subscription/plans',
 'root': 'overleaf.com'}

## list_country_cities

In [11]:
def list_country_cities(country):
    '''
    Lists names of the cities of a country, largest first

    Only cities with CITY_POP_MIN < population < CITY_POP_MAX are listed.
    NOTE(review): the city source is said to start at a population of 15000,
    so a lower CITY_POP_MIN presumably changes nothing - confirm.

    :param str country: country code ISO A2
    :return: city names sorted by population (descending); pd.NA if there are none
    :rtype: list
    '''

    # name and population of every city of the country inside the population bounds
    selected = [
        {'name':record['name'],'pop':record['population']}
        for record in gn_cache.get_cities().values()
        if record['countrycode'] == country
        and CITY_POP_MIN < record['population'] < CITY_POP_MAX
    ]

    # largest first; cities with equal population keep their source order
    by_population = sorted(selected,key=lambda entry: entry['pop'],reverse = True)
    names = [entry['name'] for entry in by_population]

    return names if names else pd.NA
    
%time list_country_cities(country='FR')[:5]

CPU times: user 48.9 ms, sys: 15.9 ms, total: 64.9 ms
Wall time: 64.7 ms


['Paris', 'Marseille', 'Lyon', 'Toulouse', 'Nice']

## city_to_area_id

In [12]:
def city_to_area_id(city,country):
    '''
    Resolves city to OSM area ID

    Searches Nominatim for the city and uses the first result whose
    osm_type is 'relation'.

    NOTE(review): the request URL is now URL-encoded, so it differs from the
    previous one for names with spaces or diacritics; responses cached under
    the old URL are presumably fetched again - confirm.

    :param str city: city
    :param str country: country
    :return: OSM area ID; pd.NA if the request fails or no relation is found
    :rtype: int
    :raises UserWarning: if the response data cannot be processed
    '''
    from urllib.parse import urlencode

    # offset added to the osm_id of a relation to get its area ID
    relation_area_offset = 3600000000

    area_id = pd.NA

    # urlencode escapes spaces, diacritics and reserved characters (&, #, +) in names
    query = urlencode({'city':city,'country':country,'format':'json'})
    url = f'https://nominatim.openstreetmap.org/search?{query}'

    response = REST_API_CALL(url)

    # a failed request means the city is not resolved
    if response.status_code != 200:
        return area_id

    try:
        response_data = json.loads(response.text)

        # find first relation-type record
        for record in response_data:
            if record['osm_type'] == 'relation':
                area_id = record['osm_id'] + relation_area_offset
                break
    except Exception as error:
        # Exception instead of a bare except, so an interrupt still stops the cell
        raise UserWarning('REST API data error.') from error

    return area_id

%time city_to_area_id(city='Paris',country='FR')

CPU times: user 0 ns, sys: 912 µs, total: 912 µs
Wall time: 769 µs


3600071525

## get_city_webpages

In [13]:
def get_city_webpages(area):
    '''
    Gets websites of city entities

    Asks Overpass for nodes and ways inside the area that have a 'website'
    tag. The answer is tab-separated text with one record per line.

    :param int area: OSM area ID of the city
    :return: webpage, lat, lon (all kept as text); pd.NA if nothing was found
    :rtype: list of dict
    '''

    websites = []

    query = f'''[out:csv(website,::lat,::lon)][timeout:300];
    area({area}) ->.city;
    (
    node['website'](area.city);
    way['website'](area.city);
    );
    out center;'''

    url = f'https://overpass-api.de/api/interpreter?data={query}'

    response = REST_API_CALL(url)

    # a failed request means no websites
    if response.status_code == 200:

        # the first line is skipped (header of the csv output)
        for record in response.text.split('\n')[1:]:
            fields = record.split('\t')

            # skip wrong record; this also covers the empty text after the last newline
            if len(fields) != 3:
                continue

            webpage,lat,lon = fields
            websites.append({'webpage':webpage,'lat':lat,'lon':lon})

    if len(websites) == 0: websites = pd.NA

    return websites
    
#3600007444 Paris, FR
%time get_city_webpages(area=3600007444)[:2]

CPU times: user 30.8 ms, sys: 0 ns, total: 30.8 ms
Wall time: 30.5 ms


[{'webpage': 'https://www.batobus.com/fr/station/notre-dame',
  'lat': '48.8518340',
  'lon': '2.3500556'},
 {'webpage': 'https://store.totalenergies.fr/en_EN/NF078037',
  'lat': '48.8346767',
  'lon': '2.2661460'}]

## get_website_pages

In [14]:
# calculate geo-web page limit for a local website
# 100 pages/year; at least one year is counted, otherwise YEAR_FROM == YEAR_TO
# gives a limit of 0 and every website with archived pages is dropped
# NOTE(review): archived pages of both YEAR_FROM and YEAR_TO appear in the results,
# so the range looks inclusive (YEAR_TO - YEAR_FROM + 1 years) - confirm the intended limit
geoweb_page_limit = max(YEAR_TO - YEAR_FROM,1)*100

def get_website_pages(website,year_from,year_to):
    '''
    Gets website subdomains and endpages

    Asks the web archive for HTML pages of the website and its subdomains
    that were archived with status 200 in the given years. At most
    geoweb_page_limit+1 pages are requested, so that callers can tell
    when the limit was exceeded.

    NOTE: the limit comes from the global YEAR_FROM/YEAR_TO options,
    not from the year_from/year_to arguments.

    :param str website: website
    :param int year_from: first year when pages were seen online (archived)
    :param int year_to: last year when pages were seen online (archived)
    :return: web (page URL), online (archive timestamp); pd.NA if nothing was found
    :rtype: list of dict
    '''

    pages = []

    query = f'*.{website}&output=json&filter=statuscode:200&filter=mimetype:text/html&from={year_from}&to={year_to}&fl=timestamp,original&collapse=urlkey&limit={geoweb_page_limit+1}'

    url = f'http://web.archive.org/cdx/search/cdx?url={query}'

    response = REST_API_CALL(url)

    # a failed request means no pages
    if response.status_code == 200:

        try:
            # the first row is skipped (presumably the header with field names)
            response_data = json.loads(response.text)[1:]
        except Exception:
            # best effort: data that cannot be parsed means no pages
            response_data = []

        for record in response_data:
            try:
                web_time = record[0]; web_url = record[1]
            except Exception:
                # skip wrong record and continue with the next one
                continue

            pages.append({'web':web_url,'online':web_time})

    if len(pages) == 0: pages = pd.NA

    return pages

%time get_website_pages(website='www.vut.cz',year_from=2023,year_to=2024)[:2]

CPU times: user 724 µs, sys: 0 ns, total: 724 µs
Wall time: 787 µs


[{'web': 'https://www.vut.cz/', 'online': '20230101115215'},
 {'web': 'https://www.vut.cz/120', 'online': '20240304145346'}]

In [15]:
def frame_info(frame,samples=5):
    '''
    Shows a random sample of a data frame, then its size, memory and dtypes

    :param DataFrame frame: data frame to describe
    :param int samples: max number of sampled rows
    '''

    row_count = len(frame)

    # never ask for more rows than the frame has
    display(frame.sample(min(samples,row_count)))

    # deep=True also measures the content of object columns
    memory_bytes = frame.memory_usage(deep=True).sum()

    print(f'Size {row_count}, Memory {hm.naturalsize(memory_bytes)}')
    print(frame.dtypes)

# Main processing

## Country cities

In [16]:
# one-row frame with the configured country
data = pd.DataFrame([COUNTRY],columns = ['country'])

# list cities in country (list of names, or pd.NA if no city passes the options)
data['cities'] = data.apply(
    lambda row: list_country_cities(country = row.country),axis = 1)

# drop the country if it has no cities
data = data.drop(data.loc[data.cities.isna()].index)

if len(data) == 0: 
    raise UserWarning('No cities found. Change city list options.')

# one row per city; a city name that occurs more than once is kept only once
data = data.explode('cities',ignore_index = True).rename(columns = {'cities':'city'})
data = data.drop_duplicates(subset = 'city')

# memory
data.country = data.country.astype('category')
data.city = data.city.astype('category')

# sample rows and size summary
frame_info(data)

Unnamed: 0,country,city
5,CZ,Olomouc
37,CZ,Písek
62,CZ,Kutná Hora
90,CZ,Bílina
95,CZ,Klášterec nad Ohří


Size 96, Memory 10.3 kB
country    category
city       category
dtype: object


In [17]:
# resolve cities to OSM area ID (one lookup per city, shown with a progress bar)
data['area'] = data.progress_apply(
    lambda row:city_to_area_id(city = row.city,country = row.country),axis = 1)

# drop cities that were not resolved (area is pd.NA)
data = data.drop(data.loc[data.area.isna()].index)

if len(data) == 0: 
    raise UserWarning('No cities found. Check REST API service.')

frame_info(data)

100%|█████████████████████████████████████████████████████████████| 96/96 [00:00<00:00, 5523.59it/s]


Unnamed: 0,country,city,area
54,CZ,Strakonice,3600425590
20,CZ,Jihlava,3600440427
71,CZ,Otrokovice,3600440877
36,CZ,Trutnov,3600440154
26,CZ,Česká Lípa,3600440197


Size 96, Memory 11.1 kB
country    category
city       category
area          int64
dtype: object


## City webpages

In [18]:
# get city websites (list of webpage/lat/lon dicts per city, or pd.NA)
data['webpages_meta'] = data.progress_apply(
    lambda row: get_city_webpages(area = row.area),axis = 1)

# drop cities without websites
data = data.drop(data.loc[data.webpages_meta.isna()].index)

if len(data) == 0:
    raise UserWarning('No city websites found. Check REST API service or change options.')

# area ID is not needed anymore
data.pop('area')

# one row per website; ignore_index renumbers the rows from 0
data = data.explode('webpages_meta',ignore_index=True).rename(
    columns = {'webpages_meta':'webpage_meta'})

# expand the dicts to columns; concat aligns rows by index,
# so it relies on the renumbering done by explode above
data = pd.concat([data,pd.json_normalize(data.webpage_meta)],axis = 1)
data.pop('webpage_meta')

# memory (coordinates stay text)
data.lat = data.lat.astype(pd.StringDtype('pyarrow'))
data.lon = data.lon.astype(pd.StringDtype('pyarrow'))

frame_info(data)

100%|█████████████████████████████████████████████████████████████| 96/96 [00:00<00:00, 4349.43it/s]


Unnamed: 0,country,city,webpage,lat,lon
9651,CZ,Brno,https://autoskolapelikan.cz/,49.205332,16.5940178
9694,CZ,Brno,https://www.zasilkovna.cz/pobocky/z-box-brno-l...,49.2020767,16.6685081
7061,CZ,Prague,https://www.palacove-zahrady.cz/,50.0908512,14.4048779
16160,CZ,Karlovy Vary,https://www.pensionrainbow.cz/cs/,50.2440342,12.8633133
20820,CZ,Rakovník,http://penzion-bezdekov.cz/,50.1037258,13.7422697


Size 21558, Memory 2.6 MB
country           category
city              category
webpage             object
lat        string[pyarrow]
lon        string[pyarrow]
dtype: object


In [19]:
# do not include global websites and drop duplicates
# webroot is the root domain of the website, pd.NA if the URL is not valid
data['webroot'] = data.webpage.progress_apply(lambda x: parse_url(x)['root'])
data.pop('webpage')

# drop websites with an invalid URL
data = data.drop(data.loc[data.webroot.isna()].index)

# do not include global websites
data.webroot = data.webroot.str.lower()
todrop = data.loc[data.webroot.isin(global_roots)]
data = data.drop(todrop.index)

# keep only websites that occur exactly once:
# keep=False drops every row of a root domain that occurs more than once
data = data.drop_duplicates(subset='webroot',keep=False)

# memory
data.webroot = data.webroot.astype(pd.StringDtype('pyarrow'))

frame_info(data)

100%|█████████████████████████████████████████████████████| 21558/21558 [00:00<00:00, 980766.71it/s]


Unnamed: 0,country,city,lat,lon,webroot
2290,CZ,Prague,50.1002331,14.4243957,cheesyandko.cz
12551,CZ,České Budějovice,48.949235,14.5022154,hotellaguna.cz
4317,CZ,Prague,50.1051888,14.4484318,mmproduction.eu
8347,CZ,Brno,49.192488,16.6080272,tripoli.cz
3380,CZ,Prague,50.0795897,14.4183065,zeleznakoule.cz


Size 9202, Memory 637.1 kB
country           category
city              category
lat        string[pyarrow]
lon        string[pyarrow]
webroot    string[pyarrow]
dtype: object


## Website pages

In [20]:
# search endpages of local websites (one web archive request per website)
data['webpages'] = data.progress_apply(
    lambda row:get_website_pages(website = row.webroot,year_from = YEAR_FROM,year_to = YEAR_TO),axis = 1)

# drop websites that do not have any pages
data = data.drop(data.loc[data.webpages.isna()].index)

# drop non-local websites; empirically defined
# to have more than 100 archived pages/year
# (get_website_pages requests geoweb_page_limit+1 pages, so a longer list means the limit was exceeded)
todrop = data.loc[data.webpages.apply(len) > geoweb_page_limit]
data = data.drop(todrop.index)

if len(data) == 0:
    raise UserWarning('No website pages found. Check REST API service or change options.')

# one row per page; ignore_index renumbers the rows from 0 so that concat below aligns
data = data.explode('webpages',ignore_index=True).rename(columns = {'webpages':'webpage'})
# expand the page dicts to the columns web and online
data = pd.concat([data,pd.json_normalize(data.webpage)],axis = 1)
data.pop('webpage'); data.pop('webroot')
# drop rows without a page URL
data = data.drop(data.loc[data.web.isna()].index)

# memory
data.web = data.web.astype(pd.StringDtype('pyarrow'))
data.online = data.online.astype(pd.StringDtype('pyarrow'))

frame_info(data)

100%|█████████████████████████████████████████████████████████| 9202/9202 [00:01<00:00, 5006.39it/s]


Unnamed: 0,country,city,lat,lon,web,online
21087,CZ,Prague,50.1253371,14.4446896,http://www.sutka.cz/stale-menu/k-pivu-1,20230326020758
6405,CZ,Prague,49.9584775,14.3260737,http://www.nadjezem.cz/type/video/,20231221214008
734,CZ,Prague,50.0824881,14.444303,https://www.belzepub.cz/online-menu/section:me...,20231201085006
6926,CZ,Prague,50.0813644,14.4254601,https://www.hospodalucerna.cz/category/nezaraz...,20230129080824
41168,CZ,Prague,50.1054121,14.3954275,https://www.estellacafe.cz/index.php/1009-2/14...,20230607131939


Size 116872, Memory 13.8 MB
country           category
city              category
lat        string[pyarrow]
lon        string[pyarrow]
web        string[pyarrow]
online     string[pyarrow]
dtype: object


In [21]:
# drop excluded webpages
# the names in exclude_pages are escaped and joined to one regex;
# a page is dropped if any of the names occurs anywhere in its URL
pattern = '|'.join([re.escape(page) for page in exclude_pages])
todrop = data.loc[data.web.str.contains(pattern,regex=True)]
# number of dropped pages
print(len(todrop))
data = data.drop(todrop.index)

if len(data) == 0:
    raise UserWarning('No website pages found. Check REST API service or change options.')

frame_info(data)

679


Unnamed: 0,country,city,lat,lon,web,online
62794,CZ,Brno,49.2178324,16.5539929,https://andini.cz/cs?utm_source=foodinc.gsclou...,20230610073033
1777,CZ,Prague,50.0841232,14.449345,https://www.barberstreet.cz/ru/pansky-strih,20240518034229
30725,CZ,Prague,50.0793584,14.4271346,https://dream-hostels.com/khmelnytskyi/,20230205205232
41621,CZ,Prague,50.0918296,14.4528268,https://bimart.cz/Produkt/fjk-olej-olivovy-ext...,20230414014814
73647,CZ,Pilsen,49.7782128,13.3703559,http://tvsluzba.eu/,20230605053854


Size 116193, Memory 14.7 MB
country           category
city              category
lat        string[pyarrow]
lon        string[pyarrow]
web        string[pyarrow]
online     string[pyarrow]
dtype: object


In [22]:
# drop invalid websites with a few pages
# webroot is the root domain of each page, pd.NA if the page URL is not valid
# NOTE(review): rows with webroot pd.NA are presumably neither counted by
# value_counts nor matched by isin, so they are kept - confirm this is intended
data['webroot'] = data.web.progress_apply(lambda x: parse_url(x)['root'])

# number of pages per root domain
counts = data.webroot.value_counts()
# root domains with 5 or fewer pages
webroots = set(counts.loc[counts <= 5].index)
todrop = data.loc[data.webroot.isin(webroots)]
data = data.drop(todrop.index)
#data.webroot.value_counts()
frame_info(data)

100%|███████████████████████████████████████████████████| 116193/116193 [00:00<00:00, 959372.51it/s]


Unnamed: 0,country,city,lat,lon,web,online,webroot
32262,CZ,Prague,50.0803207,14.4183208,http://antik-kant.cz/index.php/cs/sports-outdo...,20230323112718,antik-kant.cz
21658,CZ,Prague,50.1098012,14.5411443,http://www.athoz.cz/barmanskykurz/,20231204025442,athoz.cz
32144,CZ,Prague,50.0313967,14.5979964,https://mudrjitkahavlova.cz/.well-known/gpc.json,20240117052816,mudrjitkahavlova.cz
17020,CZ,Prague,50.0998998,14.4253366,https://merhautovo.cz/bochnik-novinek/bochnik-...,20230203100513,merhautovo.cz
79557,CZ,Olomouc,49.5973956,17.2401925,https://skolkausila.cz/op-jaki/,20231202213113,skolkausila.cz


Size 112765, Memory 21.4 MB
country           category
city              category
lat        string[pyarrow]
lon        string[pyarrow]
web        string[pyarrow]
online     string[pyarrow]
webroot             object
dtype: object


# Data store

In [23]:
# reorder columns (the helper column webroot is left out) and sort rows by city
data = data[['web','lat','lon','city','country','online']]
data = data.sort_values(by='city')

# make sure the output folder exists; writing the file fails otherwise
os.makedirs('geo-web-results',exist_ok=True)
data.to_parquet(f'geo-web-results/geo-web-{COUNTRY}.parquet',index=False)
frame_info(data)

Unnamed: 0,web,lat,lon,city,country,online
75983,https://www.real-escape-liberec.com/p/darkovy-...,50.7407339,15.0880697,Liberec,CZ,20230129050142
62069,https://pivokos.cz/wp-login.php?redirect_to=ht...,49.2084977,16.6025673,Brno,CZ,20230628135907
115306,https://www.sport-jicin.cz/gdpr,50.43939,15.3482222,Jičín,CZ,20230206173936
79012,https://cova.cz/cz/blog/test,49.5875936,17.2853575,Olomouc,CZ,20240226081436
46609,https://www.ms-albrechticka.cz/profil/katerina...,50.133654,14.5448869,Prague,CZ,20230930182645


Size 112765, Memory 14.3 MB
web        string[pyarrow]
lat        string[pyarrow]
lon        string[pyarrow]
city              category
country           category
online     string[pyarrow]
dtype: object
