## Step 4: Produce final dataset 

### Read GPX files into a geodataframe
* First, write response strings into .gpx files
* Then, read them one by one and append into a list
* Construct a new geodataframe from the `relevant_subset` + these gpx geometries

In [200]:
import pandas as pd

# For all things geospatial
import geopandas as gpd
import shapely
from shapely.geometry import LineString, MultiLineString
from geopy.distance import geodesic
import srtm # pip install git+https://github.com/tkrajina/srtm.py
import gpxpy

# To mask phone numbers
import phonenumbers

# For parallel_apply() in pandas
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

# For translation
import argostranslate.package
import argostranslate.translate

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [201]:
relevant_subset = (
    pd.read_feather(
        './interim/relevant_subset_llm.feather'
    )
    .query('is_activity_description.eq("True")')
    .query('contains_personal_info.eq("False")')
)

In [202]:
relevant_subset.shape

(1483, 16)

### Get elevation data
* Where elevation data does not exist, use SRTM (see [SRTM.py](https://github.com/tkrajina/srtm.py) on GitHub)

In [204]:
relevant_subset.head(5)

Unnamed: 0,url,content_mime_type,content_mime_detected,warc_filename,warc_record_offset,warc_record_length,responses,gpx_metadata,length_2d,pts_per_100m,n_tracks,description,is_desc_long_enough,description_lang,is_activity_description,contains_personal_info
0,https://www.massalubrenseturismo.it/wp-content...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947475...,869644243,3465,"<?xml version=""1.0""?>\n<gpx xmlns=""http://www....",{'description': 'Cala di Mitigliano is the rig...,1669.0,10.183499,1.0,Cala di Mitigliano is the right place to go if...,True,en,True,False
3,https://www.weggis-vitznau.ch/de/detail/poi/16...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947473...,1107219602,4914,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",{'description': 'Der aussichtsreiche Aufstieg ...,6705.0,5.622297,1.0,Der aussichtsreiche Aufstieg von Weggis nach R...,True,de,True,False
4,https://www.weggis-vitznau.ch/en/detail/poi/16...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947473...,1125700907,4914,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",{'description': 'The ascent from Weggis to Rig...,6705.0,5.622297,1.0,The ascent from Weggis to Rigi Kaltbad passes ...,True,en,True,False
5,https://www.la-plagne.com/apidae/download/4790...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947476...,833356093,2361,"<?xml version=""1.0"" encoding=""UTF-8"" standalon...",{'description': 'Départ de la chapelle St-Grat...,3364.0,6.658653,1.0,Départ de la chapelle St-Grat. Prendre le sent...,True,fr,True,False
6,https://www.la-plagne.com/apidae/download/4794...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947476...,843108081,2724,"<?xml version=""1.0"" encoding=""UTF-8"" standalon...",{'description': 'Départ du parking de la Glièr...,2936.0,8.855443,1.0,Départ du parking de la Glière. Au pont traver...,True,fr,True,False


In [207]:
geoms = []
elev_profiles = []
length_3d = []

for i, response in relevant_subset.responses.items():

    if i % 25 == 0:
        print(f'Processing {i}...')
    
    track = gpxpy.parse(response)
    elev_source = 'GPS'

    # If elevation does not exist, use SRTM
    if track.get_uphill_downhill().uphill == 0 and track.get_uphill_downhill().downhill == 0:
        elevation_data.add_elevations(track, smooth=True) # Cached, so will work fast 2nd/3rd/... time
        elev_source = 'DEM'

    length_3d.append( track.length_3d() )
    
    path = f'./gpx/{i}.gpx'
    
    with open(path, 'w') as f:
        f.write(track.to_xml())

    try:
        g = gpd.read_file(path, layer='tracks').iloc[0].geometry
        g = MultiLineString(g) if g.geom_type == 'LineString' else g
        elevations = gpd.read_file(path, layer='track_points').ele.tolist()

        # Add elevations to each point
        lines = []
        for line in g.geoms:
            lines.append( [ (x, y, elevations.pop(0)) for x, y in line.coords] )

        # Recreate a MultiLineString Z (with elevations)
        geoms.append(MultiLineString(lines))

        # Add elevation profile
        elev_profiles.append({
            'highest': track.get_elevation_extremes().maximum,
            'lowest': track.get_elevation_extremes().minimum,
            'uphill': track.get_uphill_downhill().uphill,
            'downhill': track.get_uphill_downhill().downhill,
            'source': elev_source
        })
        
    except:
        geoms.append(None)
        elev_profiles.append({})

Processing 0...
Processing 25...


ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 9, column 13


Processing 50...
Processing 100...
Processing 125...
Processing 150...
Processing 175...
Processing 200...
Processing 225...
Processing 250...
Processing 575...
Processing 600...
Processing 625...
Processing 650...
Processing 675...


ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 11, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 11, column 13


Processing 725...


ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 10, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 11, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 11, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 11, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 11, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 11, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 43, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 43, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 43, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-forme

Processing 775...
Processing 800...
Processing 825...
Processing 950...
Processing 975...
Processing 1000...


ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 9, column 13


Processing 1025...
4 2884802
Processing 1075...
Processing 1350...
Processing 1375...
Processing 1400...
Processing 1425...
Processing 1450...
Processing 1475...
Processing 1500...
Processing 1525...
Processing 1550...
Processing 1575...
Processing 1600...
Processing 1625...
Processing 1675...


ERROR:fiona._env:`./gpx/1712.gpx' not recognized as a supported file format.


Processing 1725...
Processing 1750...
Processing 1775...
Processing 2100...
Processing 2125...
4 2884802
Processing 2150...
Processing 2200...
Processing 2225...
Processing 2250...
Processing 2525...
Processing 2550...
Processing 2575...
Processing 2600...
Processing 2625...
Processing 2650...


ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 2545, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 2545, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 2545, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 2545, column 13
ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 2545, column 13


Processing 2725...
Processing 2750...
Processing 2775...
4 2884802


ERROR:fiona._env:XML parsing of GPX file failed : not well-formed (invalid token) at line 11, column 13
ERROR:fiona._env:`./gpx/2903.gpx' not recognized as a supported file format.


Processing 2900...
Processing 2925...
Processing 2950...
Processing 2975...
Processing 3025...
Processing 3075...


In [208]:
assert len(elev_profiles) == len(geoms) == relevant_subset.shape[0]

In [213]:
final = (
    gpd.GeoDataFrame(
        relevant_subset, 
        geometry=geoms,
        crs=4326
    )
    .assign(
        elev_highest=[ x.get('highest', None) for x in elev_profiles ],
        elev_lowest=[ x.get('lowest', None) for x in elev_profiles ],
        uphill=[ x.get('uphill', None) for x in elev_profiles ],
        downhill=[ x.get('downhill', None) for x in elev_profiles ],
        elev_source= [ x.get('source', None) for x in elev_profiles ],
        length_3d=[round(x) for x in length_3d]
    )
    .dropna(subset=['geometry'])
)

# Convert 3-dimensional length from float to int
final.length_2d = final.length_2d.astype(int)

In [215]:
final

Unnamed: 0,url,content_mime_type,content_mime_detected,warc_filename,warc_record_offset,warc_record_length,responses,gpx_metadata,length_2d,pts_per_100m,...,description_lang,is_activity_description,contains_personal_info,geometry,elev_highest,elev_lowest,uphill,downhill,elev_source,length_3d
0,https://www.massalubrenseturismo.it/wp-content...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947475...,869644243,3465,"<?xml version=""1.0""?>\n<gpx xmlns=""http://www....",{'description': 'Cala di Mitigliano is the rig...,1669,10.183499,...,en,True,False,MULTILINESTRING Z ((14.34008 40.59049 322.3100...,322.970000,12.560000,19.296000,329.046000,GPS,1728
3,https://www.weggis-vitznau.ch/de/detail/poi/16...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947473...,1107219602,4914,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",{'description': 'Der aussichtsreiche Aufstieg ...,6705,5.622297,...,de,True,False,MULTILINESTRING Z ((8.43320 47.03175 435.00000...,1435.000000,435.000000,1021.300000,21.300000,GPS,6844
4,https://www.weggis-vitznau.ch/en/detail/poi/16...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947473...,1125700907,4914,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...",{'description': 'The ascent from Weggis to Rig...,6705,5.622297,...,en,True,False,MULTILINESTRING Z ((8.43320 47.03175 435.00000...,1435.000000,435.000000,1021.300000,21.300000,GPS,6844
5,https://www.la-plagne.com/apidae/download/4790...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947476...,833356093,2361,"<?xml version=""1.0"" encoding=""UTF-8"" standalon...",{'description': 'Départ de la chapelle St-Grat...,3364,6.658653,...,fr,True,False,MULTILINESTRING Z ((6.57567 45.53249 1128.0000...,1138.000000,606.000000,299.000000,821.000000,DEM,3824
6,https://www.la-plagne.com/apidae/download/4794...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947476...,843108081,2724,"<?xml version=""1.0"" encoding=""UTF-8"" standalon...",{'description': 'Départ du parking de la Glièr...,2936,8.855443,...,fr,True,False,MULTILINESTRING Z ((6.60311 45.54558 783.00000...,1013.000000,783.000000,330.000000,330.000000,DEM,3202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3090,http://www.wandelenindoorwerth.nl/wandelroutes...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2023-14/segments/1679296945...,98938201,15569,"<?xml version=""1.0"" encoding=""UTF-8""?>\n\n<gpx...",{'description': 'Kasteel Doorwerth is uit 1300...,8209,2.606965,...,nl,True,False,"MULTILINESTRING Z ((5.78685 51.96664 13.00000,...",64.000000,9.253150,127.303846,127.303846,DEM,8220
3092,http://www.wandeleninoosterbeek.nl/wandelroute...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2023-14/segments/1679296950...,96350674,9971,"<?xml version=""1.0"" encoding=""UTF-8""?>\n\n<gpx...",{'description': 'Ook in Oosterbeek landden mas...,4336,2.744365,...,nl,True,False,"MULTILINESTRING Z ((5.81502 51.99027 65.00000,...",67.177470,38.914072,54.875503,54.875503,DEM,4339
3093,http://www.wandeleninoosterbeek.nl/wandelroute...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2023-14/segments/1679296950...,100561887,12332,"<?xml version=""1.0"" encoding=""UTF-8""?>\n\n<gpx...",{'description': 'Landgoed de Vijverberg heet n...,5318,2.876903,...,nl,True,False,"MULTILINESTRING Z ((5.85752 52.00328 65.00000,...",70.807307,31.000000,61.585648,61.585648,DEM,5322
3094,https://www.datocms-assets.com/25489/162809011...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2023-14/segments/1679296945...,804035533,14262,"<?xml version=""1.0""?>\n<gpx xmlns=""http://www....",{'description': 'Tato trasa je zpracována do v...,65354,1.840743,...,cs,True,False,MULTILINESTRING Z ((16.67128 48.87455 246.4100...,282.010000,149.710000,974.810000,973.450000,GPS,65492


### Translate descriptions to English

In [261]:
# Only translate popular languages (5+ examples)
lang_counts = final.description_lang.value_counts()
relevant_langs = lang_counts[lang_counts.gt(5)].index.tolist()
relevant_langs.remove('hr') # Croatian does not exist in argos
relevant_langs

ValueError: list.remove(x): x not in list

In [217]:
# Download and install Argos Translate packages
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()

for lang in relevant_langs:
    if lang == 'en': # We don't want to translate from English
        continue

    try:
        package_to_install = next(
            filter(
                lambda x: x.from_code == lang and x.to_code == 'en', available_packages
            )
        )
        argostranslate.package.install_from_path(package_to_install.download())
    except:
        print(f'Could not install {lang}...')

In [218]:
%%time

def translate(text, lang_from, lang_to='en'):
    if lang_from == lang_to: # no need to translate en<->en
        return text

    if not lang_from in relevant_langs:
        return None
        
    return argostranslate.translate.translate(text, lang_from, lang_to)

final['description_en'] = final.parallel_apply(
    lambda row: translate(row['description'], row['description_lang']),
    axis=1
)

python(12733) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=180), Label(value='0 / 180'))), HB…

python(12734) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(12735) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(12736) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(12737) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(12738) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(12739) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(12740) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(12741) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


CPU times: user 229 ms, sys: 168 ms, total: 398 ms
Wall time: 9min 3s


In [219]:
final.dropna(subset=['description_en'], inplace=True)

### Mask phone numbers, URLs, and emails in descriptions 🎭

In [220]:
# Remove emails - for regex, see https://html.spec.whatwg.org/multipage/input.html#valid-e-mail-address
email_regex = r"[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*"

final.description = final.description.str.replace(email_regex, '<EMAIL>', regex=True)
final.description_en = final.description_en.str.replace(email_regex, '<EMAIL>', regex=True)

In [221]:
# Remove URLs - courtesy of https://stackoverflow.com/a/8234912/4361039
url_regex = r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+(:[0-9]+)?|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)'

final.description = final.description.str.replace(url_regex, '<URL>', regex=True)
final.description_en = final.description_en.str.replace(url_regex, '<URL>', regex=True)

In [222]:
def mask_telephone(s):
    s = str(s)
    for match in phonenumbers.PhoneNumberMatcher(s, None):
        s = s.replace(match.raw_string, '<TELEPHONE>')
    return s

final.description = final.description.apply(mask_telephone)
final.description_en = final.description_en.apply(mask_telephone)

### Add country name
* Use country boundaries from https://www.geoboundaries.org/globalDownloads.html
* Assign country based on the very first point in the linestring

In [223]:
countries_gdf = gpd.read_file(
    'https://github.com/wmgeolab/geoBoundaries/raw/main/releaseData/CGAZ/geoBoundariesCGAZ_ADM0.gpkg'
).rename(columns={'shapeName': 'country'}).to_crs(4326)

In [224]:
start_points = final.geometry.apply(
    lambda g: shapely.get_point(g.geoms[0], 0)
).to_frame()

end_points = final.geometry.apply(
    lambda g: shapely.get_point(g.geoms[-1], -1)
).to_frame()

In [225]:
countries = gpd.sjoin_nearest(
    start_points, countries_gdf, how='left'
).country

final.loc[:, 'country'] = countries




### Measure distances between start and end points to determine if routes are circular

In [234]:
distances = [ geodesic(p1, p2).m for p1, p2 in list(zip(
    start_points.geometry.apply(lambda p: p.coords[0][:2][::-1]),
    end_points.geometry.apply(lambda p: p.coords[0][:2][::-1])
)) ]

In [235]:
is_circular = [ True if d <= 300 else False for d in distances]

In [236]:
final['is_circular'] = is_circular

### 💾 Finally, save to geopackage (.gpkg) 

In [238]:
(final
 .to_file('final/final.full.gpkg')
)

In [253]:
(final
 .filter([
     'url',
     'warc_filename',
     'warc_record_offset',
     'warc_record_length',
     
     'country',
     'description',
     'description_lang',
     'description_en',

     'elev_source',
     'elev_highest',
     'elev_lowest',
     'uphill',
     'downhill',
     
     'length_2d',
     'length_3d',
     'is_circular',
     
     'geometry'
 ])
 .rename(columns={
     'description': 'desc',
     'description_lang': 'desc_lang',
     'description_en': 'desc_en',
     'warc_filename': 'warc_file',
     'warc_record_offset': 'warc_offset',
     'warc_record_length': 'warc_len',
 })
 .to_file('final/final.gpkg')
)

In [259]:
final.elev_source.eq('GPS').mean()

0.5388418079096046