## Step 2: Combine releases into a single feather file

* Read all Common Crawl releases into the same dataframe, and deduplicate based on URL and file contents

In [15]:
import os
import pandas as pd

# To parse GPX strings
import gpxpy

# For parallel_apply() in pandas
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

# To parse descriptions
from bs4 import BeautifulSoup

# To identify text language
import pycld2 as cld2 # pip install git+https://github.com/tmikus/pycld2.git

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


### Quick stats

In [16]:
dir = './interim/'

urls = (
    pd.concat([
        pd.read_feather(f'{dir}{r}') for r in os.listdir(dir) \
            if 'CC-MAIN-' in r and r.endswith('feather')
    ])
)

In [32]:
print('Total files:', urls.shape[0])
print('Downloaded files:', (~urls.responses.isna()).sum())
print('Downloaded files %:', round((~urls.responses.isna()).sum() / urls.shape[0] * 100, 1))
print('Unique URLs among downloaded:', urls.query('~responses.isna()').url.nunique())
print('Unique URLs among downloaded %:', round(urls.query('~responses.isna()').url.nunique() / (~urls.responses.isna()).sum() * 100, 1))

Total files: 112953
Downloaded files: 111102
Downloaded files %: 98.4
Unique URLs among downloaded: 102103
Unique URLs among downloaded %: 91.9


### Combine releases & deduplicate

In [33]:
dir = './interim/'

urls = (
    pd.concat([
        pd.read_feather(f'{dir}{r}') for r in os.listdir(dir) \
            if 'CC-MAIN-' in r and r.endswith('feather')
    ])
    .drop_duplicates(subset=['responses'])
    .drop_duplicates(subset=['url'])
    .reset_index()
    .drop(columns=['index'])
)

In [34]:
print('Number of files after deduplication:', urls.shape[0])

Number of files after deduplication: 94170


### Convert respones to GPX objects, and extract metadata

In [4]:
def get_gpx_metadata(s):
    try:
        gpx = gpxpy.parse(s)
        md = gpx.get_moving_data()

        # How many points are there in the first track?
        pts_in_first_track = sum(
            [ len(segment.points) for track in gpx.tracks[:1] for segment in track.segments ]
        )

        # What is the average number of points per 100 m in first track?
        pts_per_100m = pts_in_first_track / gpx.tracks[0].length_2d() * 100
        
        return {
            'n_tracks': len(gpx.tracks),
            'pts_per_100m': pts_per_100m,
            'description': gpx.description,
            'length_2d': round(gpx.length_2d())
        }
    except:
        return None

In [5]:
%%time
urls['gpx_metadata'] = urls.responses.parallel_apply(get_gpx_metadata)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11772), Label(value='0 / 11772')))…

CPU times: user 4.67 s, sys: 12.6 s, total: 17.2 s
Wall time: 8min 6s


In [6]:
urls['length_2d'] = urls.gpx_metadata.str['length_2d']
urls['pts_per_100m'] = urls.gpx_metadata.str['pts_per_100m']
urls['n_tracks'] = urls.gpx_metadata.str['n_tracks']

### Clean up the descriptions

In [7]:
%%capture

def clean_description(desc):
    """
    Returns a clean text description with HTML tags removed
    """
    try:
        desc = BeautifulSoup(desc).get_text()
        desc = desc.replace('\n', ' ').replace('\t', ' ').replace('\xa0', ' ')
        desc = desc.replace("l\'", ' ')
        desc = ' '.join(desc.strip().split())
        return desc
    except:
        return None

urls['description'] = urls.gpx_metadata.str['description'].apply(clean_description)

# Remove contents in square brackets using regex
urls.description = urls.description.str.replace(r'\[.*?\]', '', regex=True)

# Remove contents in curly brackets using regex
urls.description = urls.description.str.replace(r'\{.*?\}', '', regex=True)

# Strip trailing spaces
urls.description = urls.description.str.strip()

### Establish relevant description lengths

In [8]:
# We want descriptions of over 50 characters
# and ignore those that are too long (over 99th percentile ~ 2000 characters)
print(
    'Length at 99.0, 99.1, 99.2 percentiles: ',
    round(urls.description.str.len().quantile(0.990)),
    round(urls.description.str.len().quantile(0.991)),
    round(urls.description.str.len().quantile(0.992))
)

cap_value = 2000
urls['is_desc_long_enough'] = (
    urls.description.str.len().ge(50)
    & urls.description.str.len().le(cap_value)
)

Length at 99.0, 99.1, 99.2 percentiles:  1915 2074 2283


### Identify source description language
* We want to exclude tracks whose description language is Unknown by `pycld2` (these are typically low quality)

In [9]:
def detect_language(s):
    try:
        _, _, lang = cld2.detect(str(s))
        return lang[0][1]
    except:
        return 'un' # Unknown language

urls['description_lang'] = 'un' # Set all to Unknown first

# For descriptions of appropriate length, guess language
idx = urls.is_desc_long_enough
urls.loc[idx , 'description_lang'] = urls.loc[idx, 'description'].apply(detect_language)

In [10]:
urls.description_lang.value_counts()

un         89808
de          1910
fr          1651
en           334
uk           144
it            85
ru            48
cs            32
sl            29
sk            26
nl            17
hu            16
es            14
pl            13
hr             8
jw             6
ro             6
sv             5
nn             4
zh-Hant        2
bg             2
war            2
gl             2
da             1
no             1
rm             1
ca             1
el             1
co             1
Name: description_lang, dtype: int64

### Keep only relevant GPX files, and save to `interim/relevant_subset.feather`
* We want tracks with descriptions between 50-2000 characters
* With only 1 track
* Length is between 0.5km - 100km
* Description language is not *Unknown*
* At least 1 GPS point per 100 metres

In [11]:
relevant_subset = (
    urls
    .query('is_desc_long_enough')
    .query('n_tracks.eq(1)')
    .query('pts_per_100m.ge(1.0)')
    .query('length_2d.ge(500) & length_2d.le(100000)')
    .query('length_2d.ge(500) & length_2d.le(100000)')
    .query('description_lang.ne("un")')
    .reset_index()
    .drop(columns=['index'])
)

In [12]:
relevant_subset

Unnamed: 0,url,content_mime_type,content_mime_detected,warc_filename,warc_record_offset,warc_record_length,responses,gpx_metadata,length_2d,pts_per_100m,n_tracks,description,is_desc_long_enough,description_lang
0,https://www.massalubrenseturismo.it/wp-content...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947475...,869644243,3465,"<?xml version=""1.0""?>\n<gpx xmlns=""http://www....","{'n_tracks': 1, 'pts_per_100m': 10.18349861760...",1669.0,10.183499,1.0,Cala di Mitigliano is the right place to go if...,True,en
1,https://www.tv-markdorf.de/leichtathletik/GB_V...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947475...,1022531609,4227,"<?xml version=""1.0""?>\r\n<gpx xmlns=""http://ww...","{'n_tracks': 1, 'pts_per_100m': 3.848005470158...",8914.0,3.848005,1.0,Laufstrecke Erwachsene (Hauptlauf und Walking)...,True,de
2,https://www.tv-markdorf.de/leichtathletik/GB_V...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947475...,1028648267,2124,"<?xml version=""1.0""?>\n<gpx xmlns=""http://www....","{'n_tracks': 1, 'pts_per_100m': 6.303227603769...",2332.0,6.303228,1.0,Wettkampfstrecke für den Markdorfer Gehrenberg...,True,de
3,https://www.weggis-vitznau.ch/de/detail/poi/16...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947473...,1107219602,4914,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...","{'n_tracks': 1, 'pts_per_100m': 5.622297100355...",6705.0,5.622297,1.0,Der aussichtsreiche Aufstieg von Weggis nach R...,True,de
4,https://www.weggis-vitznau.ch/en/detail/poi/16...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2024-10/segments/1707947473...,1125700907,4914,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<gpx x...","{'n_tracks': 1, 'pts_per_100m': 5.622297100355...",6705.0,5.622297,1.0,The ascent from Weggis to Rigi Kaltbad passes ...,True,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3091,http://www.wandelenindoorwerth.nl/wandelroutes...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2023-14/segments/1679296945...,98721736,19665,"<?xml version=""1.0"" encoding=""UTF-8""?>\n\n<gpx...","{'n_tracks': 1, 'pts_per_100m': 2.742253566249...",9700.0,2.742254,1.0,"In de gemeente Renkum zijn vele oude bomen, ee...",True,nl
3092,http://www.wandeleninoosterbeek.nl/wandelroute...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2023-14/segments/1679296950...,96350674,9971,"<?xml version=""1.0"" encoding=""UTF-8""?>\n\n<gpx...","{'n_tracks': 1, 'pts_per_100m': 2.744365127605...",4336.0,2.744365,1.0,Ook in Oosterbeek landden massaal parachutiste...,True,nl
3093,http://www.wandeleninoosterbeek.nl/wandelroute...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2023-14/segments/1679296950...,100561887,12332,"<?xml version=""1.0"" encoding=""UTF-8""?>\n\n<gpx...","{'n_tracks': 1, 'pts_per_100m': 2.876903412149...",5318.0,2.876903,1.0,Landgoed de Vijverberg heet niet voor niets Vi...,True,nl
3094,https://www.datocms-assets.com/25489/162809011...,application/gpx+xml,application/gpx+xml,crawl-data/CC-MAIN-2023-14/segments/1679296945...,804035533,14262,"<?xml version=""1.0""?>\n<gpx xmlns=""http://www....","{'n_tracks': 1, 'pts_per_100m': 1.840742658275...",65354.0,1.840743,1.0,"Tato trasa je zpracována do videa, odkaz níže....",True,cs


In [14]:
relevant_subset.to_feather('interim/relevant_subset.feather')