In [1]:
#  Copyright 2022 Institute of Advanced Research in Artificial Intelligence (IARAI) GmbH.
#  IARAI licenses this file to You under the Apache License, Version 2.0
#  (the "License"); you may not use this file except in compliance with
#  the License. You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

In [2]:
import os
import sys

# Alternatively, in order to make the module imports work properly, set PYTHONPATH=$PWD before launching the notebook server from the repo root folder.
sys.path.insert(0, os.path.abspath("../"))  # noqa:E402

![t4c20logo](../t4c20logo.png)

This notebook explores the supersegment ETA values and provides a simple baseline to generate a submission.
The baseline logic uses the total volume of all inputs (loop counter values) in a 15-minute input frame to assign the frame to one of 10 classes (clustered intervals of volume). These 10 cluster classes can be seen as a signal of total traffic load in the city.
The ETAs per supersegment are aggregated (median) in these 10 classes. The resulting map can then be used to look up the ETAs for the test data.

In [3]:
import pandas
import numpy as np

from pathlib import Path

import t4c22

from t4c22.misc.t4c22_logging import t4c_apply_basic_logging_config
from t4c22.t4c22_config import load_basedir

In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Automatically reload imported modules before running code, so edits to imported modules are picked up.
%load_ext autoreload
%autoreload 2
# Autosave the notebook every 60 seconds.
%autosave 60

Autosaving every 60 seconds


In [5]:
# Set up logging through the t4c22 helper, requesting the DEBUG level.
t4c_apply_basic_logging_config(loglevel="DEBUG")

In [6]:
# Load BASEDIR from file, change to your data root.
# BASEDIR is used below as the root path (joined with `/`) of the train, test,
# snapshots and submissions folders.
BASEDIR = load_basedir(fn="t4c22_config.json", pkg=t4c22)

# Use already generated snapshots of aggregated median ETAs.
# When False, create_prediction recomputes the medians from the training data
# and (re)writes the snapshot file.
USE_ETA_BASELINE_SNAPSHOTS = False

# Used in the snapshot file name, the submission sub-folder and the ZIP file name.
EXPERIMENT_NAME = 'exp_c10'
# Number of volume classes: selects the entry of STATIC_VOLUME_CLUSTERS, or the
# number of quantile intervals to compute when no static entry exists.
NUM_VOLUME_CLUSTERS = 10

In [7]:
def load_train_input(city):
    """Read all training counter files of a city into one data frame.

    Every ``counters_*.parquet`` file below ``BASEDIR/train/<city>/input`` is
    read in sorted file-name order and the frames are concatenated. A ``vol``
    column holding the row-wise sum of the ``volumes_1h`` entries is added.

    Args:
        city: Name of the city sub-folder (e.g. ``'london'``).

    Returns:
        The concatenated data frame including the additional ``vol`` column.
    """
    input_dir = BASEDIR / 'train' / city / 'input'
    frames = [pandas.read_parquet(path) for path in sorted(input_dir.glob('counters_*.parquet'))]
    print(f'Read {len(frames)} training input files for {city}')
    combined = pandas.concat(frames)
    # Stack the per-row volume sequences into a 2-D array and sum each row.
    # Assumes all ``volumes_1h`` entries have the same length.
    volume_matrix = np.array(combined['volumes_1h'].to_numpy().tolist())
    combined['vol'] = volume_matrix.sum(axis=1)
    return combined


def load_train_labels(city):
    """Read all training ETA label files of a city into one data frame.

    Every ``eta_labels_*.parquet`` file below ``BASEDIR/train/<city>/labels``
    is read in sorted file-name order and the frames are concatenated.

    Args:
        city: Name of the city sub-folder (e.g. ``'london'``).

    Returns:
        The concatenated label data frame.
    """
    labels_dir = BASEDIR / 'train' / city / 'labels'
    frames = [pandas.read_parquet(path) for path in sorted(labels_dir.glob('eta_labels_*.parquet'))]
    print(f'Read {len(frames)} training label files')
    labels = pandas.concat(frames)
    print(f'Labels loaded: {len(labels)}')
    return labels


def get_cluster_id(volume_clusters, vol):
    """Return the id of the first cluster whose interval contains ``vol``.

    Args:
        volume_clusters: Iterable of ``(cluster_id, lower_bound, upper_bound)``
            tuples; each interval is half-open, ``[lower_bound, upper_bound)``.
        vol: The volume value to classify.

    Returns:
        The matching cluster id, or ``-1`` if no interval contains ``vol``.
    """
    matching_ids = (
        cluster_id
        for cluster_id, low, high in volume_clusters
        if low <= vol < high
    )
    return next(matching_ids, -1)


def get_cluster_ids(volume_clusters, df, group_fields):
    """Sum ``vol`` per group and assign each group to a volume cluster.

    Args:
        volume_clusters: Iterable of ``(cluster_id, lower_bound, upper_bound)``
            tuples as consumed by ``get_cluster_id``.
        df: Data frame with a ``vol`` column and the columns in ``group_fields``.
        group_fields: Column name(s) to group by (e.g. ``['day', 't']``).

    Returns:
        A data frame with one row per group containing the group columns, the
        summed ``vol`` and a ``cluster`` column. ``cluster`` is ``-1`` for
        groups whose volume falls outside all cluster intervals.
    """
    # Select 'vol' *before* aggregating: summing every column first is wasted
    # work and, with pandas >= 2.0 (numeric_only defaults to False), also tries
    # to sum the non-numeric columns such as the 'volumes_1h' arrays.
    df_groups = df.groupby(group_fields)[['vol']].sum().reset_index()
    df_groups['cluster'] = [get_cluster_id(volume_clusters, vol) for vol in df_groups['vol']]
    return df_groups


def load_tests(city):
    """Read the test counter file of a city and add the total volume per row.

    Args:
        city: Name of the city sub-folder (e.g. ``'london'``).

    Returns:
        The data frame read from ``BASEDIR/test/<city>/input/counters_test.parquet``
        with an additional ``vol`` column holding the row-wise sum of the
        ``volumes_1h`` entries.
    """
    test_file = BASEDIR / 'test' / city / 'input' / 'counters_test.parquet'
    tests = pandas.read_parquet(test_file)
    # Assumes all ``volumes_1h`` entries have the same length, so they stack
    # into a 2-D array.
    volume_matrix = np.array(tests['volumes_1h'].to_numpy().tolist())
    tests['vol'] = volume_matrix.sum(axis=1)
    return tests


def find_volume_clusters(city):
    """Derive the volume clusters of a city from its training input data.

    Args:
        city: Name of the city sub-folder (e.g. ``'london'``).

    Returns:
        The list of ``(cluster_id, lower_bound, upper_bound)`` tuples produced
        by ``compute_volume_clusters`` for the city's training input.
    """
    return compute_volume_clusters(load_train_input(city))


def compute_volume_clusters(df, num_clusters=None):
    """Split the per-frame total volumes into quantile-based intervals.

    The ``vol`` values are summed per ``(day, t)`` group and the resulting
    totals are cut at evenly spaced quantiles. The outermost bounds are widened
    by ``1e5`` so that values slightly outside the observed range still fall
    into the first or last interval.

    Args:
        df: Data frame with the columns ``day``, ``t`` and ``vol``.
        num_clusters: Number of intervals to create. Defaults to the
            module-level ``NUM_VOLUME_CLUSTERS`` when ``None``.

    Returns:
        A list of ``(cluster_id, lower_bound, upper_bound)`` tuples with
        consecutive ids starting at 0.
    """
    if num_clusters is None:
        num_clusters = NUM_VOLUME_CLUSTERS
    # Select 'vol' before aggregating so that no other (possibly non-numeric)
    # column is summed; only the per-frame total volume is needed here.
    frame_volumes = df.groupby(['day', 't'])['vol'].sum()
    quants = list(frame_volumes.quantile(np.linspace(0, 1, num_clusters + 1)))
    # Widen the outer bounds to tolerate volumes outside the training range.
    quants[0] -= 1e5
    quants[-1] += 1e5
    return [
        (cluster_id, lower, upper)
        for cluster_id, (lower, upper) in enumerate(zip(quants[:-1], quants[1:]))
    ]


# Derived values from loop counter volumes
# Layout: {number of clusters: {city: [(cluster_id, lower_bound, upper_bound), ...]}}.
# get_cluster_id treats each interval as half-open: lower_bound <= vol < upper_bound.
# NOTE(review): presumably generated with compute_volume_clusters on the training
# data (the outer bounds look widened) - confirm and regenerate if the data changes.
STATIC_VOLUME_CLUSTERS = {
    1: {
        'london': [(0, -46599.0, 7806420.0)],
        'madrid': [(0, 263688.0, 12361861.0)],
        'melbourne': [(0, 35418.79999999999, 6885818.7)]
    },
    10: {
        'london': [
            (0, -46599.0, 932240.2999999999),
            (1, 932240.2999999999, 1554726.4),
            (2, 1554726.4, 2588921.9000000004),
            (3, 2588921.9000000004, 3921954.0),
            (4, 3921954.0, 4863117.5),
            (5, 4863117.5, 5771968.0),
            (6, 5771968.0, 6005322.7),
            (7, 6005322.7, 6232288.4),
            (8, 6232288.4, 6493853.5),
            (9, 6493853.5, 7806420.0)
        ],
        'madrid': [
            (0, 263688.0, 923316.4),
            (1, 923316.4, 1911206.0),
            (2, 1911206.0, 3344747.900000001),
            (3, 3344747.900000001, 4972842.2),
            (4, 4972842.2, 6422392.0),
            (5, 6422392.0, 7669201.8),
            (6, 7669201.8, 8975374.4),
            (7, 8975374.4, 9762011.8),
            (8, 9762011.8, 10563257.9),
            (9, 10563257.9, 12361861.0)
        ],
        'melbourne': [
            (0, 35418.79999999999, 344195.01666666666),
            (1, 344195.01666666666, 588418.9400000001),
            (2, 588418.9400000001, 1072442.9133333336),
            (3, 1072442.9133333336, 1817113.62),
            (4, 1817113.62, 2663038.05),
            (5, 2663038.05, 3292800.466666667),
            (6, 3292800.466666667, 3936777.729999998),
            (7, 3936777.729999998, 4662487.213333336),
            (8, 4662487.213333336, 5325993.843333334),
            (9, 5325993.843333334, 6885818.7)
        ]
    }
}

# Prefer the precomputed boundaries for the configured number of clusters and
# only fall back to deriving them from the training data when none exist.
city_volume_clusters = STATIC_VOLUME_CLUSTERS.get(NUM_VOLUME_CLUSTERS)
if city_volume_clusters is None:
    print('Computing volume clusters:')
    city_volume_clusters = {
        city_name: find_volume_clusters(city_name)
        for city_name in ('london', 'madrid', 'melbourne')
    }
print(city_volume_clusters)

{'london': [(0, -46599.0, 932240.2999999999), (1, 932240.2999999999, 1554726.4), (2, 1554726.4, 2588921.9000000004), (3, 2588921.9000000004, 3921954.0), (4, 3921954.0, 4863117.5), (5, 4863117.5, 5771968.0), (6, 5771968.0, 6005322.7), (7, 6005322.7, 6232288.4), (8, 6232288.4, 6493853.5), (9, 6493853.5, 7806420.0)], 'madrid': [(0, 263688.0, 923316.4), (1, 923316.4, 1911206.0), (2, 1911206.0, 3344747.900000001), (3, 3344747.900000001, 4972842.2), (4, 4972842.2, 6422392.0), (5, 6422392.0, 7669201.8), (6, 7669201.8, 8975374.4), (7, 8975374.4, 9762011.8), (8, 9762011.8, 10563257.9), (9, 10563257.9, 12361861.0)], 'melbourne': [(0, 35418.79999999999, 344195.01666666666), (1, 344195.01666666666, 588418.9400000001), (2, 588418.9400000001, 1072442.9133333336), (3, 1072442.9133333336, 1817113.62), (4, 1817113.62, 2663038.05), (5, 2663038.05, 3292800.466666667), (6, 3292800.466666667, 3936777.729999998), (7, 3936777.729999998, 4662487.213333336), (8, 4662487.213333336, 5325993.843333334), (9, 53259

In [8]:
def create_prediction(city):
    """Create and store the volume-cluster baseline submission for a city.

    The median ETA per ``(identifier, cluster)`` is either read from a snapshot
    file (``USE_ETA_BASELINE_SNAPSHOTS``) or computed from the training data
    and written to that snapshot file. Each test frame is then assigned a
    volume cluster and joined with the median ETAs of that cluster. The result
    is written to
    ``BASEDIR/submissions/<EXPERIMENT_NAME>/<city>/labels/eta_labels_test.parquet``.

    Args:
        city: City name; must be a key of ``city_volume_clusters``.

    Returns:
        A data frame with the columns ``identifier``, ``eta`` and ``test_idx``.
    """
    snapshot_file = BASEDIR / 'snapshots' / f'eta_volume_cluster_baseline_{EXPERIMENT_NAME}_{city}.parquet'
    if USE_ETA_BASELINE_SNAPSHOTS:
        median_etas_df = pandas.read_parquet(snapshot_file)
    else:
        train_inputs_df = load_train_input(city)
        print(f'Inputs: {len(train_inputs_df)}')
        cluster_dates_df = get_cluster_ids(city_volume_clusters[city], train_inputs_df, ['day', 't'])
        print(f'Inputs grouped: {len(cluster_dates_df)}')
        train_labels_df = load_train_labels(city)
        print(f'Labels: {len(train_labels_df)}')
        # Inner join: labels without a matching input frame are dropped.
        train_labels_df = train_labels_df.merge(cluster_dates_df, on=['day', 't'])
        print(f'Labels merged: {len(train_labels_df)}')
        print(f'Unique supersegments: {len(train_labels_df["identifier"].unique())}')
        train_labels_df = train_labels_df[['identifier', 'cluster', 'eta']]
        # Select the 'eta' column explicitly. The first positional argument of
        # GroupBy.median is `numeric_only`, not a column name, so the former
        # `median('eta')` only worked because the string is truthy.
        median_etas_df = train_labels_df.groupby(['identifier', 'cluster'])['eta'].median()
        median_etas_df = median_etas_df.reset_index()
        print(f'Median ETAs: {len(median_etas_df)}')
        snapshot_file.parent.mkdir(exist_ok=True, parents=True)
        median_etas_df.to_parquet(snapshot_file, compression='snappy')

    tests_df = get_cluster_ids(city_volume_clusters[city], load_tests(city), ['test_idx'])
    print(f'Test raw: {len(tests_df)}')
    # Left join keeps every test frame, also those whose cluster has no ETAs.
    tests_df = tests_df.merge(median_etas_df, on=['cluster'], how='left')
    print(f'Test ETAs: {len(tests_df)}')
    # Make missing lookups visible instead of silently writing NaN rows
    # (e.g. cluster -1 for volumes outside all cluster intervals).
    missing_eta_rows = int(tests_df['eta'].isna().sum())
    if missing_eta_rows:
        print(f'WARNING: {missing_eta_rows} test rows have no ETA after the cluster lookup')
    submission_folder = BASEDIR / 'submissions' / EXPERIMENT_NAME / city / 'labels'
    submission_folder.mkdir(exist_ok=True, parents=True)
    tests_df.to_parquet(submission_folder / 'eta_labels_test.parquet', compression='snappy')
    return tests_df[['identifier', 'eta', 'test_idx']]

In [9]:
# Build and store the London submission; the returned frame is shown as the cell output.
create_prediction('london')

Read 110 training input files for london
Inputs: 36503376
Inputs grouped: 10544
Read 110 training label files
Labels loaded: 42366720
Labels: 42366720
Labels merged: 42302528
Unique supersegments: 4012
Median ETAs: 40120
Test raw: 100
Test ETAs: 401200


Unnamed: 0,identifier,eta,test_idx
0,102046107792,951.533407,0
1,1020461231977903,439.876163,0
2,1020461504500003,216.813861,0
3,1020461635822092,203.559885,0
4,1020461691133703,353.472554,0
...,...,...,...
401195,993016444773877783,473.290888,99
401196,993016446022727128,337.494930,99
401197,993016446225598610,551.574208,99
401198,993016446225784831,489.879028,99


In [10]:
# Build and store the Madrid submission; the returned frame is shown as the cell output.
create_prediction('madrid')

Read 109 training input files for madrid
Inputs: 38337732
Inputs grouped: 10448
Read 109 training label files
Labels loaded: 41531616
Labels: 41531616
Labels merged: 41468112
Unique supersegments: 3969
Median ETAs: 39690
Test raw: 100
Test ETAs: 396900


Unnamed: 0,identifier,eta,test_idx
0,10089024125531147,390.770165,0
1,10089024125552352,333.217795,0
2,10089024125934553,371.828689,0
3,10089024126066556,548.296091,0
4,10089024126412872,576.546697,0
...,...,...,...
396895,9898916525906850,463.547558,99
396896,9898916526412824,555.803452,99
396897,9898916527509101,475.162506,99
396898,9898916529803013,131.186546,99


In [11]:
# Build and store the Melbourne submission; the returned frame is shown as the cell output.
create_prediction('melbourne')

Read 106 training input files for melbourne
Inputs: 39664688
Inputs grouped: 10160
Read 108 training label files
Labels loaded: 33654528
Labels: 33654528
Labels merged: 32979360
Unique supersegments: 3246
Median ETAs: 32460
Test raw: 100
Test ETAs: 324600


Unnamed: 0,identifier,eta,test_idx
0,1023561961661825888,95.428332,0
1,10235619627170602,138.613647,0
2,10235619629708589,116.605973,0
3,10235619629708590,158.550847,0
4,102356196342826772,101.447276,0
...,...,...,...
324595,96444759631694288769,166.700150,99
324596,96444759631694312194,192.888583,99
324597,9644475963292434014,71.329040,99
324598,964447596330266272,89.451022,99


In [12]:
# List the contents of the labels folder of every city below the experiment's submission folder.
! ls -l {BASEDIR}/submissions/{EXPERIMENT_NAME}/**/labels/

/tmp/t4c_2022_comp_data/submissions/exp_c10/london/labels/:
total 6568
-rw-r--r--  1 neun  staff  3151921 Sep  1 10:33 eta_labels_test.parquet

/tmp/t4c_2022_comp_data/submissions/exp_c10/madrid/labels/:
total 7208
-rw-r--r--  1 neun  staff  3463285 Sep  1 10:34 eta_labels_test.parquet

/tmp/t4c_2022_comp_data/submissions/exp_c10/melbourne/labels/:
total 7088
-rw-r--r--  1 neun  staff  2586338 Sep  1 10:35 eta_labels_test.parquet


In [13]:
%%bash -s {BASEDIR} {EXPERIMENT_NAME}

# Package the per-city label folders into the submission ZIP file.
# $1 is the data base directory and $2 the experiment name (passed via `%%bash -s`).
# Paths are quoted so directories containing spaces work, and the script aborts
# if the submission folder is missing so zip never runs in the wrong directory.
cd "$1/submissions/$2" || exit 1
echo "Preparing submission ZIP file for $PWD"

zip -r "eta_volume_cluster_baseline_$2.zip" london madrid melbourne

ls -l

Preparing submission ZIP file for /tmp/t4c_2022_comp_data/submissions/exp_c10
updating: london/ (stored 0%)
updating: london/labels/ (stored 0%)
updating: london/labels/eta_labels_test.parquet (deflated 65%)
updating: madrid/ (stored 0%)
updating: madrid/labels/ (stored 0%)
updating: madrid/labels/eta_labels_test.parquet (deflated 67%)
updating: melbourne/ (stored 0%)
updating: melbourne/labels/ (stored 0%)
updating: melbourne/labels/eta_labels_test.parquet (deflated 62%)
total 6288
-rw-r--r--  1 neun  staff  3215855 Sep  1 10:35 eta_volume_cluster_baseline_exp_c10.zip
drwxr-xr-x  3 neun  staff       96 Aug 31 18:31 london
drwxr-xr-x  3 neun  staff       96 Aug 31 18:32 madrid
drwxr-xr-x  3 neun  staff       96 Aug 31 18:33 melbourne
