In [1]:
#  Copyright 2022 Institute of Advanced Research in Artificial Intelligence (IARAI) GmbH.
#  IARAI licenses this file to You under the Apache License, Version 2.0
#  (the "License"); you may not use this file except in compliance with
#  the License. You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

In [2]:
import os
import sys

In [3]:
# Put the parent directory at the front of sys.path so that imports below resolve against it.
# Alternatively, to make the module imports work properly, set PYTHONPATH=$PWD before launching
# the notebook server from the repo root folder.
sys.path.insert(0, os.path.abspath("../"))  # noqa:E402

![t4c20logo](../t4c20logo.png)

In [4]:
import logging
import t4c22
from pathlib import Path
from typing import Tuple
from IPython.core.display import HTML
from IPython.display import display

from t4c22.t4c22_config import load_basedir, df_filter_weekdays_daytime_only, load_cc_labels

In [5]:
%matplotlib inline
%load_ext autoreload
%load_ext time
%autoreload 2
%autosave 60

display(HTML("<style>.container { width:80% !important; }</style>"))

The time module is not an IPython extension.


Autosaving every 60 seconds


In [6]:
# Load BASEDIR from the config file (t4c22_config.json); change it to point to your data root.
BASEDIR = load_basedir(fn="t4c22_config.json", pkg=t4c22)

In [7]:
def run_cc_distribution(basedir: Path, city: str, split="train", filter=df_filter_weekdays_daytime_only) -> Tuple[dict, dict]:
    """Get absolute and relative number of congestion classes for the given
    city.

    Parameters
    ----------
    basedir
        Data root folder, forwarded to `load_cc_labels`.
    city
        City identifier, forwarded to `load_cc_labels`.
    split
        Data split to load, forwarded to `load_cc_labels` (default "train").
    filter
        Callable taking the label data frame and returning the filtered data
        frame. It is passed to `load_cc_labels` as `df_filter` and also
        applied to the loaded frame here.

    Returns
    -------
    Tuple[dict, dict]
        `(nums, ratios)`, both keyed by "green", "yellow" and "red", which
        correspond to the values 1, 2 and 3 of the "cc" column.
        `nums` holds the row count per class; `ratios` holds that count
        divided by the total number of rows after filtering. If no rows
        remain after filtering, every ratio is NaN (the share is undefined)
        instead of raising `ZeroDivisionError`.
    """
    df = load_cc_labels(basedir, split=split, city=city, df_filter=filter)
    logging.info(f"len(df) from file: {len(df)}")
    # NOTE(review): the filter was already handed to load_cc_labels above, so it
    # is presumably applied twice and the first log line may already reflect
    # filtering -- confirm against load_cc_labels before changing this.
    df = filter(df)
    total = len(df)
    logging.info(f"len(df) from file after filtering: {total}")

    # Count matches on the boolean mask instead of materialising a filtered
    # copy of the (potentially very large) frame for each class.
    cc = df["cc"]
    nums = {name: int((cc == code).sum()) for name, code in (("green", 1), ("yellow", 2), ("red", 3))}

    if total == 0:
        # No rows left: the ratios are undefined, report NaN rather than crash.
        logging.warning(f"no congestion class labels left after filtering for city={city}, split={split}")
        ratios = {name: float("nan") for name in nums}
    else:
        ratios = {name: count / total for name, count in nums.items()}
    return nums, ratios

## Get distribution of red, yellow, green from the data to get the class weights

In [8]:
# Congestion class counts and shares for London on the train split.
nums, ratios = run_cc_distribution(basedir=BASEDIR, city="london", split="train")
(nums, ratios)

({'green': 155232062, 'yellow': 101614181, 'red': 32339248},
 {'green': 0.5367906303432076,
  'yellow': 0.35138063340805714,
  'red': 0.11182873624873524})

In [9]:
# Congestion class counts and shares for Madrid on the train split.
nums, ratios = run_cc_distribution(basedir=BASEDIR, city="madrid", split="train")
(nums, ratios)

({'green': 182071624, 'yellow': 140118360, 'red': 43693329},
 {'green': 0.4976221039083026,
  'yellow': 0.3829591430424158,
  'red': 0.1194187530492816})

In [10]:
# Congestion class counts and shares for Melbourne on the train split.
nums, ratios = run_cc_distribution(basedir=BASEDIR, city="melbourne", split="train")
(nums, ratios)

({'green': 61268426, 'yellow': 19406770, 'red': 6615065},
 {'green': 0.7018930324884697,
  'yellow': 0.2223245729555099,
  'red': 0.0757823945560204})