# Preprocessing

## Imports

In [1]:
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.4


In [2]:
import pandas as pd
import numpy as np
import os
import torch
import random
import cv2
from tqdm import tqdm
from matplotlib import pyplot as plt
import albumentations as album
from PIL import Image
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import random_split
from sklearn.metrics import accuracy_score
import plotly.express as px
import torchmetrics
from torchmetrics import MeanAbsolutePercentageError
from glob import glob

%matplotlib inline

## Global Variables

In [3]:
COLAB = True

In [4]:
ROOT_DIR_PATH = os.path.abspath('..')

if COLAB:

  from google.colab import drive
  drive.mount('/content/drive')

  ROOT_DIR_PATH = os.path.abspath('drive/MyDrive/Spatial_Finance_Transport/minorRoads/')

AADT_PATH = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/aadt/')

PROCESSED_PATH = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/aadt/processed/')

NN_MODEL_PATH = os.path.join(ROOT_DIR_PATH, "models/nn_aadt_model.pth")

NORMALISE = True

DROP_UNNORMALISE = False

REMOVE_METADATA = False

CHOSEN_COUNT_SITES = [('Luton', 'M1/2557A', 'M1/2557B'), ('Hounslow', 'M4/2188A', 'M4/2188B'), ('Enfield', 'M25/5441A', 'M25/5441B'), 
                      ('Blackburn with Darwen', '30361033', '30361032'), ('Havering', 'M25/5790A', 'M25/5790B'), ('Trafford', 'M60/9083A', 'M60/9086B')]

Mounted at /content/drive


## General Functions

In [5]:
# Function used to normalsise the count data
def normalise(clean_report):
    interger_cols = ['0-520cm', '521-660cm', '661-1160cm', '1160+cm', 'total_volume']
    for name in interger_cols:
        new_name = f"{name}_normalised"
        # for ever row in the report present the row's site id's mean volume
        mean = clean_report.groupby('site_id')[name].transform("mean")
        # normalise
        clean_report.loc[:, new_name] = clean_report[name] / mean
        # filter so we don't have rows with a small mean which causes a pole
    return clean_report[mean>1]

In [6]:
# Function used to normalsise the count data
def normalise_v2(clean_report):

    integer_cols = ['0-520cm', '521-660cm', '661-1160cm', '1160+cm', 'total_volume']

    # Create an empty dataframe to save transformation values
    df_transform = pd.DataFrame(index=['min', 'max'], columns=integer_cols)

    site_name = clean_report.iloc[0]['site_name']

    for col_name in integer_cols:
        new_col_name = f"{col_name}_normalised"
        if df_transform is not None:
            # Save transformation values
            min_val = clean_report[col_name].min()
            max_val = clean_report[col_name].max()
            df_transform.loc['min', col_name] = min_val
            df_transform.loc['max', col_name] = max_val
        else:
            min_val = clean_report[col_name].min()
            max_val = clean_report[col_name].max()
        # Perform min-max normalization
        clean_report.loc[:, new_col_name] = (clean_report[col_name] - min_val) / (max_val - min_val)

    # save normalisation values
    print("saving normalization values to: {}".format(PROCESSED_PATH+'transform_'+site_name.replace('/', '_')+'.csv'))
    df_transform.to_csv(PROCESSED_PATH+'transform_'+site_name.replace('/', '_')+'.csv')
    return clean_report

In [7]:
def drop_unnormalise(df):
    interger_cols = ['0-520cm', '521-660cm', '661-1160cm', '1160+cm', 'total_volume']
    df = df.drop(columns=interger_cols, axis=1)
    return df

In [8]:
def drop_metadata(df):
    metadata = ['site_id', 'time_period_ending', 'time_interval', 'daily_count', 'report_date', 'timestamp']
    df = df.drop(columns=metadata, axis=1)
    return df

In [9]:
def load_aadt_data():

    pattern = os.path.join(AADT_PATH, 'aadt_*.csv')

    aadt_file_paths = [os.path.join(AADT_PATH, os.path.basename(x)) for x in glob(pattern)]
    print("AADT files: {}".format(aadt_file_paths))

    df_aadt_list = []

    for i in range(len(aadt_file_paths)):
        df_aadt = pd.read_csv(aadt_file_paths[i])
        df_aadt = df_aadt.dropna()
        df_aadt = df_aadt.loc[:, ~df_aadt.columns.str.contains('^Unnamed')]

        df_aadt['site_name'] = df_aadt['site_name'].astype(str)

        if NORMALISE:
            df_aadt = normalise_v2(df_aadt)

        if DROP_UNNORMALISE:
            df_aadt = drop_unnormalise(df_aadt)

        if REMOVE_METADATA:
            df_aadt = drop_metadata(df_aadt)


        df_aadt_list.append(df_aadt)

    return df_aadt_list

In [10]:
def load_motor_vehicle_data():

    pattern = os.path.join(AADT_PATH, 'all_motor_vehicles_*.csv')

    motor_vehicle_file_paths = [os.path.join(AADT_PATH, os.path.basename(x)) for x in glob(pattern)]
    print("LA motor vehicle files: {}".format(motor_vehicle_file_paths))

    df_motor_vehicle_list = []

    for i in range(len(motor_vehicle_file_paths)):
        df_motor_vehicle = pd.read_csv(motor_vehicle_file_paths[i])
        df_motor_vehicle = df_motor_vehicle.loc[:, ~df_motor_vehicle.columns.str.contains('^Unnamed')]

        df_motor_vehicle_list.append(df_motor_vehicle)

    return df_motor_vehicle_list

In [11]:
def merge_dfs_from_lists(df_aadt_list, df_motor_vehicle_list, la_and_count_sites):

    merged_aadt_df_list = []

    for aadt_df in df_aadt_list:
        
        for motor_vehicle_df in df_motor_vehicle_list:

            for site in la_and_count_sites:

                (la_name, site_a, site_b) = site

                if la_name == motor_vehicle_df.iloc[0]['Local Authority'] and ( (site_a == aadt_df.iloc[0]['site_name']) or (site_b == aadt_df.iloc[0]['site_name']) ):

                    print("Entered if statement: {} {} {}".format(la_name, site_a, site_b))

                    year = aadt_df.iloc[0]['year']

                    all_motor_vehicles = motor_vehicle_df.loc[motor_vehicle_df['year'] == year].iloc[0]['all_motor_vehicles']
                    cars_and_taxis = motor_vehicle_df.loc[motor_vehicle_df['year'] == year].iloc[0]['cars_and_taxis']
                    buses_and_coaches = motor_vehicle_df.loc[motor_vehicle_df['year'] == year].iloc[0]['buses_and_coaches']
                    lgvs = motor_vehicle_df.loc[motor_vehicle_df['year'] == year].iloc[0]['lgvs']
                    all_hgvs = motor_vehicle_df.loc[motor_vehicle_df['year'] == year].iloc[0]['all_hgvs']

                    print("year: {}, all_motor_vehicles: {}".format(year, all_motor_vehicles))

                    merged_aadt_df = aadt_df.copy()

                    merged_aadt_df.name = 'aadt_'+la_name+'_'+aadt_df.iloc[0]['site_name'].replace('/', '_')+'_'+str(year)

                    merged_aadt_df['cars_and_taxis'] = cars_and_taxis
                    merged_aadt_df['buses_and_coaches'] = buses_and_coaches
                    merged_aadt_df['lgvs'] = lgvs
                    merged_aadt_df['all_hgvs'] = all_hgvs
                    merged_aadt_df['all_motor_vehicles'] = all_motor_vehicles

                    merged_aadt_df['Local Authority'] = motor_vehicle_df['Local Authority']
                    merged_aadt_df['site_name'] = merged_aadt_df['site_name'].astype(str)

                    merged_aadt_df_list.append(merged_aadt_df)

    return merged_aadt_df_list

## Data

In [12]:
df_aadt_list = load_aadt_data()

df_aadt_list[0].head()

AADT files: ['/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/aadt_30361032_year_2017.csv', '/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/aadt_30361032_year_2018.csv', '/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/aadt_30361033_year_2017.csv', '/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/aadt_30361033_year_2018.csv', '/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/aadt_M25_5790A_year_2017.csv', '/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/aadt_M25_5790A_year_2018.csv', '/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/aadt_M25_5790B_year_2017.csv', '/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/aadt_M25_5790B_year_2018.csv', '/content/drive/MyDrive/Spatial

Unnamed: 0,site_name,site_id,report_date,time_period_ending,time_interval,0-520cm,521-660cm,661-1160cm,1160+cm,avg_mph,...,month,day,hour,daily_count,aadt,0-520cm_normalised,521-660cm_normalised,661-1160cm_normalised,1160+cm_normalised,total_volume_normalised
0,30361032,6948,2017-01-01,00:14:00,0,45,3,0,0,61.0,...,1,1,0,13117,29898.296178,0.039929,0.031579,0.0,0.0,0.03754
1,30361032,6948,2017-01-01,00:29:00,1,73,2,1,0,63.0,...,1,1,0,13117,29898.296178,0.064774,0.021053,0.011765,0.0,0.059904
2,30361032,6948,2017-01-01,00:44:00,2,88,0,1,0,62.0,...,1,1,0,13117,29898.296178,0.078083,0.0,0.011765,0.0,0.070288
3,30361032,6948,2017-01-01,00:59:00,3,102,0,1,0,62.0,...,1,1,0,13117,29898.296178,0.090506,0.0,0.011765,0.0,0.08147
4,30361032,6948,2017-01-01,01:14:00,4,85,0,1,0,61.0,...,1,1,1,13117,29898.296178,0.075421,0.0,0.011765,0.0,0.067891


In [13]:
df_motor_vehicle_list = load_motor_vehicle_data()

df_motor_vehicle_list[0].head()

LA motor vehicle files: ['/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/all_motor_vehicles_Luton.csv', '/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/all_motor_vehicles_Hounslow.csv', '/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/all_motor_vehicles_Enfield.csv', '/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/all_motor_vehicles_Trafford.csv', '/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/all_motor_vehicles_Havering.csv', '/content/drive/MyDrive/Spatial_Finance_Transport/minorRoads/data/ground_truth_data/aadt/all_motor_vehicles_Blackburn with Darwen.csv']


Unnamed: 0,year,cars_and_taxis,buses_and_coaches,lgvs,all_hgvs,all_motor_vehicles,Local Authority
0,2005,4117.0,58.0,376.0,49.0,4602.0,Luton
1,2006,4079.0,57.0,379.0,56.0,4518.0,Luton
2,2007,2940.0,71.0,383.0,63.0,3499.0,Luton
3,2008,847.0,7.0,85.0,22.0,988.0,Luton
4,2009,2414.0,36.0,245.5,25.5,2836.0,Luton


## Standardisation

## Merge LA data

In [14]:
merged_aadt_df_list = merge_dfs_from_lists(df_aadt_list, df_motor_vehicle_list, CHOSEN_COUNT_SITES)

merged_aadt_df_list[0].head()

Entered if statement: Blackburn with Darwen 30361033 30361032
year: 2017, all_motor_vehicles: 509.0
Entered if statement: Blackburn with Darwen 30361033 30361032
year: 2018, all_motor_vehicles: 723.5
Entered if statement: Blackburn with Darwen 30361033 30361032
year: 2017, all_motor_vehicles: 509.0
Entered if statement: Blackburn with Darwen 30361033 30361032
year: 2018, all_motor_vehicles: 723.5
Entered if statement: Havering M25/5790A M25/5790B
year: 2017, all_motor_vehicles: 1515.0
Entered if statement: Havering M25/5790A M25/5790B
year: 2018, all_motor_vehicles: 3519.5
Entered if statement: Havering M25/5790A M25/5790B
year: 2017, all_motor_vehicles: 1515.0
Entered if statement: Havering M25/5790A M25/5790B
year: 2018, all_motor_vehicles: 3519.5
Entered if statement: Luton M1/2557A M1/2557B
year: 2017, all_motor_vehicles: 331.0
Entered if statement: Luton M1/2557A M1/2557B
year: 2018, all_motor_vehicles: 437.0
Entered if statement: Luton M1/2557A M1/2557B
year: 2017, all_motor_vehi

Unnamed: 0,site_name,site_id,report_date,time_period_ending,time_interval,0-520cm,521-660cm,661-1160cm,1160+cm,avg_mph,...,521-660cm_normalised,661-1160cm_normalised,1160+cm_normalised,total_volume_normalised,cars_and_taxis,buses_and_coaches,lgvs,all_hgvs,all_motor_vehicles,Local Authority
0,30361032,6948,2017-01-01,00:14:00,0,45,3,0,0,61.0,...,0.031579,0.0,0.0,0.03754,460.0,0.0,45.0,2.0,509.0,Blackburn with Darwen
1,30361032,6948,2017-01-01,00:29:00,1,73,2,1,0,63.0,...,0.021053,0.011765,0.0,0.059904,460.0,0.0,45.0,2.0,509.0,Blackburn with Darwen
2,30361032,6948,2017-01-01,00:44:00,2,88,0,1,0,62.0,...,0.0,0.011765,0.0,0.070288,460.0,0.0,45.0,2.0,509.0,Blackburn with Darwen
3,30361032,6948,2017-01-01,00:59:00,3,102,0,1,0,62.0,...,0.0,0.011765,0.0,0.08147,460.0,0.0,45.0,2.0,509.0,Blackburn with Darwen
4,30361032,6948,2017-01-01,01:14:00,4,85,0,1,0,61.0,...,0.0,0.011765,0.0,0.067891,460.0,0.0,45.0,2.0,509.0,Blackburn with Darwen


## Saving data

In [15]:
print("Number of sites with motor vehicles merged: {}".format(len(merged_aadt_df_list)))

for merged_df in merged_aadt_df_list:
    print("merged df length: {}".format(len(merged_df)))
    merged_df.to_csv(PROCESSED_PATH+merged_df.name+'.csv')

Number of sites with motor vehicles merged: 20
merged df length: 16034
merged df length: 16625
merged df length: 16516
merged df length: 16872
merged df length: 16947
merged df length: 16832
merged df length: 17041
merged df length: 16816
merged df length: 16594
merged df length: 17192
merged df length: 16602
merged df length: 17033
merged df length: 16368
merged df length: 17226
merged df length: 17035
merged df length: 16839
merged df length: 16687
merged df length: 16819
merged df length: 16765
merged df length: 17050
