# Preprocessing

## Imports

In [28]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import torch
import random
import cv2
from tqdm import tqdm
from matplotlib import pyplot as plt
import segmentation_models_pytorch as smp
import albumentations as album
from PIL import Image
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import random_split
from sklearn.metrics import accuracy_score
import plotly.express as px
import torchmetrics
from torchmetrics import MeanAbsolutePercentageError
from glob import glob

%matplotlib inline

## Global Variables

In [29]:
ROOT_DIR_PATH = os.path.abspath('..')

AADT_PATH = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/')

PROCESSED_PATH = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/processed/')

NN_MODEL_PATH = os.path.join(ROOT_DIR_PATH, "models/nn_aadt_model.pth")

NORMALISE = True

DROP_UNNORMALISE = False

REMOVE_METADATA = False

CHOSEN_COUNT_SITES = [('Luton', 'M1/2557A', 'M1/2557B'), ('Hounslow', 'M4/2188A', 'M4/2188B'), ('Enfield', 'M25/5441A', 'M25/5441B'), 
                      ('Blackburn with Darwen', '30361033', '30361032'), ('Havering', 'M25/5790A', 'M25/5790B')]

## General Functions

In [30]:
# Function used to normalsise the count data
def normalise(clean_report):
    interger_cols = ['0-520cm', '521-660cm', '661-1160cm', '1160+cm', 'total_volume']
    for name in interger_cols:
        new_name = f"{name}_normalised"
        # for ever row in the report present the row's site id's mean volume
        mean = clean_report.groupby('site_id')[name].transform("mean")
        # normalise
        clean_report.loc[:, new_name] = clean_report[name] / mean
        # filter so we don't have rows with a small mean which causes a pole
    return clean_report[mean>1]

In [31]:
# Function used to normalsise the count data
def normalise_v2(clean_report):
    interger_cols = ['0-520cm', '521-660cm', '661-1160cm', '1160+cm', 'total_volume']

    #site_name = clean_report.iloc[0]['site_name']

    dict_means = {'site_name': clean_report.name}
    for name in interger_cols:
        new_name = f"{name}_normalised"
        # for ever row in the report present the row's site id's mean volume
        mean = clean_report.groupby('site_id')[name].transform("mean")

        print("normalising {} {}, mean: {}".format(clean_report.name, name, mean))
        dict_means['name'] = mean
        # normalise
        clean_report.loc[:, new_name] = clean_report[name] / mean
        # filter so we don't have rows with a small mean which causes a pole

    file_path = PROCESSED_PATH+'mean_'+clean_report.name+'.csv'
    df_mean = pd.DataFrame(dict_means)

    print("saving mean to: {}".format(file_path))
    df_mean.to_csv(file_path)
    return clean_report[mean>1]

In [32]:
def drop_unnormalise(df):
    interger_cols = ['0-520cm', '521-660cm', '661-1160cm', '1160+cm', 'total_volume']
    df = df.drop(columns=interger_cols, axis=1)
    return df

In [33]:
def drop_metadata(df):
    metadata = ['site_id', 'time_period_ending', 'time_interval', 'daily_count', 'report_date', 'timestamp']
    df = df.drop(columns=metadata, axis=1)
    return df

In [34]:
def load_aadt_data():

    pattern = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/aadt_*.csv')

    aadt_file_paths = [os.path.join(AADT_PATH, os.path.basename(x)) for x in glob(pattern)]
    print("AADT files: {}".format(aadt_file_paths))

    df_aadt_list = []

    for i in range(len(aadt_file_paths)):
        df_aadt = pd.read_csv(aadt_file_paths[i])
        df_aadt = df_aadt.dropna()
        df_aadt = df_aadt.loc[:, ~df_aadt.columns.str.contains('^Unnamed')]

        df_aadt['site_name'] = df_aadt['site_name'].astype(str)

        if NORMALISE:
            df_aadt = normalise(df_aadt)

        if DROP_UNNORMALISE:
            df_aadt = drop_unnormalise(df_aadt)

        if REMOVE_METADATA:
            df_aadt = drop_metadata(df_aadt)


        df_aadt_list.append(df_aadt)

    return df_aadt_list

In [35]:
def load_motor_vehicle_data():

    pattern = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/all_motor_vehicles_*.csv')

    motor_vehicle_file_paths = [os.path.join(AADT_PATH, os.path.basename(x)) for x in glob(pattern)]
    print("LA motor vehicle files: {}".format(motor_vehicle_file_paths))

    df_motor_vehicle_list = []

    for i in range(len(motor_vehicle_file_paths)):
        df_motor_vehicle = pd.read_csv(motor_vehicle_file_paths[i])
        df_motor_vehicle = df_motor_vehicle.loc[:, ~df_motor_vehicle.columns.str.contains('^Unnamed')]

        df_motor_vehicle_list.append(df_motor_vehicle)

    return df_motor_vehicle_list

In [36]:
def merge_dfs_from_lists(df_aadt_list, df_motor_vehicle_list, la_and_count_sites):

    merged_aadt_df_list = []

    for aadt_df in df_aadt_list:
        
        for motor_vehicle_df in df_motor_vehicle_list:

            for site in la_and_count_sites:

                (la_name, site_a, site_b) = site

                if la_name == motor_vehicle_df.iloc[0]['Local Authority'] and ( (site_a == aadt_df.iloc[0]['site_name']) or (site_b == aadt_df.iloc[0]['site_name']) ):

                    print("Entered if statement: {} {} {}".format(la_name, site_a, site_b))

                    year = aadt_df.iloc[0]['year']

                    all_motor_vehicles = motor_vehicle_df.loc[motor_vehicle_df['year'] == year].iloc[0]['all_motor_vehicles']

                    print("year: {}, all_motor_vehicles: {}".format(year, all_motor_vehicles))

                    merged_aadt_df = aadt_df.copy()

                    merged_aadt_df.name = 'aadt_'+la_name+'_'+aadt_df.iloc[0]['site_name'].replace('/', '_')+'_'+str(year)

                    merged_aadt_df['all_motor_vehicles'] = all_motor_vehicles
                    merged_aadt_df['Local Authority'] = motor_vehicle_df['Local Authority']
                    merged_aadt_df['site_name'] = merged_aadt_df['site_name'].astype(str)

                    merged_aadt_df_list.append(merged_aadt_df)

    return merged_aadt_df_list

## Data

In [37]:
df_aadt_list = load_aadt_data()

df_aadt_list[0].head()

AADT files: ['/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_30361032_year_2017.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_30361033_year_2018.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_30361032_year_2018.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_30361033_year_2017.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M1_2557A_year_2017.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M4_2188B_year_2017.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M4_2188A_year_2018.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M1_2557B_year_2017.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M25_5790B_year_2017.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M1_2557A_year_2018.csv', '/home/ah2719/FY

Unnamed: 0,site_name,site_id,report_date,time_period_ending,time_interval,0-520cm,521-660cm,661-1160cm,1160+cm,avg_mph,...,month,day,hour,daily_count,aadt,0-520cm_normalised,521-660cm_normalised,661-1160cm_normalised,1160+cm_normalised,total_volume_normalised
0,30361032,6948,2017-01-01,00:14:00,0,45,3,0,0,61,...,1,1,0,13117,29898.296178,0.190743,0.263604,0.0,0.0,0.181487
1,30361032,6948,2017-01-01,00:29:00,1,73,2,1,0,63,...,1,1,0,13117,29898.296178,0.309427,0.175736,0.103396,0.0,0.287354
2,30361032,6948,2017-01-01,00:44:00,2,88,0,1,0,62,...,1,1,0,13117,29898.296178,0.373008,0.0,0.103396,0.0,0.336507
3,30361032,6948,2017-01-01,00:59:00,3,102,0,1,0,62,...,1,1,0,13117,29898.296178,0.43235,0.0,0.103396,0.0,0.38944
4,30361032,6948,2017-01-01,01:14:00,4,85,0,1,0,61,...,1,1,1,13117,29898.296178,0.360292,0.0,0.103396,0.0,0.325164


In [38]:
df_motor_vehicle_list = load_motor_vehicle_data()

df_motor_vehicle_list[0].head()

LA motor vehicle files: ['/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/all_motor_vehicles_Blackburn with Darwen.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/all_motor_vehicles_Enfield.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/all_motor_vehicles_Hounslow.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/all_motor_vehicles_Luton.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/all_motor_vehicles_Havering.csv']


Unnamed: 0,year,all_motor_vehicles,Local Authority
0,2005,30241.0,Blackburn with Darwen
1,2006,31125.5,Blackburn with Darwen
2,2007,32970.5,Blackburn with Darwen
3,2008,32455.5,Blackburn with Darwen
4,2009,30340.5,Blackburn with Darwen


## Standardisation

## Merge LA data

In [39]:
merged_aadt_df_list = merge_dfs_from_lists(df_aadt_list, df_motor_vehicle_list, CHOSEN_COUNT_SITES)

merged_aadt_df_list[0].head()

Entered if statement: Blackburn with Darwen 30361033 30361032
year: 2017, all_motor_vehicles: 30777.0
Entered if statement: Blackburn with Darwen 30361033 30361032
year: 2018, all_motor_vehicles: 35333.0
Entered if statement: Blackburn with Darwen 30361033 30361032
year: 2018, all_motor_vehicles: 35333.0
Entered if statement: Blackburn with Darwen 30361033 30361032
year: 2017, all_motor_vehicles: 30777.0
Entered if statement: Luton M1/2557A M1/2557B
year: 2017, all_motor_vehicles: 67952.0
Entered if statement: Hounslow M4/2188A M4/2188B
year: 2017, all_motor_vehicles: 51913.5
Entered if statement: Hounslow M4/2188A M4/2188B
year: 2018, all_motor_vehicles: 51047.5
Entered if statement: Luton M1/2557A M1/2557B
year: 2017, all_motor_vehicles: 67952.0
Entered if statement: Havering M25/5790A M25/5790B
year: 2017, all_motor_vehicles: 65754.0
Entered if statement: Luton M1/2557A M1/2557B
year: 2018, all_motor_vehicles: 70355.0
Entered if statement: Havering M25/5790A M25/5790B
year: 2018, al

Unnamed: 0,site_name,site_id,report_date,time_period_ending,time_interval,0-520cm,521-660cm,661-1160cm,1160+cm,avg_mph,...,hour,daily_count,aadt,0-520cm_normalised,521-660cm_normalised,661-1160cm_normalised,1160+cm_normalised,total_volume_normalised,all_motor_vehicles,Local Authority
0,30361032,6948,2017-01-01,00:14:00,0,45,3,0,0,61,...,0,13117,29898.296178,0.190743,0.263604,0.0,0.0,0.181487,30777.0,Blackburn with Darwen
1,30361032,6948,2017-01-01,00:29:00,1,73,2,1,0,63,...,0,13117,29898.296178,0.309427,0.175736,0.103396,0.0,0.287354,30777.0,Blackburn with Darwen
2,30361032,6948,2017-01-01,00:44:00,2,88,0,1,0,62,...,0,13117,29898.296178,0.373008,0.0,0.103396,0.0,0.336507,30777.0,Blackburn with Darwen
3,30361032,6948,2017-01-01,00:59:00,3,102,0,1,0,62,...,0,13117,29898.296178,0.43235,0.0,0.103396,0.0,0.38944,30777.0,Blackburn with Darwen
4,30361032,6948,2017-01-01,01:14:00,4,85,0,1,0,61,...,1,13117,29898.296178,0.360292,0.0,0.103396,0.0,0.325164,30777.0,Blackburn with Darwen


## Saving data

In [40]:
print("Number of sites with motor vehicles merged: {}".format(len(merged_aadt_df_list)))

for merged_df in merged_aadt_df_list:
    merged_df.to_csv(PROCESSED_PATH+merged_df.name+'.csv')

Number of sites with motor vehicles merged: 16
