# Preprocessing

## Imports

In [188]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import torch
import random
import cv2
from tqdm import tqdm
from matplotlib import pyplot as plt
import segmentation_models_pytorch as smp
import albumentations as album
from PIL import Image
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import random_split
from sklearn.metrics import accuracy_score
import plotly.express as px
import torchmetrics
from torchmetrics import MeanAbsolutePercentageError
from glob import glob

%matplotlib inline

## Global Variables

In [189]:
ROOT_DIR_PATH = os.path.abspath('..')

AADT_PATH = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/')

PROCESSED_PATH = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/processed/')

NN_MODEL_PATH = os.path.join(ROOT_DIR_PATH, "models/nn_aadt_model.pth")

NORMALISE = True

DROP_UNNORMALISE = False

REMOVE_METADATA = False

CHOSEN_COUNT_SITES = [('Luton', 'M1/2557A', 'M1/2557B'), ('Hounslow', 'M4/2188A', 'M4/2188B'), ('Enfield', 'M25/5441A', 'M25/5441B'), ('Blackburn with Darwen', '30361033', '30361032'), ('Havering', 'M25/5790A', 'M25/5790B')]

## General Functions

In [190]:
# Function used to normalsise the count data
def normalise(clean_report):
    interger_cols = ['0-520cm', '521-660cm', '661-1160cm', '1160+cm', 'total_volume']
    for name in interger_cols:
        new_name = f"{name}_normalised"
        # for ever row in the report present the row's site id's mean volume
        mean = clean_report.groupby('site_id')[name].transform("mean")
        # normalise
        clean_report.loc[:, new_name] = clean_report[name] / mean
        # filter so we don't have rows with a small mean which causes a pole
    return clean_report[mean>1]

In [191]:
def drop_unnormalise(df):
    interger_cols = ['0-520cm', '521-660cm', '661-1160cm', '1160+cm', 'total_volume']
    df = df.drop(columns=interger_cols, axis=1)
    return df

In [192]:
def drop_metadata(df):
    metadata = ['site_id', 'time_period_ending', 'time_interval', 'daily_count', 'report_date', 'site_name', 'timestamp']
    df = df.drop(columns=metadata, axis=1)
    return df

In [193]:
def load_aadt_data():

    pattern = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/aadt_*.csv')

    aadt_file_paths = [os.path.join(AADT_PATH, os.path.basename(x)) for x in glob(pattern)]
    print("AADT files: {}".format(aadt_file_paths))

    df_aadt_list = []

    for i in range(1, len(aadt_file_paths)):
        df_aadt = pd.read_csv(aadt_file_paths[i])
        df_aadt = df_aadt.dropna()
        df_aadt = df_aadt.loc[:, ~df_aadt.columns.str.contains('^Unnamed')]

        if NORMALISE:
            df_aadt = normalise(df_aadt)

        if DROP_UNNORMALISE:
            df_aadt = drop_unnormalise(df_aadt)

        if REMOVE_METADATA:
            df_aadt = drop_metadata(df_aadt)


        df_aadt_list.append(df_aadt)

    return df_aadt_list

In [194]:
def load_motor_vehicle_data():

    pattern = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/all_motor_vehicles_*.csv')

    motor_vehicle_file_paths = [os.path.join(AADT_PATH, os.path.basename(x)) for x in glob(pattern)]
    print("LA motor vehicle files: {}".format(motor_vehicle_file_paths))

    df_motor_vehicle_list = []

    for i in range(1, len(motor_vehicle_file_paths)):
        df_motor_vehicle = pd.read_csv(motor_vehicle_file_paths[i])
        df_motor_vehicle = df_motor_vehicle.loc[:, ~df_motor_vehicle.columns.str.contains('^Unnamed')]

        df_motor_vehicle_list.append(df_motor_vehicle)

    return df_motor_vehicle_list

In [195]:
def merge_dfs_from_lists(df_aadt_list, df_motor_vehicle_list, la_and_count_sites):

    merged_aadt_df_list = []

    for aadt_df in df_aadt_list:
        
        for motor_vehicle_df in df_motor_vehicle_list:

            for site in la_and_count_sites:

                (la_name, site_a, site_b) = site

                if la_name in motor_vehicle_df.iloc[0]['Local Authority'] and ( (site_a in aadt_df.iloc[0]['site_name']) or (site_b in aadt_df.iloc[0]['site_name']) ):

                    print("Entered if statement: {} {} {}".format(la_name, site_a, site_b))

                    year = aadt_df.iloc[0]['year']

                    all_motor_vehicles = motor_vehicle_df.loc[motor_vehicle_df['year'] == year].iloc[0]['all_motor_vehicles']

                    print("year: {}, all_motor_vehicles: {}".format(year, all_motor_vehicles))

                    merged_aadt_df = aadt_df.copy()

                    merged_aadt_df.name = 'aadt_'+la_name+'_'+aadt_df.iloc[0]['site_name'].replace('/', '_')+'_'+str(year)

                    merged_aadt_df['all_motor_vehicles'] = all_motor_vehicles
                    merged_aadt_df['Local Authority'] = motor_vehicle_df['Local Authority']

                    merged_aadt_df_list.append(merged_aadt_df)

    return merged_aadt_df_list

## Data

In [196]:
df_aadt_list = load_aadt_data()

df_aadt_list[0].head()

df_aadt_list[0].dtypes

AADT files: ['/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M1_2557A_year_2017.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M4_2188B_year_2017.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M4_2188A_year_2018.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M1_2557B_year_2017.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M1_2557A_year_2018.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M4_2188A_year_2017.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M4_2188B_year_2018.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M1_2557B_year_2018.csv']


site_name                   object
site_id                      int64
report_date                 object
time_period_ending          object
time_interval                int64
0-520cm                      int64
521-660cm                    int64
661-1160cm                   int64
1160+cm                      int64
avg_mph                      int64
total_volume                 int64
timestamp                   object
year                         int64
month                        int64
day                          int64
hour                         int64
daily_count                  int64
aadt                       float64
0-520cm_normalised         float64
521-660cm_normalised       float64
661-1160cm_normalised      float64
1160+cm_normalised         float64
total_volume_normalised    float64
dtype: object

In [197]:
df_motor_vehicle_list = load_motor_vehicle_data()

df_motor_vehicle_list[0].head()

df_motor_vehicle_list[0].dtypes

LA motor vehicle files: ['/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/all_motor_vehicles_Blackburn with Darwen.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/all_motor_vehicles_Enfield.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/all_motor_vehicles_Hounslow.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/all_motor_vehicles_Luton.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/all_motor_vehicles_Havering.csv']


year                    int64
all_motor_vehicles    float64
Local Authority        object
dtype: object

## Standardisation

## Merge LA data

In [198]:
merged_aadt_df_list = merge_dfs_from_lists(df_aadt_list, df_motor_vehicle_list, CHOSEN_COUNT_SITES)

merged_aadt_df_list[0].head()

Entered if statement: Hounslow M4/2188A M4/2188B
year: 2017, all_motor_vehicles: 51913.5
Entered if statement: Hounslow M4/2188A M4/2188B
year: 2018, all_motor_vehicles: 51047.5
Entered if statement: Luton M1/2557A M1/2557B
year: 2017, all_motor_vehicles: 67952.0
Entered if statement: Luton M1/2557A M1/2557B
year: 2018, all_motor_vehicles: 70355.0
Entered if statement: Hounslow M4/2188A M4/2188B
year: 2017, all_motor_vehicles: 51913.5
Entered if statement: Hounslow M4/2188A M4/2188B
year: 2018, all_motor_vehicles: 51047.5
Entered if statement: Luton M1/2557A M1/2557B
year: 2018, all_motor_vehicles: 70355.0


Unnamed: 0,site_name,site_id,report_date,time_period_ending,time_interval,0-520cm,521-660cm,661-1160cm,1160+cm,avg_mph,...,hour,daily_count,aadt,0-520cm_normalised,521-660cm_normalised,661-1160cm_normalised,1160+cm_normalised,total_volume_normalised,all_motor_vehicles,Local Authority
0,M4/2188B,110,2017-01-01,00:14:00,0,100,13,1,1,58,...,0,38056,46639.557522,0.247201,0.223741,0.072282,0.13476,0.23766,51913.5,Hounslow
1,M4/2188B,110,2017-01-01,00:29:00,1,99,7,0,2,61,...,0,38056,46639.557522,0.244729,0.120476,0.0,0.269521,0.223193,51913.5,Hounslow
2,M4/2188B,110,2017-01-01,00:44:00,2,91,12,1,2,61,...,0,38056,46639.557522,0.224953,0.20653,0.072282,0.269521,0.21906,51913.5,Hounslow
3,M4/2188B,110,2017-01-01,00:59:00,3,101,10,0,0,60,...,0,38056,46639.557522,0.249673,0.172109,0.0,0.0,0.229393,51913.5,Hounslow
4,M4/2188B,110,2017-01-01,01:14:00,4,128,12,1,1,58,...,1,38056,46639.557522,0.316418,0.20653,0.072282,0.13476,0.293458,51913.5,Hounslow


## Saving data

In [199]:
for merged_df in merged_aadt_df_list:
    merged_df.to_csv(PROCESSED_PATH+merged_df.name+'.csv')