# Preprocessing

## Imports

In [92]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import torch
import random
import cv2
from tqdm import tqdm
from matplotlib import pyplot as plt
import segmentation_models_pytorch as smp
import albumentations as album
from PIL import Image
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import random_split
from sklearn.metrics import accuracy_score
import plotly.express as px
import torchmetrics
from torchmetrics import MeanAbsolutePercentageError
from glob import glob

%matplotlib inline

## Global Variables

In [93]:
ROOT_DIR_PATH = os.path.abspath('..')

AADT_PATH = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/')

AADT_PROCESSED_PATH = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/aadt_processed.csv')

NN_MODEL_PATH = os.path.join(ROOT_DIR_PATH, "models/nn_aadt_model.pth")

## Data

In [94]:
pattern = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/aadt_site_*.csv')

aadt_file_paths = [os.path.join(AADT_PATH, os.path.basename(x)) for x in glob(pattern)]
print("AADT files: {}".format(aadt_file_paths))

df = pd.read_csv(aadt_file_paths[0], index_col=0)

for i in range(1, len(aadt_file_paths)):
    df_aadt = pd.read_csv(aadt_file_paths[i])
    df = pd.concat([df, df_aadt], ignore_index=True)

AADT files: ['/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_site_7810.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_site_4466.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_site_2.csv']


In [95]:
df.head()

Unnamed: 0.1,site_name,site_id,report_date,time_period_ending,time_interval,0-520cm,521-660cm,661-1160cm,1160+cm,avg_mph,...,1160+cm_normalised,total_volume_normalised,year,month,day,hour,daily_count,timestamp_min_max,aadt,Unnamed: 0
0,6783/1,7810,2019-01-01,00:14:00,0,1,0,0,0,45.0,...,0.0,0.015112,2019,1,1,0,3998,0.0,6335.063014,
1,6783/1,7810,2019-01-01,00:29:00,1,8,0,0,0,52.0,...,0.0,0.120898,2019,1,1,0,3998,0.010526,6335.063014,
2,6783/1,7810,2019-01-01,00:44:00,2,12,0,1,0,48.0,...,0.0,0.196459,2019,1,1,0,3998,0.021053,6335.063014,
3,6783/1,7810,2019-01-01,00:59:00,3,6,0,0,0,56.0,...,0.0,0.090673,2019,1,1,0,3998,0.031579,6335.063014,
4,6783/1,7810,2019-01-01,01:14:00,4,14,0,0,0,51.0,...,0.0,0.211571,2019,1,1,1,3998,0.042105,6335.063014,


## Corrupt values

In [96]:
df = df.dropna(axis=0)
df.isnull().sum()

site_name                  0
site_id                    0
report_date                0
time_period_ending         0
time_interval              0
0-520cm                    0
521-660cm                  0
661-1160cm                 0
1160+cm                    0
avg_mph                    0
total_volume               0
timestamp                  0
0-520cm_normalised         0
521-660cm_normalised       0
661-1160cm_normalised      0
1160+cm_normalised         0
total_volume_normalised    0
year                       0
month                      0
day                        0
hour                       0
daily_count                0
timestamp_min_max          0
aadt                       0
Unnamed: 0                 0
dtype: int64

## Saving data

In [97]:
df.to_csv(AADT_PROCESSED_PATH)