# Data cleaning - new data obtained at 20190728

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
# %qtconsole

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display
def display_all(df):
    with pd.option_context("display.max_rows", 1000): 
        with pd.option_context("display.max_columns", 1000): 
            display(df)
			
import pandas as pd
import numpy as np
from pathlib import Path		

import warnings
warnings.filterwarnings('ignore')
# warnings.filterwarnings(action='once')

In [3]:
# ref: https://gis.stackexchange.com/questions/225065/converting-nztm-new-zealand-transverse-mercator-to-lat-long
import math
def nztm_to_lat_long(nztm_e, nztm_n):
    """ converting nztm to latitude and longitude
        #input Northing(Y); Easting(X) variables
    """
    #Common variables for NZTM2000
    a = 6378137;
    f = 1 / 298.257222101;
    phizero = 0;
    lambdazero = 173;
    Nzero = 10000000;
    Ezero = 1600000;
    kzero = 0.9996;   

    #input Northing(Y); Easting(X) variables
    N  = int(nztm_n);
    E  = int(nztm_e);

    #Calculation: From NZTM to lat/Long
    b = a * (1 - f);
    esq = 2 * f - f ** 2;
    Z0 = 1 - esq / 4 - 3 * (esq ** 2) / 64 - 5 * (esq ** 3) / 256;
    A2 = 0.375 * (esq + esq ** 2 / 4 + 15 * (esq ** 3) / 128);
    A4 = 15 * ((esq ** 2) + 3 * (esq ** 3) / 4) / 256;
    A6 = 35 * (esq ** 3) / 3072;

    Nprime = N - Nzero;
    mprime = Nprime / kzero;
    smn = (a - b) / (a + b);
    G = a * (1 - smn) * (1 - (smn ** 2)) * (1 + 9 * (smn ** 2) / 4 + 225 * (smn ** 4) / 64) * math.pi/ 180.0;
    sigma = mprime * math.pi / (180 * G);
    phiprime = sigma + (3 * smn / 2 - 27 * (smn ** 3) / 32) * math.sin(2 * sigma) + (21 * (smn ** 2) / 16 - 55 * (smn ** 4) / 32) * math.sin(4 * sigma) + (151 * (smn ** 3) / 96) * math.sin(6 * sigma) + (1097 * (smn ** 4) / 512) *math.sin(8 * sigma);
    rhoprime = a * (1 - esq) / ((1 - esq * ((math.sin(phiprime)) ** 2)) ** 1.5);
    upsilonprime = a / math.sqrt(1 - esq * ((math.sin(phiprime)) ** 2));

    psiprime = upsilonprime / rhoprime;
    tprime = math.tan(phiprime);
    Eprime = E - Ezero;
    chi = Eprime / (kzero * upsilonprime);
    term_1 = tprime * Eprime * chi / (kzero * rhoprime * 2);
    term_2 = term_1 * (chi ** 2) / 12 * (-4 * (psiprime ** 2) + 9 * psiprime * (1 - (tprime ** 2)) + 12 * (tprime ** 2));
    term_3 = tprime * Eprime * (chi ** 5) / (kzero * rhoprime * 720) * (8 * (psiprime ** 4) * (11 - 24 * (tprime ** 2)) - 12 * (psiprime ** 3) * (21 - 71 * (tprime ** 2)) + 15 * (psiprime ** 2) * (15 - 98 * (tprime ** 2) + 15 * (tprime ** 4)) + 180 * psiprime * (5 * (tprime ** 2) - 3 * (tprime ** 4)) + 360 * (tprime ** 4));
    term_4 = tprime * Eprime * (chi ** 7) / (kzero * rhoprime * 40320) * (1385 + 3633 * (tprime ** 2) + 4095 * (tprime ** 4) + 1575 * (tprime ** 6));
    term1 = chi * (1 / math.cos(phiprime));
    term2 = (chi ** 3) * (1 / math.cos(phiprime)) / 6 * (psiprime + 2 * (tprime ** 2));
    term3 = (chi ** 5) * (1 / math.cos(phiprime)) / 120 * (-4 * (psiprime ** 3) * (1 - 6 * (tprime ** 2)) + (psiprime ** 2) * (9 - 68 * (tprime ** 2)) + 72 * psiprime * (tprime ** 2) + 24 * (tprime ** 4));
    term4 = (chi ** 7) * (1 / math.cos(phiprime)) / 5040 * (61 + 662 * (tprime ** 2) + 1320 * (tprime ** 4) + 720 * (tprime ** 6));

    latitude = (phiprime - term_1 + term_2 - term_3 + term_4) * 180 / math.pi;
    longitude = lambdazero + 180 / math.pi * (term1 - term2 + term3 - term4);  

#         outfile.write("{}, {}, {}\n".format(site_id, latitude, longitude))
    return latitude, longitude

In [36]:
df = pd.read_csv(r'data/Average_Daily_Traffic_Counts_20190728.csv')
display_all(df.head())

Unnamed: 0,X,Y,OBJECTID,carr_way_no,road_id,road_name,start_name,end_name,location,latest,count_date,peak_hour,count_duration,adt,peaktraffic,pccar,pclcv,pcmcv,pchcvi,pchcvii,pcbus,pcheavy,NZTMX,NZTMY
0,1753117.0,5915068.0,4001,14721.0,31125,RICHARDSON RD (OWAIRAKA),HENDON AVE,O'DONNELL AVE,1218.0,No,1986-06-25T00:00:00.000Z,16:15,,,,,,,,,,,1753117.0,5915068.0
1,1753117.0,5915068.0,4002,14721.0,31125,RICHARDSON RD (OWAIRAKA),HENDON AVE,O'DONNELL AVE,1218.0,No,1984-06-20T00:00:00.000Z,16:15,,,,,,,,,,,1753117.0,5915068.0
2,1753117.0,5915068.0,4003,14721.0,31125,RICHARDSON RD (OWAIRAKA),HENDON AVE,O'DONNELL AVE,1218.0,No,1984-06-15T00:00:00.000Z,16:15,,,,,,,,,,,1753117.0,5915068.0
3,1753117.0,5915068.0,4004,14721.0,31125,RICHARDSON RD (OWAIRAKA),HENDON AVE,O'DONNELL AVE,1218.0,No,1993-10-27T00:00:00.000Z,17:45,,,,,,,,,,,1753117.0,5915068.0
4,1753117.0,5915068.0,4005,14721.0,31125,RICHARDSON RD (OWAIRAKA),HENDON AVE,O'DONNELL AVE,1218.0,No,1993-10-27T00:00:00.000Z,17:45,,,,,,,,,,,1753117.0,5915068.0


In [45]:
np.equal(df.X, df.NZTMX).sum()
np.equal(df.Y, df.NZTMY).sum()
# some of the X and Y is not the same as NZTMX and NZTMY
# in this project we use NZTMX and NZTMY for simplicity, 

40878

52568

In [47]:
df.shape

(52967, 24)

In [37]:
df_road = pd.read_csv(r'data/Traffic_Management_Levels_20190728.csv')
df_road.head()

Unnamed: 0,OBJECTID,carr_way_no,road_id,road_name,start_name,end_name,cway_width,traff_manage_level,Shape__Length
0,2001,41976,41096,BABICH RD,WIDTH CHANGE (END K&C RHS),END,5.0,"< 10,000 vpd Rural/<15,000 vpd Urban",636.519615
1,2002,32589,41433,BABICH RD NORTH,METCALFE RD,PLATINUM RISE,10.0,<500 vpd (Low Volume),72.580121
2,2003,32605,41433,BABICH RD NORTH,PLATINUM RISE,SAUVIGNON AVE,10.0,<500 vpd (Low Volume),122.037043
3,2004,32606,41433,BABICH RD NORTH,SAUVIGNON AVE,END OF STAGE 1,10.0,<500 vpd (Low Volume),76.091004
4,2005,34909,41433,BABICH RD NORTH,END OF STAGE 1,END OF STAGE 2,10.0,<500 vpd (Low Volume),163.637715


In [38]:
df.columns
df_road.columns
[col for col in df.columns if col in df_road.columns]
# OBJECTID is the index, 
# carr_way_no is the id for a part of the road e.g. a raod can have many carriage way
# road id 

Index(['X', 'Y', 'OBJECTID', 'carr_way_no', 'road_id', 'road_name',
       'start_name', 'end_name', 'location', 'latest', 'count_date',
       'peak_hour', 'count_duration', 'adt', 'peaktraffic', 'pccar', 'pclcv',
       'pcmcv', 'pchcvi', 'pchcvii', 'pcbus', 'pcheavy', 'NZTMX', 'NZTMY'],
      dtype='object')

Index(['OBJECTID', 'carr_way_no', 'road_id', 'road_name', 'start_name',
       'end_name', 'cway_width', 'traff_manage_level', 'Shape__Length'],
      dtype='object')

['OBJECTID', 'carr_way_no', 'road_id', 'road_name', 'start_name', 'end_name']

In [39]:
df.shape
df_road.shape
df.isna().sum()
df_road.isna().sum()

(52967, 24)

(37474, 9)

X                     0
Y                     0
OBJECTID              0
carr_way_no         252
road_id               0
road_name             0
start_name          515
end_name            586
location             81
latest                0
count_date            0
peak_hour         26452
count_duration    24116
adt                8151
peaktraffic       32121
pccar             23699
pclcv             23699
pcmcv             23703
pchcvi            23699
pchcvii           24633
pcbus             24237
pcheavy           22462
NZTMX                 0
NZTMY                 0
dtype: int64

OBJECTID                0
carr_way_no             0
road_id                 0
road_name               0
start_name            264
end_name              297
cway_width              0
traff_manage_level    179
Shape__Length           0
dtype: int64

In [11]:
cols = [col for col in df.columns if col in df_road.columns]
df2 = df.merge(df_road, how = 'left', on = cols)
display_all(df2.head())

Unnamed: 0,X,Y,OBJECTID,carr_way_no,road_id,road_name,start_name,end_name,location,latest,count_date,peak_hour,count_duration,adt,peaktraffic,pccar,pclcv,pcmcv,pchcvi,pchcvii,pcbus,pcheavy,NZTMX,NZTMY,cway_width,traff_manage_level,Shape__Length
0,1765712.0,5878405.0,12001,103.0,70623,WAIUKU RD,ATTEWELL RD,TRAMWAY RD,2041.0,No,2000-11-07T00:00:00.000Z,,,5759.0,,,,,,,,,1765712.0,5878405.0,,,
1,1765346.0,5878127.0,12002,103.0,70623,WAIUKU RD,ATTEWELL RD,TRAMWAY RD,2501.0,No,2001-11-22T00:00:00.000Z,08:00,,5932.0,490.0,,,,,,,,1765346.0,5878127.0,,,
2,1765127.0,5877958.0,12003,103.0,70623,WAIUKU RD,ATTEWELL RD,TRAMWAY RD,2778.0,No,2002-01-16T00:00:00.000Z,16:00,,7333.0,607.0,,,,,,,,1765127.0,5877958.0,,,
3,1765127.0,5877958.0,12004,103.0,70623,WAIUKU RD,ATTEWELL RD,TRAMWAY RD,2778.0,No,2002-04-05T00:00:00.000Z,16:00,,5737.0,489.0,,,,,,,,1765127.0,5877958.0,,,
4,1765347.0,5878128.0,12005,103.0,70623,WAIUKU RD,ATTEWELL RD,TRAMWAY RD,2500.0,No,2011-02-15T00:00:00.000Z,0800,,7354.0,766.0,91.0,2.0,4.0,1.0,2.0,0.0,7.0,1765347.0,5878128.0,,,


In [21]:
cols

['OBJECTID', 'carr_way_no', 'road_id', 'road_name', 'start_name', 'end_name']

In [40]:
# cols = ['carr_way_no','road_id', 'road_name', 'start_name', 'end_name']
cols = ['carr_way_no', 'road_id', 'road_name', 'start_name', 'end_name']

In [42]:
df_test = df.merge(df_road, how = 'left', on = cols)
display_all(df_test.head())
df_test.shape

Unnamed: 0,X,Y,OBJECTID_x,carr_way_no,road_id,road_name,start_name,end_name,location,latest,count_date,peak_hour,count_duration,adt,peaktraffic,pccar,pclcv,pcmcv,pchcvi,pchcvii,pcbus,pcheavy,NZTMX,NZTMY,OBJECTID_y,cway_width,traff_manage_level,Shape__Length
0,1753117.0,5915068.0,4001,14721.0,31125,RICHARDSON RD (OWAIRAKA),HENDON AVE,O'DONNELL AVE,1218.0,No,1986-06-25T00:00:00.000Z,16:15,,,,,,,,,,,1753117.0,5915068.0,26927.0,12.2,">= 15,000 vpd Urban road, low speed",179.995992
1,1753117.0,5915068.0,4002,14721.0,31125,RICHARDSON RD (OWAIRAKA),HENDON AVE,O'DONNELL AVE,1218.0,No,1984-06-20T00:00:00.000Z,16:15,,,,,,,,,,,1753117.0,5915068.0,26927.0,12.2,">= 15,000 vpd Urban road, low speed",179.995992
2,1753117.0,5915068.0,4003,14721.0,31125,RICHARDSON RD (OWAIRAKA),HENDON AVE,O'DONNELL AVE,1218.0,No,1984-06-15T00:00:00.000Z,16:15,,,,,,,,,,,1753117.0,5915068.0,26927.0,12.2,">= 15,000 vpd Urban road, low speed",179.995992
3,1753117.0,5915068.0,4004,14721.0,31125,RICHARDSON RD (OWAIRAKA),HENDON AVE,O'DONNELL AVE,1218.0,No,1993-10-27T00:00:00.000Z,17:45,,,,,,,,,,,1753117.0,5915068.0,26927.0,12.2,">= 15,000 vpd Urban road, low speed",179.995992
4,1753117.0,5915068.0,4005,14721.0,31125,RICHARDSON RD (OWAIRAKA),HENDON AVE,O'DONNELL AVE,1218.0,No,1993-10-27T00:00:00.000Z,17:45,,,,,,,,,,,1753117.0,5915068.0,26927.0,12.2,">= 15,000 vpd Urban road, low speed",179.995992


(52967, 28)

In [43]:
df_test.isna().sum()

X                         0
Y                         0
OBJECTID_x                0
carr_way_no             252
road_id                   0
road_name                 0
start_name              515
end_name                586
location                 81
latest                    0
count_date                0
peak_hour             26452
count_duration        24116
adt                    8151
peaktraffic           32121
pccar                 23699
pclcv                 23699
pcmcv                 23703
pchcvi                23699
pchcvii               24633
pcbus                 24237
pcheavy               22462
NZTMX                     0
NZTMY                     0
OBJECTID_y              284
cway_width              284
traff_manage_level      284
Shape__Length           284
dtype: int64

In [13]:
df2.shape

(52580, 27)

In [18]:
display_all(df2[df2.latest == 'Yes'].head())
# not sure what latest mean

Unnamed: 0,X,Y,OBJECTID,carr_way_no,road_id,road_name,start_name,end_name,location,latest,count_date,peak_hour,count_duration,adt,peaktraffic,pccar,pclcv,pcmcv,pchcvi,pchcvii,pcbus,pcheavy,NZTMX,NZTMY,cway_width,traff_manage_level,Shape__Length
6,1764597.0,5877756.0,12007,35095.0,70623,WAIUKU RD,START SCHOOL ZONE,AKA AKA RD,3540.0,Yes,2009-04-30T00:00:00.000Z,08:00,,6875.0,757.0,92.0,3.0,3.0,1.0,1.0,0.0,5.0,1764597.0,5877756.0,,,
8,1763030.0,5878304.0,12009,99.0,70623,WAIUKU RD,WILY RD,WALLER RD,5203.0,Yes,2004-11-11T00:00:00.000Z,16:00,,4856.0,547.0,0.0,94.0,4.0,1.0,1.0,0.0,6.0,1763030.0,5878304.0,,,
20,1759332.0,5879754.0,12021,91.0,70623,WAIUKU RD,GLENBROOK STATION RD (POLE LHS),MORLEY RD (POWERPOLE RHS),9205.0,Yes,2000-10-22T00:00:00.000Z,,,5688.0,,,,,,,,,1759332.0,5879754.0,,,
110,1730113.0,5938962.0,12111,4217.0,11079,WISHART RD,RURAL START,RURAL END,223.0,Yes,2010-08-20T00:00:00.000Z,17:00,7D,1129.0,127.0,95.0,2.0,1.0,2.0,0.0,0.0,3.0,1730113.0,5938962.0,,,
112,1731901.0,5937459.0,12113,35157.0,11079,WISHART RD,RURAL END,INLAND RD,2945.0,Yes,2011-03-23T00:00:00.000Z,17:00,7D,1077.0,115.0,95.0,2.0,1.0,2.0,0.0,0.0,3.0,1731901.0,5937459.0,,,


In [19]:
df2.isna().sum()

X                         0
Y                         0
OBJECTID                  0
carr_way_no             232
road_id                   0
road_name                 0
start_name              492
end_name                566
location                 78
latest                    0
count_date                0
peak_hour             26452
count_duration        24116
adt                    8152
peaktraffic           32121
pccar                 23699
pclcv                 23699
pcmcv                 23703
pchcvi                23699
pchcvii               24633
pcbus                 24237
pcheavy               22462
NZTMX                     0
NZTMY                     0
cway_width            52578
traff_manage_level    52578
Shape__Length         52578
dtype: int64