In [1]:
import pandas as pd
import csv
import os

In [2]:
#Uncomment the first option to show the full content of every column, and the second to make all columns visible
#pd.set_option('display.max_colwidth', None)  # on pandas older than 1.0 use -1 instead of None
#pd.set_option('display.max_columns', 999)

## I. Import the dataset and filter the relevant columns

In [26]:
# Load the AADT export into a dataframe, reading every column as text (dtype=str),
# then list the columns that came with the file.
source_csv = '4_Dtims_Jan2020_AADT.csv'
report = pd.read_csv(source_csv, dtype=str)
report.columns

Index(['OBJECTID', 'ASSET_ID', 'FEATURE_ID', 'SITE_CODE', 'SITE_NAME',
       'LOCALITY', 'TOWN', 'LOCATION', 'ROADCLASS', 'HIERARCHY', 'CUSTOMER',
       'AREA_NAME', 'DISTANCE', 'ROAD_TYPE', 'Shape_STLength_1', 'Confidence',
       'AADT_Comb', 'AADT_Source_Comb', 'AADT_Source_Type', 'ModifiedDate',
       'Checked', 'SurveyID', 'SurveyDate', 'Comments', 'Shape_Length_1',
       'Shape_Length'],
      dtype='object')

In [27]:
# Drop the attributes that the SME validated as irrelevant.
# TODO(review): the original note on this cell says to get rid of all Shape length
# columns, yet 'Shape_Length_1' and 'Shape_Length' are not in this list — confirm with the SME.
irrelevant_columns = ['OBJECTID', 'ASSET_ID', 'LOCALITY', 'LOCATION', 'Shape_STLength_1', 'SurveyDate']
report = report.drop(columns=irrelevant_columns)

In [28]:
# Strip every space character out of the column names.
report.columns = list(map(lambda column_name: column_name.replace(" ", ""), report.columns))

In [29]:
# Display the columns currently held in `report` so the list can be reviewed
# with the SME and confirmed as relevant.
report.columns

Index(['FEATURE_ID', 'SITE_CODE', 'SITE_NAME', 'TOWN', 'ROADCLASS',
       'HIERARCHY', 'AREA_NAME', 'DISTANCE', 'ROAD_TYPE', 'Confidence',
       'AADT_Comb', 'AADT_Source_Comb', 'AADT_Source_Type', 'ModifiedDate',
       'Checked', 'SurveyID', 'Comments', 'Shape_Length_1', 'Shape_Length'],
      dtype='object')

## II. Identify the key feature and format it to avoid empty cells and multiple values per row

In [30]:
# Discard rows in which every cell is empty, then discard exact duplicate rows.
report = report.dropna(axis=0, how='all').drop_duplicates()

In [31]:
# Keep the `report_mod` name so the code of previous scripts can be reused.
# Work on an independent copy: a plain `report_mod = report` only aliases the same
# object, so a later in-place edit of report_mod (e.g. fillna(..., inplace=True))
# could silently modify `report` as well.
report_mod = report.copy()
# Show (rows, columns) of the working dataframe.
report_mod.shape

(19602, 19)

In [32]:
# Number of missing values in the key feature column (FEATURE_ID)
report_mod['FEATURE_ID'].isna().sum()

0

In [38]:
# Number of missing values in the AADT_Comb column
# ***VALIDATE. WHAT DO WE DO WITH THE NULL VALUES?***
report_mod['AADT_Comb'].isna().sum()

753

## III. Filter the possible values for the relevant attributes

#### Road Class

In [33]:
# Count the rows for each ROADCLASS value and list them from most to least frequent.
# as_index=False is intentionally not passed to groupby: from pandas 1.1 on it makes
# size() return a DataFrame, and the argument-less sort_values() then raises a TypeError.
report_mod.groupby(['ROADCLASS']).size().sort_values(ascending=False)

ROADCLASS
Unclassified Road - U           16617
Unnumbered Classified Rd - C     1223
Principal Road - A               1200
Numbered Classified Road - B      541
Not Allocated                      21
dtype: int64

#### Hierarchy

In [49]:
# Count the rows for each HIERARCHY value and list them from most to least frequent.
# VALIDATE WITH SME — reviewer note: get rid of 'L3 Surfaced'.
# as_index=False is intentionally not passed to groupby: from pandas 1.1 on it makes
# size() return a DataFrame, and the argument-less sort_values() then raises a TypeError.
report_mod.groupby(['HIERARCHY']).size().sort_values(ascending=False)

HIERARCHY
L2 Local Access             16588
L1 Local Distributor         1119
MN Main Distributor           847
SD Secondary Distributor      641
PR Primary Distributor        353
dtype: int64

In [48]:
# Remove the rows whose HIERARCHY value is out of scope and renumber the index from 0.
filter_list = ['L3 Surfaced PRoW', 'No Code Allocated']
hierarchy_out_of_scope = report_mod['HIERARCHY'].isin(filter_list)
report_mod = report_mod[~hierarchy_out_of_scope].reset_index(drop=True)

#### Road Type

In [37]:
# Count the rows for each ROAD_TYPE value and list them from most to least frequent.
# VALIDATE WITH SME — reviewer note: Slip/Feeder, remove both (TODO confirm which values this covers).
# as_index=False is intentionally not passed to groupby: from pandas 1.1 on it makes
# size() return a DataFrame, and the argument-less sort_values() then raises a TypeError.
report_mod.groupby(['ROAD_TYPE']).size().sort_values(ascending=False)

ROAD_TYPE
Single 2-Lane Carriageway         17598
Dual 2 Lane                         450
Single Lane Carriageway             425
Roundabout                          410
Carriageway Oneway 2-Lane           286
Carriageway Oneway 1-Lane           199
Slip/Feeder Road Oneway 2-Lane       50
Slip/Feeder Road Oneway 1-Lane       48
Carriageway Oxbow-Layby              48
Slip/Feeder Road Single 2-Lane       13
Dual 1 Lane                          13
Dual 3 Lane                          10
Carriageway Oneway 3-Lane             7
Single 3-Lane Carriageway             4
Slip/Feeder Road Dual 2-Lane          2
Bridleway                             2
dtype: int64

In [36]:
# Remove the rows whose ROAD_TYPE value is out of scope and renumber the index from 0.
filter_list = ['Footpath', 'BOAT', 'Shared Cycleway Footway']
road_type_out_of_scope = report_mod['ROAD_TYPE'].isin(filter_list)
report_mod = report_mod[~road_type_out_of_scope].reset_index(drop=True)

#### Confidence

In [39]:
# Count the rows for each Confidence value and list them from most to least frequent.
# VALIDATE WITH SME.
# as_index=False is intentionally not passed to groupby: from pandas 1.1 on it makes
# size() return a DataFrame, and the argument-less sort_values() then raises a TypeError.
report_mod.groupby(['Confidence']).size().sort_values(ascending=False)

Confidence
Level 0    16125
Level 1     2431
Level 3      164
Level 2       65
2004          13
2014           7
2013           4
2012           4
dtype: int64

In [45]:
# Preview the first ten rows whose Confidence is one of the unusual, year-like values.
# **SEND TO TONY THE LEVELS NOT REGISTERED**
filter_confidence = ['2014', '2004', '2013', '2012']
unusual_confidence = report_mod['Confidence'].isin(filter_confidence)
report_mod[unusual_confidence].head(10)


Unnamed: 0,FEATURE_ID,SITE_CODE,SITE_NAME,TOWN,ROADCLASS,HIERARCHY,AREA_NAME,DISTANCE,ROAD_TYPE,Confidence,AADT_Comb,AADT_Source_Comb,AADT_Source_Type,ModifiedDate,Checked,SurveyID,Comments,Shape_Length_1,Shape_Length
760,A120/110,12401698,Bishops Stortford Bypass,Bishop's Stortford,Principal Road - A,PR Primary Distributor,E: 3-East Herts District,1747,Single 2-Lane Carriageway,2013,18259,2013 site 599,Actual Count,2017-05-03 12:32,Y,,,1865.752903,1747.442706
766,B158/310,43070144,B158 between East Hertfordshire boundary and H...,Hatfield,Numbered Classified Road - B,SD Secondary Distributor,MW: 0-Welwyn Hatfield Area,565,Single 2-Lane Carriageway,2004,6120,2004 Traffic Counts,Actual Count,2017-05-03 12:12,Y,,,570.2611014,570.2611014
2068,A414/700,12414873,Rush Green Link,Hertford,Principal Road - A,PR Primary Distributor,E: 3-East Herts District,1343,Dual 2 Lane,2012,15480,2012 site 540 EB,Actual Count,2017-05-03 12:15,Y,,,1346.524698,1346.524698
2070,A414/701,12414873,Rush Green Link,Hertford,Principal Road - A,PR Primary Distributor,E: 3-East Herts District,1368,Dual 2 Lane,2012,18454,2012 site 540 WB,Actual Count,2017-05-03 12:15,Y,,,1356.797996,1356.797996
2117,B158/710,12420902,Wadesmill Road,Chapmore End,Numbered Classified Road - B,SD Secondary Distributor,E: 3-East Herts District,1563,Single 2-Lane Carriageway,2014,6290,2014 site 307,Actual Count,2017-05-03 12:42,Y,,,1552.809283,1552.809283
2204,B1004/920,12414954,Rye Street,Bishop's Stortford,Numbered Classified Road - B,SD Secondary Distributor,E: 3-East Herts District,189,Single 2-Lane Carriageway,2004,9348,2004 Traffic Counts,Actual Count,2017-05-03 12:37,Y,,,209.2632048,189.4853178
2206,B1037/110,12425007,Stevenage Road,Walkern,Numbered Classified Road - B,SD Secondary Distributor,E: 3-East Herts District,1516,Single 2-Lane Carriageway,2004,5646,2004 Traffic Counts,Actual Count,2017-05-03 14:03,Y,,,1515.989231,1515.989231
2214,B1038/50,12403065,B1038 From Its Junction With Biggin Hill To An...,Buntingford,Numbered Classified Road - B,SD Secondary Distributor,E: 3-East Herts District,257,Single 2-Lane Carriageway,2004,1862,2004 Traffic Counts,Actual Count,2017-05-03 12:47,Y,,,256.9900257,256.9900257
2224,B1368/30,12404960,Dassels Hill,Dassels,Numbered Classified Road - B,SD Secondary Distributor,E: 3-East Herts District,419,Single 2-Lane Carriageway,2004,2641,2004 Traffic Counts,Actual Count,2017-05-03 12:48,Y,,,406.5598586,406.5598586
2837,A1184/60,12410490,London Road,Spellbrook,Principal Road - A,MN Main Distributor,E: 3-East Herts District,645,Single 2-Lane Carriageway,2004,20024,2004 Traffic Counts,Actual Count,2017-05-03 12:31,Y,,,562.9987121,564.3757668


In [47]:
# Export every row with an unusual Confidence value so the list can be sent to Tony.
to_tony = report_mod[report_mod['Confidence'].isin(filter_confidence)]
# Written relative to the working directory, the same way the input CSV is read,
# instead of a hard-coded absolute path that only exists on one user's machine.
# NOTE(review): assumes the notebook runs from the folder that should receive the file — confirm.
to_tony.to_csv('unusual_confidence_levels.csv')

#### Source Type

In [46]:
# Count the rows for each AADT_Source_Type value and list them from most to least frequent.
# VALIDATE WITH SME.
# as_index=False is intentionally not passed to groupby: from pandas 1.1 on it makes
# size() return a DataFrame, and the argument-less sort_values() then raises a TypeError.
report_mod.groupby(['AADT_Source_Type']).size().sort_values(ascending=False)


AADT_Source_Type
Default Value                          15092
Actual Count                            2021
Estimate                                1464
Mean from actuals on THIS link only      231
Local Estimate                             4
Local estimate                             1
dtype: int64

## Fill the blanks in the rest of the categories with "Undefined"

In [210]:
# 'LatestWorksCost' is not among the columns of this dataset (presumably a leftover
# from a previous script), so indexing it unconditionally raises a KeyError.
# Default it to '0' only when the column is actually present.
if 'LatestWorksCost' in report_mod.columns:
    report_mod['LatestWorksCost'] = report_mod['LatestWorksCost'].fillna(value='0')

# Every remaining empty cell becomes the text 'Undefined'.
# NOTE(review): this also fills the missing AADT_Comb values flagged earlier in the
# notebook with 'Undefined' — confirm with the SME that this is the intended treatment.
report_mod = report_mod.fillna(value='Undefined')

### Save to a new CSV file 

In [212]:
# Save the processed dataframe to a new CSV file.
# Written relative to the working directory, the same way the input CSV is read,
# instead of a hard-coded absolute path that only exists on one user's machine.
# NOTE(review): assumes the notebook runs from the folder that should receive the file — confirm.
report_mod.to_csv('AADT_processed_v1.csv')