# Summary
Process NBI for state bridges
- Filters:
    - structures in VA
    - excludes colverts
    - 10-year window minimum
    - no reconstruction
- Target categories:
    - Sufficient (suff. rating >= 80%)
    - Monitor (suff. rating < 80% and >= 50%)
    - Critical (suff. rating < 50%)

# Imports

In [1]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data

In [2]:
# feature data year
feat_year = 2007
# year for target outcome
tar_year = 2017

In [3]:
local_path = 'data/VA/'
# files = glob.glob(local_path + '*.txt')
# files.sort()
feat_file = 'VA07.txt'
tar_file = 'VA17.txt'

In [4]:
df_feat_raw = pd.read_csv(local_path + feat_file)
df_feat_raw.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,RECORD_TYPE_005A,ROUTE_PREFIX_005B,SERVICE_LEVEL_005C,ROUTE_NUMBER_005D,DIRECTION_005E,HIGHWAY_DISTRICT_002,COUNTY_CODE_003,PLACE_CODE_004,...,PROJ_NO,PROJ_SUFFIX,NBI_TYPE_OF_IMP,DTL_TYPE_OF_IMP,SPECIAL_CODE,STEP_CODE,STATUS_WITH_10YR_RULE,SUFFICIENCY_ASTERC,SUFFICIENCY_RATING,STATUS_NO_10YR_RULE
0,51,1VA0069,1,8,8.0,0,0,5,710.0,51087.0,...,,,,,,,1,,12.0,1
1,51,1VA0075,1,8,0.0,0,0,0,199.0,0.0,...,,,,,,,0,,93.0,0
2,51,1VA0079,1,8,8.0,0,0,5,710.0,51087.0,...,,,,,,,0,,76.7,0
3,51,1VA0158,1,8,0.0,0,0,0,99.0,21008.0,...,,,,,,,1,,13.5,1
4,51,1VA0159,1,8,0.0,0,0,0,99.0,21008.0,...,,,,,,,0,,89.2,0


In [5]:
df_tar_raw = pd.read_csv(local_path + tar_file)
df_tar_raw.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,RECORD_TYPE_005A,ROUTE_PREFIX_005B,SERVICE_LEVEL_005C,ROUTE_NUMBER_005D,DIRECTION_005E,HIGHWAY_DISTRICT_002,COUNTY_CODE_003,PLACE_CODE_004,...,DTL_TYPE_OF_IMP,SPECIAL_CODE,STEP_CODE,STATUS_WITH_10YR_RULE,SUFFICIENCY_ASTERC,SUFFICIENCY_RATING,STATUS_NO_10YR_RULE,CAT10,CAT23,CAT29
0,51,1VA0075,1,8,0,0,0,0,199,0,...,,,,0,,93.0,0,G,7,558.72
1,51,1VA0158,1,8,0,0,0,0,99,21008,...,,,,1,,13.0,1,P,4,877.8
2,51,1VA0159,1,8,0,0,0,0,99,21008,...,,,,0,,90.2,0,F,6,186.24
3,51,1VA0174,1,8,0,0,0,0,740,710,...,,,,0,,99.0,0,G,7,141.52
4,51,1VA0257,1,8,0,0,0,0,810,51128,...,,,,0,,100.0,0,G,7,1314.87


## Assign

In [6]:
df_feat = df_feat_raw.copy()
df_tar = df_tar_raw.copy()

# Filter

## Filter out culverts

In [7]:
# remove observations coded with 19 (culvert) in item 43B
df_feat = df_feat[df_feat['STRUCTURE_TYPE_043B'] != 19]
df_tar = df_tar[df_tar['STRUCTURE_TYPE_043B'] != 19]

## Filter out duplicate records

In [8]:
# remove non-1 codes for item 5A (record type)
df_feat = df_feat[df_feat['RECORD_TYPE_005A'] == 1]
df_tar = df_tar[df_tar['RECORD_TYPE_005A'] == 1]

# Merge

In [9]:
# specify target columns to merge
tar_cols = [
    'STATE_CODE_001', 
    'STRUCTURE_NUMBER_008',
    'YEAR_RECONSTRUCTED_106',
    'DECK_COND_058',
    'SUPERSTRUCTURE_COND_059',
    'SUBSTRUCTURE_COND_060', 
    'SUFFICIENCY_RATING'
]

In [10]:
# drop target values from feature dataframe
df_feat.drop(columns=[
    'DECK_COND_058', 
    'SUPERSTRUCTURE_COND_059', 
    'SUBSTRUCTURE_COND_060', 
    'SUFFICIENCY_RATING'
], inplace=True)

In [11]:
df_bridges = pd.merge(
    df_feat, 
    df_tar[tar_cols], 
    how='left', 
    on=['STATE_CODE_001', 'STRUCTURE_NUMBER_008'], 
    suffixes=('_feat_yr', '_tar_yr')
)
df_bridges.head()

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,RECORD_TYPE_005A,ROUTE_PREFIX_005B,SERVICE_LEVEL_005C,ROUTE_NUMBER_005D,DIRECTION_005E,HIGHWAY_DISTRICT_002,COUNTY_CODE_003,PLACE_CODE_004,...,SPECIAL_CODE,STEP_CODE,STATUS_WITH_10YR_RULE,SUFFICIENCY_ASTERC,STATUS_NO_10YR_RULE,YEAR_RECONSTRUCTED_106_tar_yr,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060,SUFFICIENCY_RATING
0,51,1VA0069,1,8,8.0,0,0,5,710.0,51087.0,...,,,1,,1,,,,,
1,51,1VA0075,1,8,0.0,0,0,0,199.0,0.0,...,,,0,,0,,7.0,7.0,7.0,93.0
2,51,1VA0079,1,8,8.0,0,0,5,710.0,51087.0,...,,,0,,0,,,,,
3,51,1VA0158,1,8,0.0,0,0,0,99.0,21008.0,...,,,1,,1,1987.0,5.0,4.0,4.0,13.0
4,51,1VA0159,1,8,0.0,0,0,0,99.0,21008.0,...,,,0,,0,,7.0,6.0,7.0,90.2


## Filter out reconstructed

In [12]:
# remove bridges reconstructed after feature dataset year
df_bridges = df_bridges[df_bridges['YEAR_RECONSTRUCTED_106_tar_yr'] <= feat_year]
df_bridges.drop(columns='YEAR_RECONSTRUCTED_106_tar_yr', inplace=True)
df_bridges.rename(columns={'YEAR_RECONSTRUCTED_106_feat_yr': 'YEAR_RECONSTRUCTED_106'}, inplace=True)

## Drop Duplicates

In [13]:
# drop any remaining duplicate records
df_bridges.drop_duplicates(subset=['STATE_CODE_001', 'STRUCTURE_NUMBER_008'], inplace=True)

# Pre-processing

## Feature Data

In [14]:
# id columns
id_cols = [
    'STATE_CODE_001', 
    'STRUCTURE_NUMBER_008',
    'COUNTY_CODE_003',
    'FEATURES_DESC_006A',
    'FACILITY_CARRIED_007',
    'LOCATION_009'
]

In [15]:
# numeric features
num_cols = [
    'MIN_VERT_CLR_010',  
    'DETOUR_KILOS_019', 
    'YEAR_BUILT_027', # modify to age at record year
    'TRAFFIC_LANES_ON_028A', 
    'TRAFFIC_LANES_UND_028B',
    'ADT_029',
    'APPR_WIDTH_MT_032',
    'DEGREES_SKEW_034',
    'NAV_VERT_CLR_MT_039', 
    'NAV_HORR_CLR_MT_040',
    'MAIN_UNIT_SPANS_045', 
    'APPR_SPANS_046',
    'HORR_CLR_MT_047', 
    'MAX_SPAN_LEN_MT_048', 
    'STRUCTURE_LEN_MT_049',
    'LEFT_CURB_MT_050A', 
    'RIGHT_CURB_MT_050B', 
    'ROADWAY_WIDTH_MT_051',
    'DECK_WIDTH_MT_052', 
    'VERT_CLR_OVER_MT_053', 
    'VERT_CLR_UND_054B', 
    'LAT_UND_REF_055A', 
    'LAT_UND_MT_055B',
    'LEFT_LAT_UND_MT_056',
    'YEAR_RECONSTRUCTED_106' # modify to years since reconstruction
]

In [16]:
# categorical features
cat_cols = [
    'TOLL_020',
    'MAINTENANCE_021',
    'FUNCTIONAL_CLASS_026',
    'DESIGN_LOAD_031',
    'MEDIAN_CODE_033', 
    'STRUCTURE_FLARED_035',
    'RAILINGS_036A',
    'TRANSITIONS_036B', 
    'APPR_RAIL_036C',
    'APPR_RAIL_END_036D', 
    'HISTORY_037', 
    'NAVIGATION_038',
    'OPEN_CLOSED_POSTED_041',
    'SERVICE_ON_042A', 
    'SERVICE_UND_042B', 
    'STRUCTURE_KIND_043A',
    'STRUCTURE_TYPE_043B', 
    'APPR_KIND_044A', 
    'APPR_TYPE_044B',
    'DECK_STRUCTURE_TYPE_107', 
    'SURFACE_TYPE_108A', 
    'MEMBRANE_TYPE_108B',
    'DECK_PROTECTION_108C', 
    'PERCENT_ADT_TRUCK_109'
]

In [18]:
# reduce to specified, ordered columns
df_bridges = df_bridges[id_cols + num_cols + cat_cols + \
    ['DECK_COND_058',
    'SUPERSTRUCTURE_COND_059',
    'SUBSTRUCTURE_COND_060', 
    'SUFFICIENCY_RATING']]
df_bridges.head()

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,COUNTY_CODE_003,FEATURES_DESC_006A,FACILITY_CARRIED_007,LOCATION_009,MIN_VERT_CLR_010,DETOUR_KILOS_019,YEAR_BUILT_027,TRAFFIC_LANES_ON_028A,...,APPR_TYPE_044B,DECK_STRUCTURE_TYPE_107,SURFACE_TYPE_108A,MEMBRANE_TYPE_108B,DECK_PROTECTION_108C,PERCENT_ADT_TRUCK_109,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060,SUFFICIENCY_RATING
3,51,1VA0158,99.0,'Gambo Creek ','Tisdale Rd ','1km N of Dahlgren Rd ',99.99,5.0,1940.0,2.0,...,0.0,1,0,0,0,5.0,5,4,4,13.0
6,51,1VA0575,740.0,'Dale St. & N&P RR ','Williams Avenue ','At Gate 36 and Elm Ave. ',99.99,1.0,1942.0,2.0,...,1.0,1,6,0,0,5.0,5,3,5,7.0
7,51,1VA0591,810.0,'Drainage Canal ','Golf Cart Path ','Near 9th Hole ',99.99,1.0,1965.0,2.0,...,0.0,1,0,0,0,0.0,5,5,6,48.0
9,51,1VA2106,810.0,'Lake Whitehurst Outlet ','Guam Road ','1 KM NW of Ferry Rd ',99.99,5.0,1954.0,2.0,...,0.0,1,0,0,0,5.0,5,6,6,79.5
10,51,1VA2107,810.0,'Eastern Shore RR ','Amphibious Drive ','0.2 KM W of Abbott Rd ',99.99,2.0,1955.0,2.0,...,2.0,1,0,0,0,5.0,6,5,6,63.4


## Replace Year with Age

In [19]:
# years since construction
df_bridges['YEAR_BUILT_027'] = feat_year - df_bridges['YEAR_BUILT_027']
# rename column
df_bridges.rename(columns={'YEAR_BUILT_027': 'AGE'}, inplace=True)

In [22]:
# years since reconstruction
df_bridges['YEAR_RECONSTRUCTED_106'] = df_bridges.apply(
    lambda x: np.where(x['YEAR_RECONSTRUCTED_106'] == 0, 
                       x['AGE'], 
                       feat_year - x['YEAR_RECONSTRUCTED_106']), 
    axis=1)
# rename column
df_bridges.rename(columns={'YEAR_RECONSTRUCTED_106': 'RECON_AGE'}, inplace=True)

In [23]:
df_bridges.head()

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,COUNTY_CODE_003,FEATURES_DESC_006A,FACILITY_CARRIED_007,LOCATION_009,MIN_VERT_CLR_010,DETOUR_KILOS_019,AGE,TRAFFIC_LANES_ON_028A,...,APPR_TYPE_044B,DECK_STRUCTURE_TYPE_107,SURFACE_TYPE_108A,MEMBRANE_TYPE_108B,DECK_PROTECTION_108C,PERCENT_ADT_TRUCK_109,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060,SUFFICIENCY_RATING
3,51,1VA0158,99.0,'Gambo Creek ','Tisdale Rd ','1km N of Dahlgren Rd ',99.99,5.0,67.0,2.0,...,0.0,1,0,0,0,5.0,5,4,4,13.0
6,51,1VA0575,740.0,'Dale St. & N&P RR ','Williams Avenue ','At Gate 36 and Elm Ave. ',99.99,1.0,65.0,2.0,...,1.0,1,6,0,0,5.0,5,3,5,7.0
7,51,1VA0591,810.0,'Drainage Canal ','Golf Cart Path ','Near 9th Hole ',99.99,1.0,42.0,2.0,...,0.0,1,0,0,0,0.0,5,5,6,48.0
9,51,1VA2106,810.0,'Lake Whitehurst Outlet ','Guam Road ','1 KM NW of Ferry Rd ',99.99,5.0,53.0,2.0,...,0.0,1,0,0,0,5.0,5,6,6,79.5
10,51,1VA2107,810.0,'Eastern Shore RR ','Amphibious Drive ','0.2 KM W of Abbott Rd ',99.99,2.0,52.0,2.0,...,2.0,1,0,0,0,5.0,6,5,6,63.4


# Export dataframe

In [7]:
# ID column for index
bridge_id = df_bridges['STATE_CODE_001'].astype(str) + \
'-' + df_bridges['STRUCTURE_NUMBER_008'].astype(str).apply(lambda x: x.strip())
# set index to unique structure id
df_bridges.set_index(bridge_id, inplace=True)
df_bridges.head()

Unnamed: 0,COUNTY_CODE_003,FEATURES_DESC_006A,FACILITY_CARRIED_007,LOCATION_009,MIN_VERT_CLR_010,DETOUR_KILOS_019,TOLL_020,MAINTENANCE_021,FUNCTIONAL_CLASS_026,YEAR_BUILT_027,...,VERT_CLR_UND_054B,LAT_UND_REF_055A,LAT_UND_MT_055B,LEFT_LAT_UND_MT_056,YEAR_RECONSTRUCTED_106,DECK_STRUCTURE_TYPE_107,SURFACE_TYPE_108A,MEMBRANE_TYPE_108B,DECK_PROTECTION_108C,PERCENT_ADT_TRUCK_109
51-1VA0069,710.0,'Willoughby Bay ',"'""A"" Street '",'1.2 KM from Gate 4A ',99.99,1.0,3.0,73.0,19.0,1941.0,...,0.0,N,0.0,0.0,0.0,1,0,0,0,1.0
51-1VA0075,199.0,'Felgates Creek ','Felgates Rd ','0.5km SW of West Rd ',99.99,5.0,3.0,73.0,9.0,2005.0,...,0.0,N,0.0,0.0,0.0,1,6,0,0,5.0
51-1VA0079,710.0,'Boush Creeek ','Patrol Road ','East of I-564 at 3rd Ave ',99.99,199.0,3.0,73.0,19.0,1952.0,...,0.0,N,0.0,0.0,0.0,1,0,0,0,10.0
51-1VA0158,99.0,'Gambo Creek ','Tisdale Rd ','1km N of Dahlgren Rd ',99.99,5.0,3.0,73.0,9.0,1940.0,...,0.0,N,0.0,0.0,1987.0,1,0,0,0,5.0
51-1VA0159,99.0,'Gambo Creek ','Bagby Rd ','200m NE of Caskey Rd ',99.99,5.0,3.0,73.0,19.0,1991.0,...,0.0,N,0.0,0.0,0.0,1,0,0,0,5.0


In [None]:
df_bridges.to_csv('data/bridges.csv')