# Summary
Process NBI for state bridges
- Filters:
    - structures in VA
    - excludes culverts
    - 10-year window minimum
    - no reconsted bridges after feature input year

# Imports

In [1]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data

In [2]:
# feature data year
feat_year = 2007
# year for target outcome
tar_year = 2017

In [3]:
local_path = 'data/VA/'
# files = glob.glob(local_path + '*.txt')
# files.sort()
feat_file = 'VA07.txt'
tar_file = 'VA17.txt'

In [4]:
df_feat_raw = pd.read_csv(local_path + feat_file)
df_feat_raw.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,RECORD_TYPE_005A,ROUTE_PREFIX_005B,SERVICE_LEVEL_005C,ROUTE_NUMBER_005D,DIRECTION_005E,HIGHWAY_DISTRICT_002,COUNTY_CODE_003,PLACE_CODE_004,...,PROJ_NO,PROJ_SUFFIX,NBI_TYPE_OF_IMP,DTL_TYPE_OF_IMP,SPECIAL_CODE,STEP_CODE,STATUS_WITH_10YR_RULE,SUFFICIENCY_ASTERC,SUFFICIENCY_RATING,STATUS_NO_10YR_RULE
0,51,1VA0069,1,8,8.0,0,0,5,710.0,51087.0,...,,,,,,,1,,12.0,1
1,51,1VA0075,1,8,0.0,0,0,0,199.0,0.0,...,,,,,,,0,,93.0,0
2,51,1VA0079,1,8,8.0,0,0,5,710.0,51087.0,...,,,,,,,0,,76.7,0
3,51,1VA0158,1,8,0.0,0,0,0,99.0,21008.0,...,,,,,,,1,,13.5,1
4,51,1VA0159,1,8,0.0,0,0,0,99.0,21008.0,...,,,,,,,0,,89.2,0


In [5]:
df_tar_raw = pd.read_csv(local_path + tar_file)
df_tar_raw.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,RECORD_TYPE_005A,ROUTE_PREFIX_005B,SERVICE_LEVEL_005C,ROUTE_NUMBER_005D,DIRECTION_005E,HIGHWAY_DISTRICT_002,COUNTY_CODE_003,PLACE_CODE_004,...,DTL_TYPE_OF_IMP,SPECIAL_CODE,STEP_CODE,STATUS_WITH_10YR_RULE,SUFFICIENCY_ASTERC,SUFFICIENCY_RATING,STATUS_NO_10YR_RULE,CAT10,CAT23,CAT29
0,51,1VA0075,1,8,0,0,0,0,199,0,...,,,,0,,93.0,0,G,7,558.72
1,51,1VA0158,1,8,0,0,0,0,99,21008,...,,,,1,,13.0,1,P,4,877.8
2,51,1VA0159,1,8,0,0,0,0,99,21008,...,,,,0,,90.2,0,F,6,186.24
3,51,1VA0174,1,8,0,0,0,0,740,710,...,,,,0,,99.0,0,G,7,141.52
4,51,1VA0257,1,8,0,0,0,0,810,51128,...,,,,0,,100.0,0,G,7,1314.87


## Assign

In [6]:
df_feat = df_feat_raw.copy()
df_tar = df_tar_raw.copy()

# Filter

## Filter out culverts

In [7]:
# remove observations coded with 19 (culvert) in item 43B
df_feat = df_feat[df_feat['STRUCTURE_TYPE_043B'] != 19]
df_tar = df_tar[df_tar['STRUCTURE_TYPE_043B'] != 19]

## Filter out duplicate records

In [8]:
# remove non-1 codes for item 5A (record type)
df_feat = df_feat[df_feat['RECORD_TYPE_005A'] == 1]
df_tar = df_tar[df_tar['RECORD_TYPE_005A'] == 1]

# Merge

In [9]:
# specify target columns to merge
tar_cols = [
    'STATE_CODE_001', 
    'STRUCTURE_NUMBER_008',
    'YEAR_RECONSTRUCTED_106',
#     'DECK_COND_058',
#     'SUPERSTRUCTURE_COND_059',
#     'SUBSTRUCTURE_COND_060', 
    'SUFFICIENCY_RATING'
]

In [10]:
# drop target values from feature dataframe
# df_feat.drop(columns=[
#     'DECK_COND_058', 
#     'SUPERSTRUCTURE_COND_059', 
#     'SUBSTRUCTURE_COND_060', 
#     'SUFFICIENCY_RATING'
# ], inplace=True)

In [11]:
df_bridges = pd.merge(
    df_feat, 
    df_tar[tar_cols], 
    how='left', 
    on=['STATE_CODE_001', 'STRUCTURE_NUMBER_008'], 
    suffixes=('_feat_yr', '_tar_yr')
)
df_bridges.head()

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,RECORD_TYPE_005A,ROUTE_PREFIX_005B,SERVICE_LEVEL_005C,ROUTE_NUMBER_005D,DIRECTION_005E,HIGHWAY_DISTRICT_002,COUNTY_CODE_003,PLACE_CODE_004,...,NBI_TYPE_OF_IMP,DTL_TYPE_OF_IMP,SPECIAL_CODE,STEP_CODE,STATUS_WITH_10YR_RULE,SUFFICIENCY_ASTERC,SUFFICIENCY_RATING_feat_yr,STATUS_NO_10YR_RULE,YEAR_RECONSTRUCTED_106_tar_yr,SUFFICIENCY_RATING_tar_yr
0,51,1VA0069,1,8,8.0,0,0,5,710.0,51087.0,...,,,,,1,,12.0,1,,
1,51,1VA0075,1,8,0.0,0,0,0,199.0,0.0,...,,,,,0,,93.0,0,,93.0
2,51,1VA0079,1,8,8.0,0,0,5,710.0,51087.0,...,,,,,0,,76.7,0,,
3,51,1VA0158,1,8,0.0,0,0,0,99.0,21008.0,...,,,,,1,,13.5,1,1987.0,13.0
4,51,1VA0159,1,8,0.0,0,0,0,99.0,21008.0,...,,,,,0,,89.2,0,,90.2


## Filter out reconstructed

In [12]:
# remove bridges reconstructed after feature dataset year
df_bridges = df_bridges[df_bridges['YEAR_RECONSTRUCTED_106_tar_yr'] <= feat_year].copy()
df_bridges.drop(columns='YEAR_RECONSTRUCTED_106_tar_yr', inplace=True)
df_bridges.rename(columns={'YEAR_RECONSTRUCTED_106_feat_yr': 'YEAR_RECONSTRUCTED_106'}, inplace=True)

## Drop Duplicates

In [13]:
# drop any remaining duplicate records
df_bridges.drop_duplicates(subset=['STATE_CODE_001', 'STRUCTURE_NUMBER_008'], inplace=True)

# Feature Data

In [14]:
# id columns
id_cols = [
    'STATE_CODE_001', 
    'STRUCTURE_NUMBER_008',
    'COUNTY_CODE_003',
    'FEATURES_DESC_006A',
    'FACILITY_CARRIED_007',
    'LOCATION_009'
]

In [15]:
# numeric features
num_cols = [
    'MIN_VERT_CLR_010',  
    'DETOUR_KILOS_019', 
    'YEAR_BUILT_027', # modify to age at record year
    'TRAFFIC_LANES_ON_028A', 
    'TRAFFIC_LANES_UND_028B',
    'ADT_029',
    'APPR_WIDTH_MT_032',
    'DEGREES_SKEW_034',
    'NAV_VERT_CLR_MT_039', 
    'NAV_HORR_CLR_MT_040',
    'MAIN_UNIT_SPANS_045', 
    'APPR_SPANS_046',
    'HORR_CLR_MT_047', 
    'MAX_SPAN_LEN_MT_048', 
    'STRUCTURE_LEN_MT_049',
    'LEFT_CURB_MT_050A', 
    'RIGHT_CURB_MT_050B', 
    'ROADWAY_WIDTH_MT_051',
    'DECK_WIDTH_MT_052', 
    'VERT_CLR_OVER_MT_053', 
    'VERT_CLR_UND_054B', 
    'LAT_UND_MT_055B',
    'LEFT_LAT_UND_MT_056',
    'YEAR_RECONSTRUCTED_106', # modify to years since reconstruction
    'PERCENT_ADT_TRUCK_109',
    'SUFFICIENCY_RATING_feat_yr'
]

In [16]:
# categorical features
cat_cols = [
    'TOLL_020',
    'MAINTENANCE_021',
    'FUNCTIONAL_CLASS_026',
    'DESIGN_LOAD_031',
    'MEDIAN_CODE_033', 
    'STRUCTURE_FLARED_035',
    'RAILINGS_036A',
    'TRANSITIONS_036B', 
    'APPR_RAIL_036C',
    'APPR_RAIL_END_036D', 
    'HISTORY_037', 
    'NAVIGATION_038',
    'OPEN_CLOSED_POSTED_041',
    'SERVICE_ON_042A', 
    'SERVICE_UND_042B', 
    'STRUCTURE_KIND_043A',
    'STRUCTURE_TYPE_043B', 
    'APPR_KIND_044A', 
    'APPR_TYPE_044B',
    'DECK_STRUCTURE_TYPE_107', 
    'SURFACE_TYPE_108A', 
    'MEMBRANE_TYPE_108B',
    'DECK_PROTECTION_108C',
    'DECK_COND_058',
    'SUPERSTRUCTURE_COND_059',
    'SUBSTRUCTURE_COND_060'
]

In [18]:
# reduce to specified, ordered columns
df_bridges = df_bridges[id_cols + num_cols + cat_cols + ['SUFFICIENCY_RATING_tar_yr']]
df_bridges.head()

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,COUNTY_CODE_003,FEATURES_DESC_006A,FACILITY_CARRIED_007,LOCATION_009,MIN_VERT_CLR_010,DETOUR_KILOS_019,YEAR_BUILT_027,TRAFFIC_LANES_ON_028A,...,APPR_KIND_044A,APPR_TYPE_044B,DECK_STRUCTURE_TYPE_107,SURFACE_TYPE_108A,MEMBRANE_TYPE_108B,DECK_PROTECTION_108C,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060,SUFFICIENCY_RATING_tar_yr
3,51,1VA0158,99.0,'Gambo Creek ','Tisdale Rd ','1km N of Dahlgren Rd ',99.99,5.0,1940.0,2.0,...,0.0,0.0,1,0,0,0,5,4,4,13.0
6,51,1VA0575,740.0,'Dale St. & N&P RR ','Williams Avenue ','At Gate 36 and Elm Ave. ',99.99,1.0,1942.0,2.0,...,5.0,1.0,1,6,0,0,5,5,5,7.0
7,51,1VA0591,810.0,'Drainage Canal ','Golf Cart Path ','Near 9th Hole ',99.99,1.0,1965.0,2.0,...,0.0,0.0,1,0,0,0,5,5,6,48.0
9,51,1VA2106,810.0,'Lake Whitehurst Outlet ','Guam Road ','1 KM NW of Ferry Rd ',99.99,5.0,1954.0,2.0,...,0.0,0.0,1,0,0,0,5,6,5,79.5
10,51,1VA2107,810.0,'Eastern Shore RR ','Amphibious Drive ','0.2 KM W of Abbott Rd ',99.99,2.0,1955.0,2.0,...,3.0,2.0,1,0,0,0,5,5,4,63.4


## Data Types

In [19]:
# change numeric values to int or float types
for col in num_cols:
    df_bridges[col] = pd.to_numeric(df_bridges[col], errors='coerce')

In [20]:
# change categorical values to string type
df_bridges[cat_cols] = df_bridges[cat_cols].astype(str)

In [21]:
df_bridges.dtypes

STATE_CODE_001                  int64
STRUCTURE_NUMBER_008           object
COUNTY_CODE_003               float64
FEATURES_DESC_006A             object
FACILITY_CARRIED_007           object
LOCATION_009                   object
MIN_VERT_CLR_010              float64
DETOUR_KILOS_019              float64
YEAR_BUILT_027                float64
TRAFFIC_LANES_ON_028A         float64
TRAFFIC_LANES_UND_028B          int64
ADT_029                       float64
APPR_WIDTH_MT_032             float64
DEGREES_SKEW_034              float64
NAV_VERT_CLR_MT_039           float64
NAV_HORR_CLR_MT_040           float64
MAIN_UNIT_SPANS_045           float64
APPR_SPANS_046                float64
HORR_CLR_MT_047               float64
MAX_SPAN_LEN_MT_048           float64
STRUCTURE_LEN_MT_049          float64
LEFT_CURB_MT_050A             float64
RIGHT_CURB_MT_050B            float64
ROADWAY_WIDTH_MT_051          float64
DECK_WIDTH_MT_052             float64
VERT_CLR_OVER_MT_053          float64
VERT_CLR_UND

## Replace Year with Age

In [22]:
# years since construction
df_bridges['YEAR_BUILT_027'] = feat_year - df_bridges['YEAR_BUILT_027']
# rename column
df_bridges.rename(columns={'YEAR_BUILT_027': 'AGE'}, inplace=True)

In [23]:
# years since reconstruction
df_bridges['YEAR_RECONSTRUCTED_106'] = df_bridges.apply(
    lambda x: np.where(x['YEAR_RECONSTRUCTED_106'] == 0, 
                       x['AGE'], 
                       feat_year - x['YEAR_RECONSTRUCTED_106']), 
    axis=1)
# rename column
df_bridges.rename(columns={'YEAR_RECONSTRUCTED_106': 'RECON_AGE'}, inplace=True)

In [24]:
df_bridges.head()

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,COUNTY_CODE_003,FEATURES_DESC_006A,FACILITY_CARRIED_007,LOCATION_009,MIN_VERT_CLR_010,DETOUR_KILOS_019,AGE,TRAFFIC_LANES_ON_028A,...,APPR_KIND_044A,APPR_TYPE_044B,DECK_STRUCTURE_TYPE_107,SURFACE_TYPE_108A,MEMBRANE_TYPE_108B,DECK_PROTECTION_108C,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060,SUFFICIENCY_RATING_tar_yr
3,51,1VA0158,99.0,'Gambo Creek ','Tisdale Rd ','1km N of Dahlgren Rd ',99.99,5.0,67.0,2.0,...,0.0,0.0,1,0,0,0,5,4,4,13.0
6,51,1VA0575,740.0,'Dale St. & N&P RR ','Williams Avenue ','At Gate 36 and Elm Ave. ',99.99,1.0,65.0,2.0,...,5.0,1.0,1,6,0,0,5,5,5,7.0
7,51,1VA0591,810.0,'Drainage Canal ','Golf Cart Path ','Near 9th Hole ',99.99,1.0,42.0,2.0,...,0.0,0.0,1,0,0,0,5,5,6,48.0
9,51,1VA2106,810.0,'Lake Whitehurst Outlet ','Guam Road ','1 KM NW of Ferry Rd ',99.99,5.0,53.0,2.0,...,0.0,0.0,1,0,0,0,5,6,5,79.5
10,51,1VA2107,810.0,'Eastern Shore RR ','Amphibious Drive ','0.2 KM W of Abbott Rd ',99.99,2.0,52.0,2.0,...,3.0,2.0,1,0,0,0,5,5,4,63.4


# NaNs

In [25]:
df_bridges.isna().sum()

STATE_CODE_001                  0
STRUCTURE_NUMBER_008            0
COUNTY_CODE_003                 0
FEATURES_DESC_006A              0
FACILITY_CARRIED_007            0
LOCATION_009                    0
MIN_VERT_CLR_010                0
DETOUR_KILOS_019                0
AGE                             0
TRAFFIC_LANES_ON_028A           0
TRAFFIC_LANES_UND_028B          0
ADT_029                         0
APPR_WIDTH_MT_032               0
DEGREES_SKEW_034                0
NAV_VERT_CLR_MT_039             0
NAV_HORR_CLR_MT_040             0
MAIN_UNIT_SPANS_045             0
APPR_SPANS_046                  0
HORR_CLR_MT_047                 0
MAX_SPAN_LEN_MT_048             0
STRUCTURE_LEN_MT_049            0
LEFT_CURB_MT_050A               0
RIGHT_CURB_MT_050B              0
ROADWAY_WIDTH_MT_051            0
DECK_WIDTH_MT_052               0
VERT_CLR_OVER_MT_053            0
VERT_CLR_UND_054B               0
LAT_UND_MT_055B                 0
LEFT_LAT_UND_MT_056             0
RECON_AGE     

## Impute Mean

In [26]:
# fill nans with mean value
df_bridges['PERCENT_ADT_TRUCK_109'].fillna(df_bridges['PERCENT_ADT_TRUCK_109'].mean(), inplace=True)

# Data Overview

In [27]:
df_bridges.describe()

Unnamed: 0,STATE_CODE_001,COUNTY_CODE_003,MIN_VERT_CLR_010,DETOUR_KILOS_019,AGE,TRAFFIC_LANES_ON_028A,TRAFFIC_LANES_UND_028B,ADT_029,APPR_WIDTH_MT_032,DEGREES_SKEW_034,...,RIGHT_CURB_MT_050B,ROADWAY_WIDTH_MT_051,DECK_WIDTH_MT_052,VERT_CLR_OVER_MT_053,VERT_CLR_UND_054B,LAT_UND_MT_055B,LEFT_LAT_UND_MT_056,PERCENT_ADT_TRUCK_109,SUFFICIENCY_RATING_feat_yr,SUFFICIENCY_RATING_tar_yr
count,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,...,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0,7803.0
mean,51.0,188.539408,98.583021,21.382801,43.743945,2.24875,1.016276,9229.727541,9.845713,14.428425,...,0.294874,10.237921,11.549443,98.713872,1.835967,1.621453,3.087569,4.286804,77.828899,76.094694
std,0.0,225.102586,11.558915,46.768112,20.697837,1.078095,2.105855,19413.242958,6.322974,18.913546,...,0.48228,6.055308,7.004536,10.877358,2.921786,5.734694,14.98896,5.160483,18.161447,19.252393
min,51.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,2.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,51.0,45.0,99.99,2.0,29.0,2.0,0.0,319.0,6.1,0.0,...,0.0,7.0,7.8,99.99,0.0,0.0,0.0,1.0,68.4,67.0
50%,51.0,117.0,99.99,6.0,41.0,2.0,0.0,1699.0,7.9,3.0,...,0.2,8.4,9.4,99.99,0.0,0.0,0.0,2.0,81.4,80.5
75%,51.0,177.0,99.99,14.0,57.0,2.0,1.0,9984.0,11.9,30.0,...,0.3,12.2,13.4,99.99,4.87,2.3,0.0,5.0,92.2,90.9
max,51.0,840.0,99.99,199.0,187.0,14.0,18.0,212073.0,146.3,99.0,...,4.4,69.9,153.3,99.99,30.17,99.9,99.9,98.0,100.0,100.0


# Export dataframe

In [28]:
# ID column for index
bridge_id = df_bridges['STATE_CODE_001'].astype(str) + \
'-' + df_bridges['STRUCTURE_NUMBER_008'].astype(str).apply(lambda x: x.strip())
# set index to unique structure id
df_bridges.set_index(bridge_id, inplace=True)
# drop columns used for id
df_bridges.drop(columns=['STATE_CODE_001', 'STRUCTURE_NUMBER_008'], inplace=True)
df_bridges.head()

Unnamed: 0,COUNTY_CODE_003,FEATURES_DESC_006A,FACILITY_CARRIED_007,LOCATION_009,MIN_VERT_CLR_010,DETOUR_KILOS_019,AGE,TRAFFIC_LANES_ON_028A,TRAFFIC_LANES_UND_028B,ADT_029,...,APPR_KIND_044A,APPR_TYPE_044B,DECK_STRUCTURE_TYPE_107,SURFACE_TYPE_108A,MEMBRANE_TYPE_108B,DECK_PROTECTION_108C,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060,SUFFICIENCY_RATING_tar_yr
51-1VA0158,99.0,'Gambo Creek ','Tisdale Rd ','1km N of Dahlgren Rd ',99.99,5.0,67.0,2.0,0,1650.0,...,0.0,0.0,1,0,0,0,5,4,4,13.0
51-1VA0575,740.0,'Dale St. & N&P RR ','Williams Avenue ','At Gate 36 and Elm Ave. ',99.99,1.0,65.0,2.0,2,5000.0,...,5.0,1.0,1,6,0,0,5,5,5,7.0
51-1VA0591,810.0,'Drainage Canal ','Golf Cart Path ','Near 9th Hole ',99.99,1.0,42.0,2.0,0,50.0,...,0.0,0.0,1,0,0,0,5,5,6,48.0
51-1VA2106,810.0,'Lake Whitehurst Outlet ','Guam Road ','1 KM NW of Ferry Rd ',99.99,5.0,53.0,2.0,0,500.0,...,0.0,0.0,1,0,0,0,5,6,5,79.5
51-1VA2107,810.0,'Eastern Shore RR ','Amphibious Drive ','0.2 KM W of Abbott Rd ',99.99,2.0,52.0,2.0,2,4000.0,...,3.0,2.0,1,0,0,0,5,5,4,63.4


In [29]:
df_bridges.to_csv('data/bridges.csv')