In [1]:
import pandas as pd
import numpy as np

# Read in Data

In [2]:
# specify states for bridges
states = ['DE']
# specify years for analysis
years = range(2008, 2018)
# specify rating threshold
rating_threshold = 7

In [3]:
# specify filenames to read in
file_state_years = [
    '10fluna_992008-20160912090515.txt',
    '10fluna_992009-20160913064846.txt',
    'DE10.txt',
    'DE11.txt',
    'DE2012.txt',
    'DE13.txt',
    'DE14.txt',
    'DE15.txt',
    'DE16.TXT',
    'DE17.txt'
]

# Build Condition Rating Dataframe

In [4]:
extract = [
    'STRUCTURE_NUMBER_008',
    'STATE_CODE_001',
    'DECK_COND_058'
]

In [5]:
# define for loop to read data for all years
df = {}

for year, file in zip(years, file_state_years):
    df[year] = pd.read_csv('data/nbi/' + str(year) + 'del/' + file)
    df[year] = df[year][extract]

In [6]:
# initialize dataframe for ratings by year
df_ratings = df[year][['STRUCTURE_NUMBER_008', 'STATE_CODE_001']]

In [7]:
# input ratings for each year
for year in years:
    df_ratings[year] = df[year]['DECK_COND_058']

In [41]:
# find bridges not in another year
df[years[5]]['STRUCTURE_NUMBER_008'][~df[years[5]]['STRUCTURE_NUMBER_008'].isin(df[years[0]]['STRUCTURE_NUMBER_008'])]

50           1096 243 
51           1097 243 
127          1246 347 
141          1267 026 
187          1325 397 
210          1367 000 
249          1465 000 
268          1536 231 
280          1572 211 
300          1623 000 
331          1683 029 
373          1735I6121
485          1912N083 
486          1912S083 
505          2008E010A
516          2013 000 
522          2017B017 
578          2130A140 
615          2266A266 
706          3142 571 
759          3344 062 
764          3351 072 
766          3359 446 
776          3423 366 
798          3524 083 
830          3706 000 
854          3930 225 
863    1902 082       
Name: STRUCTURE_NUMBER_008, dtype: object

In [78]:
(df_ratings.iloc[:, 
                            -len(years):][2008].apply(pd.to_numeric, 
                              errors='coerce') >= rating_threshold).astype(int)

0      0
1      0
2      0
3      1
4      1
5      1
6      1
7      0
8      0
9      0
10     0
11     0
12     0
13     1
14     0
15     0
16     1
17     0
18     0
19     1
20     0
21     0
22     0
23     0
24     0
25     0
26     1
27     0
28     1
29     1
      ..
849    1
850    1
851    0
852    0
853    0
854    1
855    0
856    0
857    1
858    0
859    0
860    0
861    0
862    0
863    0
864    0
865    0
866    0
867    0
868    0
869    0
870    0
871    0
872    0
873    0
874    0
875    0
876    0
877    0
878    0
Name: 2008, Length: 879, dtype: int64

In [9]:
# create dataframe of 1's and 0's dependent on values above or below condition rating threshold

df_change_year = pd.DataFrame(columns = years)

for year in years:
    df_change_year[year] = (df_ratings.iloc[:, 
                            -len(years):][year].apply(pd.to_numeric, 
                              errors='coerce') >= rating_threshold).astype(int)

# take difference for adjacent years to show when rating drops below threshold (-1)
df_change_year = df_change_year.diff(axis=1)
df_change_year['change_yr'] = ""

In [10]:
# fill column with year that rating first fell belwo threshold
for year in years: 
    df_change_year['change_yr'] = np.where(df_change_year['change_yr'] == "", 
                                           np.where(df_change_year[year] == -1, year, ""), 
                                           df_change_year['change_yr'])  

In [11]:
df_change_year

Unnamed: 0,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,change_yr
0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,2011
5,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
6,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
7,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
8,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
9,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


# ID Parameters
- 008
    - struct number, 15-digit
- 007
    - facility carried, 18-digit
- 006A
    - feature intersected, 24-digit
- 016
    - latitude (XX deg XX min XX.XX sec), 8-digit
- 017
    - longitude (XXX deg XX min XX.XX sec), 9-digit
- 001
    - state code, 3-digit
- 003
    - county code, 3-digit
- 026
    - functional classification, 2-digit - rural/urban and use

# Independent variables
Filter dataset for desired features for analysis. Items:
- 021
    - maintenance responsibility - categorical
- 027
    - year, 4-digit - feature engineer age from year as numerical
- 029
    - ADT, 6-digit - use to feature engineer ADTT value
- 109
    - ADTT (%), 2-digit - feature engineer value as numerical
- 028A
    - lanes on struct, 2-digit - numerical
- 041
    - posting, 1-digit - categorical
- 043A
    - material, 1-digit - categorical
- 043B
    - design type, 1-digit - categorical
- 045
    - num spans, 3-digit - numerical
- 048
    - max span length (meters), 5-digit (1 decimal) - numerical
- 049
    - struct length (meters), 6-digit (1 decimal) - numerical
- 052
    - deck width (meters), 4-digit (1 decimal) - numerical
- 054B
    - min underclearance (meters), 4-digit (2 decimal) - numerical
- 061
    - channel, 1-digit - categorical
- 107
    - deck type, 1-digit - categorical
- 108A
    - wearing surface, 1-digit - categorical
- 108B
    - membrane, 1-digit - categorical
- 108C
    - deck protection, 1-digit - categorical

# Dependent variables
Bridge performance metrics. Items:
- 058
    - deck rating, 1-digit
- 059
    - super rating, 1-digit
- 060
    - sub rating, 1-digit

# Data Filtering

In [16]:
df[2008].shape

(1150, 3)

## Structure Types

- 042A
    - type of service, 1-digit  
    
Filter to return observations that are highway or interchange structures (codes = 1, 6, 7 and 8)

In [5]:
df_filt = df[(df['SERVICE_ON_042A'] == 1) | (df['SERVICE_ON_042A'] == 6) | 
(df['SERVICE_ON_042A'] == 7) | (df['SERVICE_ON_042A'] == 8)]

- 043B
    - design type, 2-digit  

Filter out all observations that are culverts (code = 19)

In [10]:
df_filt = df_filt[df_filt['STRUCTURE_TYPE_043B'] != 19]

In [11]:
df_filt.shape

(478, 137)

## Condition Ratings
Remove observations with no structural condition ratings (code = 'N')

In [8]:
# check deck ratings
df_filt['DECK_COND_058'].unique()

array(['6', '7', '8', 'N', '5', '4', '9', '3'], dtype=object)

In [9]:
df_filt['DECK_COND_058'].value_counts()

7    244
6    143
5     44
8     25
N     12
9      5
4      4
3      1
Name: DECK_COND_058, dtype: int64

In [10]:
df_filt = df_filt[df_filt['DECK_COND_058'] != 'N']

In [11]:
df_filt['DECK_COND_058'].unique()

array(['6', '7', '8', '5', '4', '9', '3'], dtype=object)

In [12]:
# check superstructure ratings
df_filt['SUPERSTRUCTURE_COND_059'].unique()

array(['8', '7', '5', '6', '4', '9'], dtype=object)

In [13]:
# check substructure ratings
df_filt['SUBSTRUCTURE_COND_060'].unique()

array(['6', '7', '5', '4', '9', '8'], dtype=object)

## Features
Return only desired features from dataset

In [14]:
col_id = [
    'STRUCTURE_NUMBER_008',
    'FACILITY_CARRIED_007',
    'FEATURES_DESC_006A',
    'LAT_016',
    'LONG_017',
    'STATE_CODE_001',
    'COUNTY_CODE_003',
    'FUNCTIONAL_CLASS_026'
]

In [15]:
col_indep = [
    'MAINTENANCE_021',
    'YEAR_BUILT_027',
    'ADT_029',
    'PERCENT_ADT_TRUCK_109',
    'TRAFFIC_LANES_ON_028A',
    'OPEN_CLOSED_POSTED_041',
    'STRUCTURE_KIND_043A', 
    'STRUCTURE_TYPE_043B',
    'MAIN_UNIT_SPANS_045',
    'MAX_SPAN_LEN_MT_048',
    'STRUCTURE_LEN_MT_049',
    'DECK_WIDTH_MT_052',
    'VERT_CLR_UND_054B',
    'CHANNEL_COND_061',
    'YEAR_RECONSTRUCTED_106',
    'DECK_STRUCTURE_TYPE_107', 
    'SURFACE_TYPE_108A', 
    'MEMBRANE_TYPE_108B',
    'DECK_PROTECTION_108C'
]

In [16]:
col_dep = [
    'DECK_COND_058', 
    'SUPERSTRUCTURE_COND_059', 
    'SUBSTRUCTURE_COND_060'
]

In [17]:
cols = col_id + col_indep + col_dep
df_filt = df_filt[cols]

In [18]:
df_filt.shape

(466, 30)

## NaN Values

In [19]:
# return column and number of NaN values
for col in df_filt.columns:
    if df_filt[col].isnull().values.any() == True:
        print(col)
        print(sum(df_filt[col].isnull()))
    else:
        pass

PERCENT_ADT_TRUCK_109
1


In [20]:
# remove any observations with NaN values
df_filt = df_filt.dropna()

In [21]:
df_filt.shape

(465, 30)

In [28]:
df_filt.head()

Unnamed: 0,STRUCTURE_NUMBER_008,FACILITY_CARRIED_007,FEATURES_DESC_006A,LAT_016,LONG_017,STATE_CODE_001,COUNTY_CODE_003,FUNCTIONAL_CLASS_026,MAINTENANCE_021,YEAR_BUILT_027,...,VERT_CLR_UND_054B,CHANNEL_COND_061,YEAR_RECONSTRUCTED_106,DECK_STRUCTURE_TYPE_107,SURFACE_TYPE_108A,MEMBRANE_TYPE_108B,DECK_PROTECTION_108C,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060
4,1003 225,'THOMPSON BRIDGE RD','BRANDYWINE CREEK ',39490118,75341160,10,3,7,1,1990,...,0.0,6,0,1,1,0,1,6,8,6
5,1008 221,'BRANDYWINE CREEK R','TRIB. TO BRANDYWINE ',39500587,75343303,10,3,9,1,1960,...,0.0,6,0,1,6,0,0,7,7,7
6,1009 221,'SMITH BRIDGE RD ','BRANDYWINE CREEK ',39501584,75344477,10,3,19,1,1839,...,0.0,7,2002,8,7,0,N,7,5,7
13,1020 232,'ROCKLAND RD ','HUSBANDS RUN ',39471013,75334227,10,3,17,1,1932,...,0.0,7,1993,1,6,0,1,7,7,5
14,1023A231,'GUYENCOURT RD. ','READING RR ',39485945,75350660,10,3,9,1,1918,...,6.59,N,1982,8,7,0,0,7,7,5


# Finalize Data

In [23]:
df_fin = df_filt.copy()

## Create ID

In [25]:
# define identification tag for each structure
ID = df_fin['STATE_CODE_001'].astype(str) + "-" + df_fin['STRUCTURE_NUMBER_008'].apply(lambda x: x.strip())

In [26]:
df_fin.insert(0, 'ID', ID)

## Numeric Datatypes

In [29]:
col_num = [
    'LAT_016',
    'LONG_017',
    'YEAR_BUILT_027',
    'YEAR_RECONSTRUCTED_106',
    'ADT_029',
    'PERCENT_ADT_TRUCK_109',
    'TRAFFIC_LANES_ON_028A',
    'MAIN_UNIT_SPANS_045',
    'MAX_SPAN_LEN_MT_048',
    'STRUCTURE_LEN_MT_049',
    'DECK_WIDTH_MT_052',
    'VERT_CLR_UND_054B',
    'DECK_COND_058', 
    'SUPERSTRUCTURE_COND_059', 
    'SUBSTRUCTURE_COND_060'
]

In [30]:
for col in col_num:
    print(df_fin[col].dtype)

int64
int64
int64
int64
int64
float64
int64
int64
float64
float64
float64
float64
object
object
object


In [31]:
# convert object types to int
df_fin['DECK_COND_058'] = pd.to_numeric(df_fin['DECK_COND_058'])
df_fin['SUPERSTRUCTURE_COND_059'] = pd.to_numeric(df_fin['SUPERSTRUCTURE_COND_059'])
df_fin['SUBSTRUCTURE_COND_060'] = pd.to_numeric(df_fin['SUBSTRUCTURE_COND_060'])

## Feature Engineering

### Age and Reconstruction
Replace year built and year reconstructed with years since

In [32]:
year_base = 2017

In [51]:
df_fin['YEAR_BUILT_027'] = year_base - df_fin['YEAR_BUILT_027']
df_fin.rename(columns={'YEAR_BUILT_027': 'AGE'}, inplace=True)

In [54]:
df_fin['YEARS_SINCE_RECON'] = df_fin.apply(lambda x: x['AGE'] 
             if x['YEAR_RECONSTRUCTED_106'] == 0 else year_base - x['YEAR_RECONSTRUCTED_106'], axis=1);

### ADTT
Replace percentage of ADT trucks with ADTT value

In [55]:
df_fin['PERCENT_ADT_TRUCK_109'] = df_fin['PERCENT_ADT_TRUCK_109'] * df_fin['ADT_029']
df_fin.rename(columns={'PERCENT_ADT_TRUCK_109': 'ADTT'}, inplace=True)

### Location

In [29]:
# may use later
# calc distance using Lat and Long values

## Define numerical feature dataframe

In [56]:
df_num = df_fin[[
    'ID',
    'AGE',
    'YEARS_SINCE_RECON',
    'ADT_029',
    'ADTT',
    'TRAFFIC_LANES_ON_028A',
    'MAIN_UNIT_SPANS_045',
    'MAX_SPAN_LEN_MT_048',
    'STRUCTURE_LEN_MT_049',
    'DECK_WIDTH_MT_052',
    'VERT_CLR_UND_054B'
]]

In [57]:
df_num.shape

(465, 11)

In [58]:
df_num.head()

Unnamed: 0,ID,AGE,YEARS_SINCE_RECON,ADT_029,ADTT,TRAFFIC_LANES_ON_028A,MAIN_UNIT_SPANS_045,MAX_SPAN_LEN_MT_048,STRUCTURE_LEN_MT_049,DECK_WIDTH_MT_052,VERT_CLR_UND_054B
4,10-1003 225,28,28,6814,34070.0,2,2,28.4,59.2,14.3,0.0
5,10-1008 221,58,58,2473,22257.0,2,1,12.5,13.3,9.1,0.0
6,10-1009 221,179,16,2029,22319.0,1,3,13.7,43.6,5.2,0.0
13,10-1020 232,86,25,3712,22272.0,2,1,7.7,8.4,8.3,0.0
14,10-1023A231,100,36,364,4368.0,2,3,7.3,20.4,6.3,6.59


## Categorical Values

In [59]:
col_cat = [
    'MAINTENANCE_021',
    'OPEN_CLOSED_POSTED_041',
    'STRUCTURE_KIND_043A', 
    'STRUCTURE_TYPE_043B',
    'CHANNEL_COND_061',
    'DECK_STRUCTURE_TYPE_107', 
    'SURFACE_TYPE_108A', 
    'MEMBRANE_TYPE_108B',
    'DECK_PROTECTION_108C'
]

In [60]:
# examine categorical values
for col in col_cat:
    print(col)
    print(df_fin[col].unique())

MAINTENANCE_021
[ 1 11  4 32 70]
OPEN_CLOSED_POSTED_041
['A' 'P' 'K']
STRUCTURE_KIND_043A
[5 3 4 1 7 6 2 0]
STRUCTURE_TYPE_043B
[ 5  2  1  6  3 10  4 13 15 14 17 12]
CHANNEL_COND_061
['6' '7' 'N' '8' '4' '5' '9']
DECK_STRUCTURE_TYPE_107
['1' '8' '9' '2' 'N' '3']
SURFACE_TYPE_108A
['1' '6' '7' '0' '5' '3' '2' '9' 'N']
MEMBRANE_TYPE_108B
['0' '9' '3' 'N' '1' '2']
DECK_PROTECTION_108C
['1' '0' 'N' '3' '9' '2']


In [61]:
df_cat = df_fin[col_cat]

In [62]:
df_cat.shape

(465, 9)

In [63]:
df_cat.head()

Unnamed: 0,MAINTENANCE_021,OPEN_CLOSED_POSTED_041,STRUCTURE_KIND_043A,STRUCTURE_TYPE_043B,CHANNEL_COND_061,DECK_STRUCTURE_TYPE_107,SURFACE_TYPE_108A,MEMBRANE_TYPE_108B,DECK_PROTECTION_108C
4,1,A,5,5,6,1,1,0,1
5,1,A,3,2,6,1,6,0,0
6,1,A,4,2,7,8,7,0,N
13,1,A,1,1,7,1,6,0,1
14,1,P,7,2,N,8,7,0,0


## Define categorical feature dataframe

In [64]:
# create sparse matrix for categorical values and add to feature matrix
for col in col_cat:
    cat_df = pd.get_dummies(df_cat[col], prefix=col)
    df_cat = df_cat.join(cat_df)

df_cat = df_cat.iloc[:, 9:]

In [65]:
df_cat.insert(0, 'ID', ID)

In [66]:
df_cat.head()

Unnamed: 0,ID,MAINTENANCE_021_1,MAINTENANCE_021_4,MAINTENANCE_021_11,MAINTENANCE_021_32,MAINTENANCE_021_70,OPEN_CLOSED_POSTED_041_A,OPEN_CLOSED_POSTED_041_K,OPEN_CLOSED_POSTED_041_P,STRUCTURE_KIND_043A_0,...,MEMBRANE_TYPE_108B_2,MEMBRANE_TYPE_108B_3,MEMBRANE_TYPE_108B_9,MEMBRANE_TYPE_108B_N,DECK_PROTECTION_108C_0,DECK_PROTECTION_108C_1,DECK_PROTECTION_108C_2,DECK_PROTECTION_108C_3,DECK_PROTECTION_108C_9,DECK_PROTECTION_108C_N
4,10-1003 225,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,10-1008 221,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,10-1009 221,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
13,10-1020 232,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
14,10-1023A231,1,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


## Define target feature dataframe

In [67]:
Y = df_fin[[    
    'DECK_COND_058', 
    'SUPERSTRUCTURE_COND_059', 
    'SUBSTRUCTURE_COND_060'
]]

In [68]:
Y.insert(0, 'ID', ID)

In [69]:
Y.head()

Unnamed: 0,ID,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060
4,10-1003 225,6,8,6
5,10-1008 221,7,7,7
6,10-1009 221,7,5,7
13,10-1020 232,7,7,5
14,10-1023A231,7,7,5


# Output data to file

In [70]:
# ouput numeric data
df_num.to_csv('data/data_numeric.csv', index=False)

In [71]:
# output categorical data
df_cat.to_csv('data/data_categorical.csv', index=False)

In [72]:
# output target varialbes
Y.to_csv('data/data_targets.csv', index=False)