In [1]:
import pandas as pd
import numpy as np

In [50]:
df = pd.read_csv('data/DE17.txt')

In [3]:
df.head()

Unnamed: 0,STATE_CODE_001,STRUCTURE_NUMBER_008,RECORD_TYPE_005A,ROUTE_PREFIX_005B,SERVICE_LEVEL_005C,ROUTE_NUMBER_005D,DIRECTION_005E,HIGHWAY_DISTRICT_002,COUNTY_CODE_003,PLACE_CODE_004,...,DTL_TYPE_OF_IMP,SPECIAL_CODE,STEP_CODE,STATUS_WITH_10YR_RULE,SUFFICIENCY_ASTERC,SUFFICIENCY_RATING,STATUS_NO_10YR_RULE,CAT10,CAT23,CAT29
0,10,1001 279,1,4,1,279,0,1,3,34030,...,14.0,,,2,,69.2,2,F,6,322.26
1,10,1001A279,1,4,1,279,0,1,3,34030,...,,,,2,,77.0,2,F,6,64.6
2,10,1001B009,1,3,1,52,0,1,3,77580,...,11.0,,,0,,96.4,0,G,7,150.57
3,10,1002 232,1,4,1,232,0,1,3,61590,...,12.0,,,2,,59.6,2,F,5,576.15
4,10,1003 225,1,3,1,92,0,1,3,31950,...,,,,0,,95.4,0,F,6,846.56


# Approach
Predict structural evaluation value based on NBI parameters using linear regression.

# ID Parameters
- 008
    - struct number, 15-digit
- 007
    - facility carried, 18-digit
- 006A
    - feature intersected, 24-digit
- 016
    - latitude (XX deg XX min XX.XX sec), 8-digit
- 017
    - longitude (XXX deg XX min XX.XX sec), 9-digit
- 001
    - state code, 3-digit
- 003
    - county code, 3-digit
- 026
    - functional classification, 2-digit - rural/urban and use

# Independent variables
Filter dataset for desired features for analysis. Items:
- 021
    - maintenance responsibility - categorical
- 027
    - year, 4-digit - feature engineer age from year as numerical
- 029
    - ADT, 6-digit - use to feature engineer ADTT value
- 109
    - ADTT (%), 2-digit - feature engineer value as numerical
- 028A
    - lanes on struct, 2-digit - numerical
- 041
    - posting, 1-digit - categorical
- 043A
    - material, 1-digit - categorical
- 043B
    - design type, 1-digit - categorical
- 045
    - num spans, 3-digit - numerical
- 048
    - max span length (meters), 5-digit (1 decimal) - numerical
- 049
    - struct length (meters), 6-digit (1 decimal) - numerical
- 052
    - deck width (meters), 4-digit (1 decimal) - numerical
- 054B
    - min underclearance (meters), 4-digit (2 decimal) - numerical
- 061
    - channel, 1-digit - categorical
- 107
    - deck type, 1-digit - categorical
- 108A
    - wearing surface, 1-digit - categorical
- 108B
    - membrane, 1-digit - categorical
- 108C
    - deck protection, 1-digit - categorical

# Dependent variables
Bridge performance metrics. Items:
- 058
    - deck rating, 1-digit
- 059
    - super rating, 1-digit
- 060
    - sub rating, 1-digit

# EDA

## Filter Bridges
Filter observations for highway bridges. Items:
1. 043B
    - design type, 2-digits - filter out 19 (remove culverts)
2. 042A
    - type of service, 1-digit - filter out all except 1, 6, 7, 8 (keep highway, interchanges only)

In [4]:
df_filt = df[(df['SERVICE_ON_042A'] == 1) | (df['SERVICE_ON_042A'] == 6) | 
(df['SERVICE_ON_042A'] == 7) | (df['SERVICE_ON_042A'] == 8)]

In [5]:
df_filt = df_filt[df_filt['STRUCTURE_TYPE_043B'] != 19]

In [6]:
df_filt['DECK_COND_058'].unique()

array(['6', '7', '8', 'N', '5', '4', '9', '3'], dtype=object)

In [7]:
df_filt['DECK_COND_058'].value_counts()

7    244
6    143
5     44
8     25
N     12
9      5
4      4
3      1
Name: DECK_COND_058, dtype: int64

In [8]:
df_filt = df_filt[df_filt['DECK_COND_058'] != 'N']

In [9]:
df_filt['DECK_COND_058'].unique()

array(['6', '7', '8', '5', '4', '9', '3'], dtype=object)

In [10]:
df_filt['SUPERSTRUCTURE_COND_059'].unique()

array(['8', '7', '5', '6', '4', '9'], dtype=object)

In [11]:
df_filt['SUBSTRUCTURE_COND_060'].unique()

array(['6', '7', '5', '4', '9', '8'], dtype=object)

## Filter Features

In [12]:
col_id = [
    'STRUCTURE_NUMBER_008',
    'FACILITY_CARRIED_007',
    'FEATURES_DESC_006A',
    'LAT_016',
    'LONG_017',
    'STATE_CODE_001',
    'COUNTY_CODE_003',
    'FUNCTIONAL_CLASS_026'
]

In [13]:
col_indep = [
    'MAINTENANCE_021',
    'YEAR_BUILT_027',
    'ADT_029',
    'PERCENT_ADT_TRUCK_109',
    'TRAFFIC_LANES_ON_028A',
    'OPEN_CLOSED_POSTED_041',
    'STRUCTURE_KIND_043A', 
    'STRUCTURE_TYPE_043B',
    'MAIN_UNIT_SPANS_045',
    'MAX_SPAN_LEN_MT_048',
    'STRUCTURE_LEN_MT_049',
    'DECK_WIDTH_MT_052',
    'VERT_CLR_UND_054B',
    'CHANNEL_COND_061',
    'DECK_STRUCTURE_TYPE_107', 
    'SURFACE_TYPE_108A', 
    'MEMBRANE_TYPE_108B',
    'DECK_PROTECTION_108C'
]

In [14]:
col_dep = [
    'DECK_COND_058', 
    'SUPERSTRUCTURE_COND_059', 
    'SUBSTRUCTURE_COND_060'
]

In [15]:
cols = col_id + col_indep + col_dep

In [16]:
df_filt = df_filt[cols]

In [17]:
df_filt.shape

(466, 29)

In [18]:
df_filt.head()

Unnamed: 0,STRUCTURE_NUMBER_008,FACILITY_CARRIED_007,FEATURES_DESC_006A,LAT_016,LONG_017,STATE_CODE_001,COUNTY_CODE_003,FUNCTIONAL_CLASS_026,MAINTENANCE_021,YEAR_BUILT_027,...,DECK_WIDTH_MT_052,VERT_CLR_UND_054B,CHANNEL_COND_061,DECK_STRUCTURE_TYPE_107,SURFACE_TYPE_108A,MEMBRANE_TYPE_108B,DECK_PROTECTION_108C,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060
4,1003 225,'THOMPSON BRIDGE RD','BRANDYWINE CREEK ',39490118,75341160,10,3,7,1,1990,...,14.3,0.0,6,1,1,0,1,6,8,6
5,1008 221,'BRANDYWINE CREEK R','TRIB. TO BRANDYWINE ',39500587,75343303,10,3,9,1,1960,...,9.1,0.0,6,1,6,0,0,7,7,7
6,1009 221,'SMITH BRIDGE RD ','BRANDYWINE CREEK ',39501584,75344477,10,3,19,1,1839,...,5.2,0.0,7,8,7,0,N,7,5,7
13,1020 232,'ROCKLAND RD ','HUSBANDS RUN ',39471013,75334227,10,3,17,1,1932,...,8.3,0.0,7,1,6,0,1,7,7,5
14,1023A231,'GUYENCOURT RD. ','READING RR ',39485945,75350660,10,3,9,1,1918,...,6.3,6.59,N,8,7,0,0,7,7,5


# Finalize Model Data

## Numerical Values

In [19]:
col_num = [
    'LAT_016',
    'LONG_017',
    'YEAR_BUILT_027',
    'ADT_029',
    'PERCENT_ADT_TRUCK_109',
    'TRAFFIC_LANES_ON_028A',
    'MAIN_UNIT_SPANS_045',
    'MAX_SPAN_LEN_MT_048',
    'STRUCTURE_LEN_MT_049',
    'DECK_WIDTH_MT_052',
    'VERT_CLR_UND_054B',
    'DECK_COND_058', 
    'SUPERSTRUCTURE_COND_059', 
    'SUBSTRUCTURE_COND_060'
]

In [23]:
for col in col_num:
    print(df_filt[col].dtype)

int64
int64
int64
int64
float64
int64
int64
float64
float64
float64
float64
int64
int64
int64


In [22]:
# convert object types to int
df_filt['DECK_COND_058'] = pd.to_numeric(df_filt['DECK_COND_058'])
df_filt['SUPERSTRUCTURE_COND_059'] = pd.to_numeric(df_filt['SUPERSTRUCTURE_COND_059'])
df_filt['SUBSTRUCTURE_COND_060'] = pd.to_numeric(df_filt['SUBSTRUCTURE_COND_060'])

## Create ID, X and y dataframes

In [58]:
df_filt.head()

Unnamed: 0,STRUCTURE_NUMBER_008,FACILITY_CARRIED_007,FEATURES_DESC_006A,LAT_016,LONG_017,STATE_CODE_001,COUNTY_CODE_003,FUNCTIONAL_CLASS_026,MAINTENANCE_021,YEAR_BUILT_027,...,DECK_WIDTH_MT_052,VERT_CLR_UND_054B,CHANNEL_COND_061,DECK_STRUCTURE_TYPE_107,SURFACE_TYPE_108A,MEMBRANE_TYPE_108B,DECK_PROTECTION_108C,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060
4,1003 225,'THOMPSON BRIDGE RD','BRANDYWINE CREEK ',39490118,75341160,10,3,7,1,1990,...,14.3,0.0,6,1,1,0,1,6,8,6
5,1008 221,'BRANDYWINE CREEK R','TRIB. TO BRANDYWINE ',39500587,75343303,10,3,9,1,1960,...,9.1,0.0,6,1,6,0,0,7,7,7
6,1009 221,'SMITH BRIDGE RD ','BRANDYWINE CREEK ',39501584,75344477,10,3,19,1,1839,...,5.2,0.0,7,8,7,0,N,7,5,7
13,1020 232,'ROCKLAND RD ','HUSBANDS RUN ',39471013,75334227,10,3,17,1,1932,...,8.3,0.0,7,1,6,0,1,7,7,5
14,1023A231,'GUYENCOURT RD. ','READING RR ',39485945,75350660,10,3,9,1,1918,...,6.3,6.59,N,8,7,0,0,7,7,5


In [60]:
# define values to identify each observation
ID = df_filt[[
    'STRUCTURE_NUMBER_008',
    'FACILITY_CARRIED_007',
    'FEATURES_DESC_006A',
    'LAT_016',
    'LONG_017',
    'STATE_CODE_001',
    'COUNTY_CODE_003',
    'FUNCTIONAL_CLASS_026'
]]

In [84]:
# define parameters for feature matrix
X_init = df_filt[[
    'YEAR_BUILT_027',
    'ADT_029',
    'PERCENT_ADT_TRUCK_109',
    'TRAFFIC_LANES_ON_028A',
    'MAIN_UNIT_SPANS_045',
    'MAX_SPAN_LEN_MT_048',
    'STRUCTURE_LEN_MT_049',
    'DECK_WIDTH_MT_052',
    'VERT_CLR_UND_054B',
]]

In [62]:
# define array of structure condition rating for target value
y = df_filt['DECK_COND_058']

## Feature Engineering


### Age

In [63]:
year_base = 2018

In [85]:
X_init['YEAR_BUILT_027'] = year_base - X_init['YEAR_BUILT_027']
X_init.rename(columns={'YEAR_BUILT_027': 'AGE'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


### ADTT

In [86]:
X_init['PERCENT_ADT_TRUCK_109'] = X_init['PERCENT_ADT_TRUCK_109'] * X_init['ADT_029']
X_init.rename(columns={'PERCENT_ADT_TRUCK_109': 'ADTT'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


### Location

## Categorical Values

- 021
    - maintenance responsibility - categorical
- 041
    - posting, 1-digit - categorical
- 043A
    - material, 1-digit - categorical
- 043B
    - design type, 1-digit - categorical
- 061
    - channel, 1-digit - categorical
- 107
    - deck type, 1-digit - categorical
- 108A
    - wearing surface, 1-digit - categorical
- 108B
    - membrane, 1-digit - categorical
- 108C
    - deck protection, 1-digit - categorical

In [41]:
col_cat = [
    'MAINTENANCE_021',
    'OPEN_CLOSED_POSTED_041',
    'STRUCTURE_KIND_043A', 
    'STRUCTURE_TYPE_043B',
    'CHANNEL_COND_061',
    'DECK_STRUCTURE_TYPE_107', 
    'SURFACE_TYPE_108A', 
    'MEMBRANE_TYPE_108B',
    'DECK_PROTECTION_108C'
]

In [87]:
for col in col_cat:
    print(col)
    print(df_filt[col].unique())

MAINTENANCE_021
[ 1 11  4 32 70]
OPEN_CLOSED_POSTED_041
['A' 'P' 'K']
STRUCTURE_KIND_043A
[5 3 4 1 7 6 2 0]
STRUCTURE_TYPE_043B
[ 5  2  1  6  3 10  4 13 15 14 17 12]
CHANNEL_COND_061
['6' '7' 'N' '8' '4' '5' '9']
DECK_STRUCTURE_TYPE_107
['1' '8' '9' '2' 'N' '3']
SURFACE_TYPE_108A
['1' '6' '7' '0' '5' '3' '2' '9' 'N']
MEMBRANE_TYPE_108B
['0' '9' '3' 'N' '1' '2']
DECK_PROTECTION_108C
['1' '0' 'N' '3' '9' '2']


In [88]:
df_cat = df_filt[col_cat]

In [94]:
df_cat.head()

Unnamed: 0,MAINTENANCE_021,OPEN_CLOSED_POSTED_041,STRUCTURE_KIND_043A,STRUCTURE_TYPE_043B,CHANNEL_COND_061,DECK_STRUCTURE_TYPE_107,SURFACE_TYPE_108A,MEMBRANE_TYPE_108B,DECK_PROTECTION_108C
4,1,A,5,5,6,1,1,0,1
5,1,A,3,2,6,1,6,0,0
6,1,A,4,2,7,8,7,0,N
13,1,A,1,1,7,1,6,0,1
14,1,P,7,2,N,8,7,0,0


In [126]:
for col in col_cat:
    cat_df = pd.get_dummies(df_cat[col], prefix=col)
    X_init = X_init.join(cat_df)

In [127]:
X_init.head()

Unnamed: 0,AGE,ADT_029,ADTT,TRAFFIC_LANES_ON_028A,MAIN_UNIT_SPANS_045,MAX_SPAN_LEN_MT_048,STRUCTURE_LEN_MT_049,DECK_WIDTH_MT_052,VERT_CLR_UND_054B,MAINTENANCE_021_1,...,MEMBRANE_TYPE_108B_2,MEMBRANE_TYPE_108B_3,MEMBRANE_TYPE_108B_9,MEMBRANE_TYPE_108B_N,DECK_PROTECTION_108C_0,DECK_PROTECTION_108C_1,DECK_PROTECTION_108C_2,DECK_PROTECTION_108C_3,DECK_PROTECTION_108C_9,DECK_PROTECTION_108C_N
4,28,6814,34070.0,2,2,28.4,59.2,14.3,0.0,1,...,0,0,0,0,0,1,0,0,0,0
5,58,2473,22257.0,2,1,12.5,13.3,9.1,0.0,1,...,0,0,0,0,1,0,0,0,0,0
6,179,2029,22319.0,1,3,13.7,43.6,5.2,0.0,1,...,0,0,0,0,0,0,0,0,0,1
13,86,3712,22272.0,2,1,7.7,8.4,8.3,0.0,1,...,0,0,0,0,0,1,0,0,0,0
14,100,364,4368.0,2,3,7.3,20.4,6.3,6.59,1,...,0,0,0,0,1,0,0,0,0,0


## Scaling