# Summary
- Target categories:
    - Good (suff. rating >= 80%)
    - Fair (suff. rating < 80% and >= 50%)
    - Poor (suff. rating < 50%)

# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## sklearn

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
# tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# model save
from sklearn.externals import joblib

## other

In [3]:
# upsampling with SMOTE
from imblearn import over_sampling
from imblearn.pipeline import Pipeline as imbPipeline

# Plot Settings

In [4]:
# format style
plt.style.use('fivethirtyeight')
# lineweight
plt.rc('lines', linewidth=3)
# figure size
plt.rc('figure', figsize=(12, 7))
# title fontsize
plt.rc('axes', titlesize=33) 
# axes label fontsize
plt.rc('axes', labelsize=28)
# axes values fontsize
plt.rc('xtick', labelsize=18)
plt.rc('ytick', labelsize=18)
# legend fontsize
plt.rc('legend', fontsize=18)

# Custom Functions

In [5]:
def rat_conv(rating):
    '''
    assign sufficiency rating to category
    '''
    if rating < 50:
        return('poor')
    else:
        return ('not poor')

## Feature Weight Sorting

In [6]:
def feat_sort(values, labels, num_ret='all', ret_abs=False, ret_pct=False, ret_0=False):
    '''
    Return sorted (descending) feature weights
    
    Parameters
    ----------
    values : feature weight values from analysis
    labels : names of each feature
    num_ret : number of top features to return
    ret_abs : return absolute value of weight values (will track original sign)
    ret_pct : return each weight as percentage of total (based on absolute value)
    ret_0 : return weights with value of zero
    '''
    
    # create feature weight dataframe
    df = pd.DataFrame(values, index=labels, columns=['feat_wgt']).copy()
        
    # ret_abs check
    if ret_abs == True:
        # create column identifying positive weights
        df['positive'] = df['feat_wgt'] > 0
        # transform weights to absolute value
        df['feat_wgt'] = df['feat_wgt'].apply(abs)
        
    # sort weights (largest to smallest)
    df.sort_values(by='feat_wgt', ascending=False, inplace=True)
    
    # ret_pct check (rounded to whole number)
    if ret_pct == True:
        if ret_abs == False:
            df['feat_wgt'] = df['feat_wgt'].apply(abs)
            df.sort_values(by='feat_wgt', ascending=False, inplace=True)
        # transform weights to percentages
        df['feat_wgt'] = round(df['feat_wgt'] / df['feat_wgt'].sum() * 100, 0).astype(int)
        
    # ret_0 check
    if ret_0 == False:
        # drop weights == 0
        df = df[df['feat_wgt'] != 0]
        
    # return dataframe
    if num_ret == 'all':
        return(df)
    else:
        return(df.iloc[:num_ret, :])

# Data

In [7]:
df_id = pd.read_csv('data/bridges_id.csv', index_col=0)
df_id.head()

Unnamed: 0,COUNTY_CODE_003,FEATURES_DESC_006A,FACILITY_CARRIED_007,LOCATION_009
51-1VA0158,99.0,'Gambo Creek ','Tisdale Rd ','1km N of Dahlgren Rd '
51-1VA0575,740.0,'Dale St. & N&P RR ','Williams Avenue ','At Gate 36 and Elm Ave. '
51-1VA0591,810.0,'Drainage Canal ','Golf Cart Path ','Near 9th Hole '
51-1VA2106,810.0,'Lake Whitehurst Outlet ','Guam Road ','1 KM NW of Ferry Rd '
51-1VA2107,810.0,'Eastern Shore RR ','Amphibious Drive ','0.2 KM W of Abbott Rd '


In [8]:
df_num = pd.read_csv('data/bridges_num.csv', index_col=0)
df_num.head()

Unnamed: 0,MIN_VERT_CLR_010,DETOUR_KILOS_019,AGE,TRAFFIC_LANES_ON_028A,TRAFFIC_LANES_UND_028B,ADT_029,APPR_WIDTH_MT_032,DEGREES_SKEW_034,NAV_VERT_CLR_MT_039,NAV_HORR_CLR_MT_040,...,RIGHT_CURB_MT_050B,ROADWAY_WIDTH_MT_051,DECK_WIDTH_MT_052,VERT_CLR_OVER_MT_053,VERT_CLR_UND_054B,LAT_UND_MT_055B,LEFT_LAT_UND_MT_056,RECON_AGE,PERCENT_ADT_TRUCK_109,SUFFICIENCY_RATING_feat_yr
51-1VA0158,99.99,5.0,67.0,2.0,0,1650.0,7.3,0.0,0.0,0.0,...,0.3,5.5,6.0,99.99,0.0,0.0,0.0,20.0,5.0,13.5
51-1VA0575,99.99,1.0,65.0,2.0,2,5000.0,8.1,0.0,0.0,0.0,...,1.6,7.9,10.4,99.99,6.53,0.9,1.5,18.0,5.0,30.9
51-1VA0591,99.99,1.0,42.0,2.0,0,50.0,6.1,0.0,0.0,0.0,...,0.2,5.7,6.1,99.99,0.0,0.0,0.0,42.0,0.0,46.2
51-1VA2106,99.99,5.0,53.0,2.0,0,500.0,9.1,9.0,0.0,0.0,...,0.0,7.3,9.0,99.99,0.0,0.0,0.0,53.0,5.0,68.8
51-1VA2107,99.99,2.0,52.0,2.0,2,4000.0,7.9,0.0,0.0,0.0,...,0.6,7.9,10.2,99.99,6.98,7.4,6.9,52.0,5.0,48.9


In [9]:
df_cat = pd.read_csv('data/bridges_cat.csv', index_col=0)
df_cat.head()

Unnamed: 0,TOLL_020,MAINTENANCE_021,FUNCTIONAL_CLASS_026,DESIGN_LOAD_031,MEDIAN_CODE_033,STRUCTURE_FLARED_035,RAILINGS_036A,TRANSITIONS_036B,APPR_RAIL_036C,APPR_RAIL_END_036D,...,STRUCTURE_TYPE_043B,APPR_KIND_044A,APPR_TYPE_044B,DECK_STRUCTURE_TYPE_107,SURFACE_TYPE_108A,MEMBRANE_TYPE_108B,DECK_PROTECTION_108C,DECK_COND_058,SUPERSTRUCTURE_COND_059,SUBSTRUCTURE_COND_060
51-1VA0158,3.0,73.0,9.0,8.0,0.0,0.0,0,0,0,0,...,4,0.0,0.0,1,0,0,0,5,4,4
51-1VA0575,3.0,73.0,19.0,0.0,0.0,0.0,1,1,1,1,...,4,5.0,1.0,1,6,0,0,5,5,5
51-1VA0591,3.0,73.0,19.0,0.0,0.0,0.0,0,0,0,0,...,9,0.0,0.0,1,0,0,0,5,5,6
51-1VA2106,3.0,73.0,19.0,5.0,0.0,0.0,0,0,0,0,...,2,0.0,0.0,1,0,0,0,5,6,5
51-1VA2107,3.0,73.0,19.0,6.0,0.0,0.0,0,0,0,0,...,2,3.0,2.0,1,0,0,0,5,5,4


In [10]:
tar = pd.read_csv('data/bridges_tar.csv', index_col=0, header=None, names=['suff_rating'])
tar.head()

Unnamed: 0,suff_rating
51-1VA0158,13.0
51-1VA0575,7.0
51-1VA0591,48.0
51-1VA2106,79.5
51-1VA2107,63.4


In [11]:
# number of observations
len(tar)

7803

# Pre-processing

## Categorical Data

In [12]:
cat_enc = pd.get_dummies(df_cat.astype(str))
cat_enc.head()

Unnamed: 0,TOLL_020_1.0,TOLL_020_2.0,TOLL_020_3.0,MAINTENANCE_021_1.0,MAINTENANCE_021_2.0,MAINTENANCE_021_21.0,MAINTENANCE_021_26.0,MAINTENANCE_021_27.0,MAINTENANCE_021_3.0,MAINTENANCE_021_31.0,...,SUPERSTRUCTURE_COND_059_9,SUPERSTRUCTURE_COND_059_N,SUBSTRUCTURE_COND_060_3,SUBSTRUCTURE_COND_060_4,SUBSTRUCTURE_COND_060_5,SUBSTRUCTURE_COND_060_6,SUBSTRUCTURE_COND_060_7,SUBSTRUCTURE_COND_060_8,SUBSTRUCTURE_COND_060_9,SUBSTRUCTURE_COND_060_N
51-1VA0158,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
51-1VA0575,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
51-1VA0591,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
51-1VA2106,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
51-1VA2107,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


## Combine Categorical and Numeric Data

In [13]:
# combine numeric and encoded categorical features on index
X = df_num.join(cat_enc)
X.head()

Unnamed: 0,MIN_VERT_CLR_010,DETOUR_KILOS_019,AGE,TRAFFIC_LANES_ON_028A,TRAFFIC_LANES_UND_028B,ADT_029,APPR_WIDTH_MT_032,DEGREES_SKEW_034,NAV_VERT_CLR_MT_039,NAV_HORR_CLR_MT_040,...,SUPERSTRUCTURE_COND_059_9,SUPERSTRUCTURE_COND_059_N,SUBSTRUCTURE_COND_060_3,SUBSTRUCTURE_COND_060_4,SUBSTRUCTURE_COND_060_5,SUBSTRUCTURE_COND_060_6,SUBSTRUCTURE_COND_060_7,SUBSTRUCTURE_COND_060_8,SUBSTRUCTURE_COND_060_9,SUBSTRUCTURE_COND_060_N
51-1VA0158,99.99,5.0,67.0,2.0,0,1650.0,7.3,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
51-1VA0575,99.99,1.0,65.0,2.0,2,5000.0,8.1,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
51-1VA0591,99.99,1.0,42.0,2.0,0,50.0,6.1,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
51-1VA2106,99.99,5.0,53.0,2.0,0,500.0,9.1,9.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
51-1VA2107,99.99,2.0,52.0,2.0,2,4000.0,7.9,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0


## Target Data

In [14]:
# set target column values to categories
y = tar['suff_rating'].apply(rat_conv)

In [15]:
# number in each category
y.value_counts()

not poor    6938
poor         865
Name: suff_rating, dtype: int64

## Train/Test Split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Initial Modeling

## Logistic Regression

In [17]:
# model pipeline includes scaling
log_pipe = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('model', LogisticRegression(class_weight='balanced'))
])

In [18]:
log_pipe.fit(X_train, y_train)
log_pred = log_pipe.predict(X_test)

In [19]:
print(metrics.classification_report(y_test, log_pred))

             precision    recall  f1-score   support

   not poor       0.98      0.87      0.92      1375
       poor       0.49      0.88      0.63       186

avg / total       0.92      0.87      0.89      1561



In [20]:
metrics.confusion_matrix(y_test, log_pred)

array([[1201,  174],
       [  22,  164]])

In [21]:
log_pipe.classes_

array(['not poor', 'poor'], dtype=object)

In [22]:
log_imp = feat_sort(log_pipe.named_steps['model'].coef_[0], X_train.columns, ret_abs=True, ret_pct=True)
log_imp

Unnamed: 0,feat_wgt,positive
SUFFICIENCY_RATING_feat_yr,5,False
ROADWAY_WIDTH_MT_051,3,False
FUNCTIONAL_CLASS_026_1.0,3,False
APPR_RAIL_END_036D_N,2,False
SUPERSTRUCTURE_COND_059_9,2,False
FUNCTIONAL_CLASS_026_12.0,2,False
NAV_HORR_CLR_MT_040,2,True
MAINTENANCE_021_31.0,2,False
SUPERSTRUCTURE_COND_059_8,2,False
TRAFFIC_LANES_ON_028A,2,True


### Refine

In [23]:
# try range of values for C and penalty hyperparameters
hyperparams = {'model__penalty':['l1', 'l2'], 'model__C': np.geomspace(0.001, 100, 15)}

In [24]:
# tune model pipeline for recall
log_tune = GridSearchCV(log_pipe, hyperparams, cv=5)

In [25]:
log_tune.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'model__penalty': ['l1', 'l2'], 'model__C': array([1.00000e-03, 2.27585e-03, 5.17947e-03, 1.17877e-02, 2.68270e-02,
       6.10540e-02, 1.38950e-01, 3.16228e-01, 7.19686e-01, 1.63789e+00,
       3.72759e+00, 8.48343e+00, 1.93070e+01, 4.39397e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [26]:
log_tune_pred = log_tune.predict(X_test)

In [27]:
# assign best hyperparameter values from tuning
best_penalty = log_tune.best_params_['model__penalty']
best_C = log_tune.best_params_['model__C']

In [28]:
best_penalty

'l1'

In [29]:
best_C

0.0610540229658533

In [30]:
print(metrics.classification_report(y_test, log_tune_pred))

             precision    recall  f1-score   support

   not poor       0.98      0.88      0.93      1375
       poor       0.50      0.88      0.64       186

avg / total       0.92      0.88      0.89      1561



In [31]:
metrics.confusion_matrix(y_test, log_pred)

array([[1201,  174],
       [  22,  164]])

In [32]:
# model pipeline with optimal hyperparameters
log_pipe = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('model', LogisticRegression(penalty=best_penalty, C=best_C, class_weight='balanced'))
])

In [33]:
log_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', LogisticRegression(C=0.0610540229658533, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [34]:
tune_feats = feat_sort(log_pipe.named_steps['model'].coef_[0], X_train.columns, ret_abs=True, ret_pct=True)
tune_feats

Unnamed: 0,feat_wgt,positive
SUFFICIENCY_RATING_feat_yr,18,False
SUPERSTRUCTURE_COND_059_8,4,False
OPEN_CLOSED_POSTED_041_P,4,True
SUPERSTRUCTURE_COND_059_5,3,True
SUPERSTRUCTURE_COND_059_6,3,True
ROADWAY_WIDTH_MT_051,3,False
FUNCTIONAL_CLASS_026_1.0,3,False
DECK_STRUCTURE_TYPE_107_8,3,True
DESIGN_LOAD_031_6.0,2,False
SUPERSTRUCTURE_COND_059_9,2,False


In [35]:
len(tune_feats)

50

In [36]:
feat_sort(log_pipe.named_steps['model'].coef_[0], X_train.columns)

Unnamed: 0,feat_wgt
OPEN_CLOSED_POSTED_041_P,0.305504
SUPERSTRUCTURE_COND_059_5,0.281085
SUPERSTRUCTURE_COND_059_6,0.272662
DECK_STRUCTURE_TYPE_107_8,0.228843
FUNCTIONAL_CLASS_026_14.0,0.169077
RECON_AGE,0.162608
DESIGN_LOAD_031_0.0,0.143798
SURFACE_TYPE_108A_6,0.139732
FUNCTIONAL_CLASS_026_16.0,0.133777
TRAFFIC_LANES_ON_028A,0.108075
