# Note to Reviewer: 
Several feature engineering steps were completed throughout my data preprocessing and EDA steps. 
I've completed troubleshooting for previous open questions:
- why do all zctas have at least one flood? I made the choice early on to only include zctas with floods, but went back and added all. ZCTAs with no floods could add value to model.
- I added a new dataset, National hydrography dataset (ie streamlines) and calculated total channel lenght within each zcta.
- I also added a count of flood by season (fall, winter, and spring) to use in the model 

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [18]:
data = pd.read_csv('data/02_data.csv', index_col=0)

data.isna().sum()

med_houseinc_12mon              0
household_public_assistance1    0
state_fips                      0
nflood_total                    0
flood_count_fall                0
flood_count_winter              0
flood_count_spring              0
nflood_2000s                    0
nflood_2010s                    0
flood_dur_hours_median          0
flood_dur_hours_min             0
flood_dur_hours_max             0
zcta                            0
channel_length_km               0
delta_floods                    0
dtype: int64

In [20]:
# keep zcta but remove from modeling data
data.index = data['zcta']

# drop columns not used in model
data_medianinc = data.drop(['zcta', 'state_fips', 'household_public_assistance1'], axis=1)

data_medianinc


Unnamed: 0_level_0,med_houseinc_12mon,nflood_total,flood_count_fall,flood_count_winter,flood_count_spring,nflood_2000s,nflood_2010s,flood_dur_hours_median,flood_dur_hours_min,flood_dur_hours_max,channel_length_km,delta_floods
zcta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
35004,76683.0,1,0,0,0,1.0,0.0,1.967213,1.967213,1.967213,64.301,-1.000000
35005,55017.0,3,1,0,0,2.0,1.0,3.442623,1.967213,3.934426,94.617,-0.333333
35006,63521.0,2,0,1,0,0.0,1.0,3.196721,1.967213,4.426230,306.497,1.000000
35007,81351.0,6,1,0,1,1.0,4.0,2.459016,0.983607,8.852459,108.522,0.600000
35010,46730.0,5,0,0,1,2.0,2.0,1.721311,0.983607,8.360656,649.526,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
99919,64135.0,0,0,0,0,0.0,0.0,0.000000,0.000000,0.000000,0.000,0.000000
99921,59219.0,0,0,0,0,0.0,0.0,0.000000,0.000000,0.000000,0.000,0.000000
99922,53125.0,0,0,0,0,0.0,0.0,0.000000,0.000000,0.000000,0.000,0.000000
99925,80313.0,0,0,0,0,0.0,0.0,0.000000,0.000000,0.000000,0.000,0.000000


### Scale Data

In [28]:
# scale data

scaler = StandardScaler()

data_medianinc_scaled = scaler.fit_transform(data_medianinc)
data_medianinc_scaled = pd.DataFrame(data_medianinc_scaled, columns=data_medianinc.columns)
data_medianinc_scaled

Unnamed: 0,med_houseinc_12mon,nflood_total,flood_count_fall,flood_count_winter,flood_count_spring,nflood_2000s,nflood_2010s,flood_dur_hours_median,flood_dur_hours_min,flood_dur_hours_max,channel_length_km,delta_floods
0,0.108615,-0.360028,-0.330981,-0.308605,-0.444974,0.134352,-0.483166,-0.168947,-0.082832,-0.266465,-0.401614,-2.197254
1,-0.584614,-0.034571,0.593177,-0.308605,-0.444974,0.678484,-0.247488,-0.139759,-0.082832,-0.248153,-0.314405,-1.049363
2,-0.312519,-0.197300,-0.330981,0.284585,-0.444974,-0.409780,-0.247488,-0.144624,-0.082832,-0.243575,0.295099,1.246419
3,0.257973,0.453614,0.593177,-0.308605,-0.070396,0.134352,0.459544,-0.159218,-0.114528,-0.202373,-0.274406,0.557685
4,-0.849766,0.290886,-0.330981,-0.308605,-0.070396,0.678484,-0.011811,-0.173812,-0.114528,-0.206951,1.281872,-0.475417
...,...,...,...,...,...,...,...,...,...,...,...,...
30346,-0.292873,-0.522757,-0.330981,-0.308605,-0.444974,-0.409780,-0.483166,-0.207864,-0.146224,-0.284776,-0.586585,-0.475417
30347,-0.450166,-0.522757,-0.330981,-0.308605,-0.444974,-0.409780,-0.483166,-0.207864,-0.146224,-0.284776,-0.586585,-0.475417
30348,-0.645151,-0.522757,-0.330981,-0.308605,-0.444974,-0.409780,-0.483166,-0.207864,-0.146224,-0.284776,-0.586585,-0.475417
30349,0.224761,-0.522757,-0.330981,-0.308605,-0.444974,-0.409780,-0.483166,-0.207864,-0.146224,-0.284776,-0.586585,-0.475417


### Categorical data have already been removed in step 2 EDA. 

In [31]:
# split test/train data
X = data_medianinc_scaled.drop('med_houseinc_12mon', axis=1)
y = data_medianinc_scaled['med_houseinc_12mon']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=777)


(       nflood_total  flood_count_fall  flood_count_winter  flood_count_spring  \
 1988      -0.360028         -0.330981           -0.308605           -0.070396   
 11740      0.290886          0.593177            0.284585           -0.444974   
 23132     -0.522757         -0.330981           -0.308605           -0.444974   
 26037     -0.522757         -0.330981           -0.308605           -0.444974   
 14438     -0.197300         -0.330981           -0.308605            0.304182   
 ...             ...               ...                 ...                 ...   
 4017       1.918172          1.517335           -0.308605            1.802493   
 7767      -0.197300         -0.330981           -0.308605            0.304182   
 19366     -0.522757         -0.330981           -0.308605           -0.444974   
 15931     -0.197300         -0.330981           -0.308605            0.304182   
 15151      0.616343          2.441493            0.284585            0.678760   
 
        nflood

### Ready to begin modeling