# Indiana_ML Data Collection 
#### Using open spatial tools and multiprocessing

In [None]:
#########################################################################
#Data Collection 
#
# Functions for handling subwatershed characteristics data collection
#
# Authors: Shannon McAvoy (smcavoy@dewberry.com)
#
# Editors: Jason Matney (jmatney@dewberry.com)
#          Bakinam Essawy (bessawy@dewberry.com) 
#
# Copyright: Dewberry Engineers Inc.
#########################################################################

### Libraries

In [4]:
from functools import partial
import multiprocessing as mp
from multiprocessing import Pool, cpu_count
from functions import *
import geopandas as gpd
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
import os
from skimage import io
from time import time
import fiona

### Setting up workspace

In [19]:
# Root folders for the input data (Windows network-drive locations).
path = r"P:\Temp\Bakinam_Essawy\ML_Project\Indiana_Risk\forBakinam"
# The subwatershed (HUC) shapefiles currently live in the same folder as `path`.
hucs = path
path_2 = r"P:\Temp\McAvoy\ML_DataCollection"
# Folder below path_2 that holds the NLCD 2016 percent-impervious data.
path_3 = path_2 + r"\Marion_County_Data\NLCD_Impervious\NLCD_2016_Impervious_L48_20190405_PERCENT"

# Working folder, located directly under `path`.
output_workspace = os.path.join(path, "working_dsn")

#### Input data

In [None]:

##### change these based on what subwatersheds you want #####
# Read any shapefile in using multiprocessing 

# String keys that label each input layer; every variable holds its own name.

# Subwatershed polygons and their centroids
all_subwatersheds = "all_subwatersheds"
all_centroids = "all_centroids"

# Hydrography and flood hazard layers
streams = "streams"
water_bodies = "water_bodies"
dams = "dams"
nfhl_sfha = "nfhl_sfha"

# Transportation and built-environment layers
bridges = "bridges"
railroads = "railroads"
streets = "streets"
building_footprints = "building_footprints"

# Census-style attribute layers and the county outline
population = "population"
median_income = "median_income"
county_boundary = "county_boundary"

# Land use and elevation layers
lu_usa = "lu_usa"
dem_Indiana = "dem_Indiana"
# NOTE(review): dem_slope is rebound to an opened raster dataset in the
# "Load Raster files" cell, so this string key is lost after that cell runs.
dem_slope = "dem_slope"
# Shapefiles: one (layer key, shapefile path) pair per input layer.
# Sub-paths are raw strings so the Windows backslashes are never read as
# escape sequences.
data = [
    (all_subwatersheds, os.path.join(hucs, "wbdhu12_a_IN_only.shp")),  # set source for shapefile with all 11 subwatersheds in the county
    (all_centroids, os.path.join(hucs, "wbdhu12_a_IN_only_centroids.shp")),  # all centroids of subwatersheds
    (streams, os.path.join(path, "Hydrography_HighRes_FlowLine_NHD_USGS.shp")),  # set source for streams data
    (water_bodies, os.path.join(path_2, r"Marion_County_Data\IndianaMAP_WaterBodies\Water_Bodies_Lakes_LocalRes\Hydrography_LocalRes_WaterbodyDiscrete_NHD_IN.shp")),  # set source for water bodies data
    (dams, os.path.join(path_2, r"Marion_County_Data\IndianaMAP_Dams\Dams_IDNR\Dams_IDNR_IN.shp")),  # set source for dams data
    (bridges, os.path.join(path_2, r"Marion_County_Data\IndianaMAP_Bridges\Bridges_County_INDOT\Bridges_County_INDOT_IN.shp")),  # set source for bridges data
    (railroads, os.path.join(path_2, r"Marion_County_Data\IndianaMAP_Railroads\Railroads_Active_Abandoned_INDOT\Rail_System_Active_Abandoned_INDOT_IN.shp")),  # set source for railroads data
    (population, os.path.join(path_2, r"Marion_County_Data\Indiana_PopulationData\Indiana_PopulationData.shp")),  # set source for ACS population data
    (median_income, os.path.join(path_2, r"Marion_County_Data\Indiana_IncomeData\Indiana_IncomeData.shp")),  # set source for ACS median income data
    (county_boundary, os.path.join(path_2, r"Marion_County_Data\MarionCounty_Boundary\Marion_County_Boundary.shp")),  # set source for county boundary data
    # The original line read `os.path.join(path_2r"...")`; the comma between
    # `path_2` and the raw-string sub-path was missing.
    (lu_usa, os.path.join(path_2, r"Marion_County_Data\NLCD_Impervious\NLCD_indiana_polygon.shp")),  # set source for nlcd land use data
]

# Tif data: (layer key, raster path) pairs.
# set source for dem (raw string keeps the backslash in the sub-path literal)
tif_data = [(dem_Indiana, os.path.join(path, r"IN_EXTENT_MOSAIC\IN_EXTENT_MOSAIC.tif"))]

# Layers stored inside file geodatabases. Each entry is a 4-tuple:
# (layer key, path to the .gdb, 'FileGDB', layer name).
# NOTE(review): the third element looks like the driver name handed to the
# reader -- confirm against load_gdb_file.
geodatabase_layers = [
    # set source for NFHL data
    (nfhl_sfha,
     os.path.join(path_3, r"MeridianHills\MeridianHills_DataCollection\NFHL_18_20200310.gdb"),
     'FileGDB',
     'S_FLD_HAZ_AR'),
    # set source for streets data
    # os.path.join is closed right after the .gdb path so the last two items
    # stay separate tuple elements, matching the entry above.
    (streets,
     os.path.join(path, r'IndianaMAP_Streets\Streets_Centerlines_IGIO\County_Street_Centerlines_IGIO_IN.gdb'),
     'FileGDB',
     'County_Street_Centerlines_IGIO_IN_Dec2019'),
]


# source for Bing building footprints for Indiana (attributed with OpenStreetMap data)
# Raw string keeps the backslash in the sub-path literal.
building_footprints_list = [
    (building_footprints, os.path.join(path, r"BuildingFootprints_Indiana\Building_Footprints_Attributed_IN.shp")),
]


In [7]:
# folder with partial duration files, set as workspace temporarily, then reset when done
# Raw string keeps the backslash in the sub-path literal.
directory_rainfall = os.path.join(path, r"MarionCounty_Rainfall\All_Rainfall_Clipped_IN")

### Using multi-processing to load the data

In [None]:
# Pair each loader function with the list of argument tuples it should process.
func_dict = {load_file: data,
             load_gdb_file: geodatabase_layers,
             load_tif_file: tif_data,
             shp_file_with_null: building_footprints_list}

# One dictionary collects the results of every loader. Rebinding `x` inside the
# loop would discard everything except the last loader's output.
x = {}
for f, d in func_dict.items():
    st = time()
    with Pool(int(cpu_count()*2)) as p:
        # Each mapped result is consumed as a (key, value) pair.
        x.update(p.map(f, d))
    print(round(((time()-st)/60), 2) , 'minutes to process.')

#### Load Raster files

In [23]:
# Open the slope DEM stored under `path`.
# This rebinds dem_slope, which the input-data cell assigned the string 'dem_slope'.
# NOTE(review): rasterio is not imported explicitly in the imports cell;
# presumably it arrives via `from functions import *` -- confirm.
dem_slope = rasterio.open(
    os.path.join(path, 'in_ext_slope')
)

# Source for the impervious-surface .img raster, kept as a one-item list of paths.
impervious_usa = [
    os.path.join(path_3, 'NLCD_2016_Impervious_L48_20190405.img'),
]


In [2]:
# x[all_subwatersheds]

#### Empty List

In [None]:

# Every name below is bound to its own, independent empty list.
# Groups follow the apparent theme of the names (inferred, not verified).

# Subwatershed identifier, shape and terrain values
(subwatershed_list, area_list, perimeter_list, watershed_length_list,
 elongation_ratio_list, shape_factor_list, circulatory_ratio_list,
 relief_list, relief_ratio_list, avg_slope_list, drainage_density_list,
 ruggedness_list) = ([] for _ in range(12))

# aae / x values, overall and for buildings
(aae_list, buildings_aae_list,
 x_list, buildings_x_list) = ([] for _ in range(4))

# Water and infrastructure features
(water_bodies_list, dams_list, bridges_list,
 streets_list, railroads_list) = ([] for _ in range(5))

# Population, income and housing values
(population_list, dependent_population_list, population_density_list,
 avg_median_income_list, housing_density_list,
 population_change_list) = ([] for _ in range(6))

# Distance-to-stream statistics
dist_to_stream_avg_list, dist_to_stream_stdev_list = ([] for _ in range(2))

# Land-use classes and impervious percentage
(lu_21_list, lu_22_list, lu_23_list, lu_24_list,
 lu_41_list, lu_82_list, impervious_percent_list) = ([] for _ in range(7))

# orb<N>yr<HH>h series
(orb100yr06h_list, orb100yr12h_list, orb100yr24h_list,
 orb25yr06h_list, orb25yr12h_list, orb25yr24h_list,
 orb2yr06h_list, orb2yr12h_list, orb2yr24h_list,
 orb50yr06h_list, orb50yr12h_list, orb50yr24h_list) = ([] for _ in range(12))

# orb<N>yr<HH>ha_am series
(orb100yr06ha_am_list, orb100yr12ha_am_list, orb100yr24ha_am_list,
 orb25yr06ha_am_list, orb25yr12ha_am_list, orb25yr24ha_am_list,
 orb2yr06ha_am_list, orb2yr12ha_am_list, orb2yr24ha_am_list,
 orb50yr06ha_am_list, orb50yr12ha_am_list, orb50yr24ha_am_list) = ([] for _ in range(12))


In [None]:
# GLOBAL VARIABLES
LOG_NAME = 'DataCollection_export.log'