# GSOD ETL Harvester Framework - Downoad US County Geographic Extents (polygons)

This notebook be used to download geographic extans for covid-fusion and store them locally, 
The geographic extends include:
  * US county boundries - 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
  * US state boundries

https://www.ncei.noaa.gov/pub/data/noaa/isd-history.csv

## GeoJson

We will be using geojson files to manage our boundaries. https://geojson.org/

# Changelog / To-Do  

 * **2020-04-09**: extracted file 
 * **2020-04-11**: updloaded data to mongo

**To-do**

* Need to be able to see if a point is in the extent

In [26]:
import requests
from bs4 import BeautifulSoup
import csv
import os.path
from os import path
import logging
import sys
# import pymongo
import pandas as pd
import json
from datetime import datetime
from pymongo import MongoClient, GEOSPHERE
from pymongo.errors import (PyMongoError, BulkWriteError)

geojson_counties_url = "https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json"
geojson_data_dir = "../data/interim/geojson/"
geojson_counties_file = "geojson-counties-fips.json"
geojson_mongo_collection_name = "covid-fusion-geojson-counties-data"

########################################################    
#
def setup_custom_logger(name):
    formatter = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(funcName)s - %(module)s - %(message)s')
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(handler)
    return logger

########################################################    
#
def initialize_etl_harvester():
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)

    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(module)s %(funcName)s %(message)s',
                        handlers=[stream_handler])        
    log = logging.getLogger()
# log.setLevel(logging.DEBUG)

########################################################    
#
def shutdown_etl_harvester():
    # remember to close the handlers
    for handler in logger.handlers:
        handler.close()

########################################################    
#
def get_file_from_url(url, ext='', params={}):
    response = requests.get(url, params=params)
    if response.ok:
        file_data = response.content
#        logger.info("this is the file")
#        logger.info(file_data)
        return file_data
    else:
        return response.raise_for_status()
        
########################################################    
#
def get_geojson_counties_file(url, ext, pathname):
    logger.info('get_geojson_counties_file: Extracting geojosn counties file from: ' + url)
    file_data_result = get_file_from_url(url, ext)
#    logger.info(file_data_result)
    with open(pathname, 'wb') as f:
        f.write(file_data_result)
    
########################################################    
#
def geojson_counties_file_exists(pathname):
    if path.exists(pathname):
         return True
    else:
         return False

########################################################    
#
# load the counties into mongo for analysis
def load_geojson_counties_file(pathname):

    with open(pathname, 'r') as f:
        counties_geojson = json.loads(f.read())
    
    mg_client = pymongo.MongoClient()
    mg_db = mg_client['covid_fusion']
    mg_col = mg_db['geojson_counties']
    
    mg_col.create_index([("geometry", GEOSPHERE)])
    bulk = mg_col.initialize_unordered_bulk_op()
    
    for feature in counties_geojson['features']: 
      # Note: comment out next two lines if input file does not contain timestamp field having proper format
      # timestamp = feature['properties']['timestamp']
      # feature['properties']['timestamp'] = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')

      # append to bulk insert list
      bulk.insert(feature)

    # execute bulk operation to the DB
    try:
        result = bulk.execute()
        logger.info('Number of Features successully inserted: %d', result["nInserted"])
    except BulkWriteError as ex:
        nInserted = ex.details["nInserted"]
        errMsg = ex.details["writeErrors"]
        logger.error('Errors encountered inserting features to mongo')
        logger.error('Number of features successfully inserted: %d', nInserted)
        logger.error('The following errors were found:')
        for item in errMsg:
            logger.error('Index of feature: %d', item["index"])
            logger.error('Error code: %d', item["code"])
            logger.error('Message (truncated due to data length:)' + item["errmsg"][0:120] + '...')
            
########################################################    
#
# Geographic extents ETL Harvester POC
#
# db.geojson_counties.find( { location: { $near: { $geometry: { type: "Point", coordinates: [ -73.9667, 40.78 ] }, $minDistance: 1000, $maxDistance: 5000 } } } )
#  
########################################################
logger = setup_custom_logger('GeoJson-ETL-Counties-Logger')
initialize_etl_harvester()
logger.info('GeoJson ETL - Started')

geojson_counties_file_pathname = geojson_data_dir + geojson_counties_file

# The filelist is extracted from 
# get file list, if it doesn't exist
if geojson_counties_file_exists(geojson_counties_file_pathname) == False:
    get_geojson_counties_file(geojson_counties_url, "json", geojson_counties_file_pathname)
    load_geojson_counties_file(geojson_counties_file_pathname)
else:
    logger.info('file exists: ' + geojson_counties_file_pathname)
    # TODO: put a check to see if already loaded? check to see if collection exists, and is non-zero
    load_geojson_counties_file(geojson_counties_file_pathname)
    
logger.info('GeoJson ETL - Ended')

shutdown_etl_harvester()


2021-07-03 10:17:28,069 - INFO - <module> - <ipython-input-26-7f05903a9d86> - GeoJson ETL - Started
2021-07-03 10:17:28,069 - INFO - <module> - <ipython-input-26-7f05903a9d86> - GeoJson ETL - Started
2021-07-03 10:17:28,069 - INFO - <module> - <ipython-input-26-7f05903a9d86> - GeoJson ETL - Started
2021-07-03 10:17:28,069 - INFO - <module> - <ipython-input-26-7f05903a9d86> - GeoJson ETL - Started
2021-07-03 10:17:28,069 - INFO - <module> - <ipython-input-26-7f05903a9d86> - GeoJson ETL - Started
2021-07-03 10:17:28,069 - INFO - <module> - <ipython-input-26-7f05903a9d86> - GeoJson ETL - Started
2021-07-03 10:17:28,069 - INFO - <module> - <ipython-input-26-7f05903a9d86> - GeoJson ETL - Started
2021-07-03 10:17:28,069 - INFO - <module> - <ipython-input-26-7f05903a9d86> - GeoJson ETL - Started
2021-07-03 10:17:28,069 - INFO - <module> - <ipython-input-26-7f05903a9d86> - GeoJson ETL - Started
2021-07-03 10:17:28,069 - INFO - <module> - <ipython-input-26-7f05903a9d86> - GeoJson ETL - Started
