<a href="https://colab.research.google.com/github/ipeirotis/datasets/blob/master/Insert_Shapefiles_to_DB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Shell call that upgrades pip itself and installs/upgrades geoalchemy2.
# NOTE(review): versions are not pinned, and `!sudo pip3` may target a
# different interpreter than the running kernel -- consider `%pip install`
# with pinned versions once confirmed to work in the target runtime.
!sudo pip3 install -U pip  geoalchemy2

Collecting pip
  Downloading pip-20.2.2-py2.py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 11.2 MB/s eta 0:00:01
[?25hRequirement already up-to-date: geoalchemy2 in /usr/local/lib/python3.6/dist-packages (0.8.4)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.2.1
    Uninstalling pip-20.2.1:
      Successfully uninstalled pip-20.2.1
Successfully installed pip-20.2.2


In [None]:
# Imports
import sqlalchemy
from geoalchemy2 import Geometry, WKTElement

In [None]:
# Spatial reference identifier (SRID) attached to every geometry this notebook
# sends to the database (used by create_wkt_element and by the Geometry
# column types passed to to_sql). The original note says it needs to be 4326;
# presumably that is EPSG:4326 (WGS 84 lon/lat), matching the source
# shapefiles -- TODO confirm.
srid = 4326

In [None]:
# Build the PostgreSQL connection used by the rest of the notebook.
#
# Credentials are read from the environment (standard libpq variable names)
# instead of being hardcoded, so the password never ends up in the saved
# notebook or in version control. Host, user and database keep their previous
# values as defaults; the password has no default and is prompted for when
# PGPASSWORD is not set.
import os
from getpass import getpass
from urllib.parse import quote

from sqlalchemy import create_engine

db_password = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: ')

conn_string = 'postgresql://{user}:{password}@{host}/{db}'.format(
    host = os.environ.get('PGHOST', '35.229.99.188'),
    # Percent-encode user and password so characters such as '@', ':' or '/'
    # cannot corrupt the connection URL.
    user = quote(os.environ.get('PGUSER', 'panos'), safe=''),
    db = os.environ.get('PGDATABASE', 'nyu'),
    password = quote(db_password, safe='')
)

engine = create_engine(conn_string)
# Opening a connection right away makes an unreachable host or a bad
# credential fail here rather than in a later cell.
con = engine.connect()

OperationalError: (psycopg2.OperationalError) could not connect to server: Connection timed out
	Is the server running on host "35.229.99.188" and accepting
	TCP/IP connections on port 5432?

(Background on this error at: http://sqlalche.me/e/13/e3q8)

In [None]:
# Show where the engine points without leaking the password: the repr of
# SQLAlchemy's URL object masks the password, whereas displaying the raw
# `conn_string` would store it in the notebook output.
engine.url

In [None]:
import boto3


class S3:
    """
    Thin convenience wrapper around a boto3 S3 client bound to one bucket.

    Provides upload, download, delete and existence checks for objects
    addressed by their key inside that bucket.
    """

    def __init__(self, profile_name='ai', bucket_name="ai-contact-intelligence-data-and-models"):
        """Create a boto3 session for `profile_name` and an S3 client for region us-east-1."""
        self.session = boto3.Session(profile_name=profile_name)
        self.s3 = self.session.client('s3', "us-east-1")
        self.bucket_name = bucket_name

    def upload(self, local_filename, remote_filename):
        """Send the local file `local_filename` to the bucket under key `remote_filename`."""
        with open(local_filename, "rb") as source:
            self.s3.upload_fileobj(source, self.bucket_name, remote_filename)

    def download(self, remote_filename):
        """Fetch the object stored under `remote_filename` and return the fully read body."""
        response = self.s3.get_object(Bucket=self.bucket_name, Key=remote_filename)
        body = response['Body']
        return body.read()

    def delete(self, remote_filename):
        """Delete the object stored under key `remote_filename`."""
        self.s3.delete_object(Bucket=self.bucket_name, Key=remote_filename)

    def file_exists(self, remote_filename):
        """Return True when a key exactly equal to `remote_filename` is listed in the bucket."""
        listing = self.s3.list_objects_v2(
            Bucket=self.bucket_name, Prefix=remote_filename)
        # The listing is prefix-based, so longer keys may come back too;
        # only an exact key match counts as "exists".
        if 'Contents' not in listing:
            return False
        listed_keys = [entry['Key'] for entry in listing['Contents']]
        return remote_filename in listed_keys


In [None]:
# Module-level S3 helper that load_shapefile() reads from. Only the bucket is
# given here, so the AWS profile falls back to the class default ('ai').
s3bucket = S3(bucket_name="ai-contact-intelligence-data-and-models")

In [None]:
import geopandas as gpd
import fiona

In [None]:
def load_shapefile(path, filename, internal_filename):
    '''
    Load one shapefile out of a zip archive stored in the S3 bucket and
    return it as a GeoDataFrame.

    Note that a zip file may contain many different shapefiles internally,
    which is why the shapefile to read has to be named explicitly.

    Parameters
    ----------
    path : str
        Key prefix ("folder") of the archive inside the bucket.
    filename : str
        Archive name without the trailing ".zip".
    internal_filename : str
        Name of the shapefile inside the archive, without the ".shp" suffix.

    Returns
    -------
    geopandas.GeoDataFrame
        All features of the shapefile, with the CRS reported by the file.
    '''
    shapefile_bytes = s3bucket.download(f"{path}/{filename}.zip")
    # Both the in-memory archive and the opened collection hold native
    # resources; the context managers release them once the features have
    # been copied into the GeoDataFrame (previously they were never closed).
    with fiona.io.ZipMemoryFile(shapefile_bytes) as zip_memory_file:
        with zip_memory_file.open(f'{internal_filename}.shp') as collection:
            return gpd.GeoDataFrame.from_features(collection, crs=collection.crs)

In [None]:
def load_buildings(state, geo_query=None):
    '''
    Load the OpenStreetMap building layer for one state, optionally
    restricted to an area of interest.

    Parameters
    ----------
    state : str
        State name as used in the archive name "<state>-latest-free.shp.zip"
        (e.g. "new-york").
    geo_query : geometry, optional
        When given, only buildings that intersect it are kept. ``None``
        (the default) keeps every building.

    Returns
    -------
    GeoDataFrame restricted to the 'osm_id' and 'geometry' columns.
    '''
    buildings = load_shapefile("maps/openstreetmap", f"{state}-latest-free.shp", "gis_osm_buildings_a_free_1")

    # Compare against None explicitly: relying on the truthiness of a geometry
    # would treat an *empty* geometry as "no filter" and return every
    # building instead of none.
    if geo_query is not None:
        buildings = buildings[buildings.intersects(geo_query)]

    return buildings[['osm_id', 'geometry']]

In [None]:
%%time
# Load the building shapefile for New York State. No geo_query is passed, so
# no spatial filter is applied; the result keeps only the 'osm_id' and
# 'geometry' columns returned by load_buildings().
nys_buildings = load_buildings("new-york")

In [None]:
from shapely.geometry.polygon import Polygon
from shapely.geometry.multipolygon import MultiPolygon

# The database column is declared as MULTIPOLYGON (see the to_sql dtype
# mapping), so every plain Polygon is wrapped into a one-member MultiPolygon.
# Anything that is not a Polygon is passed through unchanged.
# isinstance() is used instead of an exact type comparison so that Polygon
# subclasses are wrapped as well.
nys_buildings["geometry"] = [
    MultiPolygon([feature]) if isinstance(feature, Polygon) else feature
    for feature in nys_buildings["geometry"]
]

In [None]:
# Not needed for the insert itself; kept to check whether retrieving rows
# based on their centroid is faster than using the full geometry.
# NOTE(review): the centroid is computed directly on the stored coordinates;
# if those are lon/lat degrees the result is only approximate -- confirm
# that this is acceptable for the benchmark.
nys_buildings['centroid'] = nys_buildings['geometry'].centroid

In [None]:
def create_wkt_element(geom):
    """Wrap a geometry as a GeoAlchemy WKTElement tagged with the notebook's SRID.

    `geom` must expose a `.wkt` attribute; the module-level `srid` is looked
    up at call time.
    """
    wkt_text = geom.wkt
    return WKTElement(wkt_text, srid=srid)

In [None]:
# Replace the geometry objects in the 'centroid' and 'geometry' columns with
# GeoAlchemy WKTElements (WKT text tagged with the SRID) so that to_sql can
# write them into the Geometry-typed columns.
# Per the original note, after this step the GeoDataFrame behaves like a
# regular DataFrame because it no longer has a real geometry column.
# NOTE(review): the columns are overwritten in place, so this cell presumably
# cannot be re-run without reloading `nys_buildings` first -- confirm.
nys_buildings['centroid'] = nys_buildings['centroid'].apply(create_wkt_element)
nys_buildings['geometry'] = nys_buildings['geometry'].apply(create_wkt_element)

In [None]:
# Number of rows about to be uploaded (drives the batch count of the insert loop).
len(nys_buildings)

In [None]:
# Append the buildings to the "NYS_Buildings" table, one batch per to_sql
# call so that a tqdm progress bar can track the upload.
# See http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html for the documentation
from tqdm import tqdm
batchsize = 1000
# Ceiling division: the previous `len // batchsize + 1` scheduled one extra,
# empty batch whenever the row count was an exact multiple of batchsize.
batches = -(-len(nys_buildings) // batchsize)

t = tqdm(range(batches))

for i in t:
    start = batchsize * i
    end = batchsize * (i+1)
    # .iloc makes the positional slicing explicit, independent of the index.
    nys_buildings.iloc[start:end].to_sql(name = "NYS_Buildings",
          schema = 'public',
          con = engine,
          if_exists = 'append',
          index = False,
          chunksize = 100,
          dtype= {
              # The key must match the DataFrame column name: the frame has
              # 'osm_id', so the former 'osm' key was silently ignored.
              'osm_id': sqlalchemy.types.VARCHAR(50),
              # 'name' and 'type' only take effect if such columns are
              # present in the frame; they are ignored otherwise.
              'name': sqlalchemy.types.VARCHAR(250),
              'type': sqlalchemy.types.VARCHAR(50),
              'centroid': Geometry('POINT', srid=srid),
              'geometry': Geometry('MULTIPOLYGON', srid=srid),
          }
    )

## Testing/benchmarking query execution times

In [None]:
# Load the neighborhoods shapefile ("us_neighborhoods.shp" inside
# "maps/us_neighborhoods.zip" in the S3 bucket); it is used below to pick the
# area of interest for the benchmark queries.
neighborhoods = load_shapefile("maps", "us_neighborhoods", "us_neighborhoods")

In [None]:
# Sanity check: look up a single neighborhood (West Village, Manhattan).
# The filter values are written out literally because `city` and
# `state_acronym` are only assigned in a later cell; referencing them here
# raised a NameError when the notebook was run top to bottom on a fresh kernel.
neighborhoods.query("CITY=='New York City-Manhattan' and STATE=='NY' and NAME=='West Village'")

In [None]:
# Parameters describing the area used for the benchmark queries.
geoid = 'nyc' # for querying Redshift
state = 'new-york' # for openstreetmaps
countyfp = '36061' # Manhattan, for census blocks and limiting the Redshift query
city = 'New York City-Manhattan' # for querying the neighborhoods shapefiles
state_acronym = 'NY' # for querying the neighborhoods shapefile
neighborhood = 'Upper West Side'
metric_name = 'current_ppsf'

# Area of focus: the selected neighborhood, merged into a single geometry.
matches = neighborhoods.query(f"CITY=='{city}' and STATE=='{state_acronym}' and NAME=='{neighborhood}'")
if matches.empty:
    # Fail with an explicit message instead of an opaque indexing error below.
    raise ValueError(
        f"No neighborhood matches CITY={city!r}, STATE={state_acronym!r}, NAME={neighborhood!r}"
    )
geo_query = matches.dissolve(by='CITY')

# Alternative: use the whole city instead of a single neighborhood.
# geo_query = neighborhoods.query(f"CITY=='{city}' and STATE=='{state_acronym}'").dissolve(by='CITY')

# dissolve(by='CITY') indexes the result by city name, so the single row is
# selected by position with .iloc rather than with the label-style `[0]`,
# which relied on pandas' deprecated positional fallback.
geo_query = geo_query.geometry.iloc[0]

In [None]:
# `query` was never assigned anywhere in the notebook, so this cell (and the
# SQL built from it) failed on a fresh kernel. Build it here from the area of
# focus: (minx, miny, maxx, maxy, srid). Rendered inside an f-string this
# tuple becomes the parenthesised argument list for ST_MakeEnvelope.
query = tuple(float(coord) for coord in geo_query.bounds) + (srid,)
query

In [None]:
# Select the geometry of every stored building that passes the `~` test
# against the envelope built from `query`. `query` is interpolated verbatim
# right after ST_MakeEnvelope, so it must render as a parenthesised argument
# list.
# NOTE(review): `~` is presumably PostGIS's "bounding box of the left operand
# contains the right operand" operator -- confirm.
sql = f"""
SELECT
   geometry
FROM
    nyu.public."NYS_Buildings"
WHERE ST_MakeEnvelope{query} ~ geometry 
"""

# Display the final statement for inspection.
sql

In [None]:
%%time
# Run the benchmark query and time it; the 'geometry' column of the result
# set is used as the geometry column of the resulting GeoDataFrame.
df = gpd.GeoDataFrame.from_postgis(sql, con=engine, geom_col='geometry' )

In [None]:
# Number of buildings returned by the query.
len(df)

In [None]:
# Write the query result as GeoJSON, then report how many characters the
# resulting file contains (a rough measure of the payload size).
geojson_path = "manhattan.geojson"
df.to_file(geojson_path, driver='GeoJSON')

with open(geojson_path, "r") as geojson_file:
    geojson_text = geojson_file.read()
print(len(geojson_text))

In [None]:
# Column types of the query result.
df.dtypes

In [None]:
# Size reported by sys.getsizeof for a single result row (the third one).
# NOTE(review): what this number includes depends on how pandas implements
# __sizeof__ for the row object -- treat it as a rough indication only.
import sys
sys.getsizeof(df.iloc[2])