In [1]:
import psycopg2, psycopg2.extras, psycopg2.pool, pickle, json, matplotlib, os, sys, shapely
from multiprocessing import Pool, Manager; from collections import OrderedDict; from shapely.geometry import mapping, shape; from sqlalchemy import create_engine
import matplotlib.pyplot as plt; import numpy as np; import pandas as pd; import geopandas as gpd; import seaborn as sns
%matplotlib inline

# Latest Tile Statistics

This notebook queries the `latest-tile-data-v3` database to identify tiles with specific characteristics, such as:
- named road km
- total road km
- total buildings
- number of buildings tagged with more than just `building=yes`

This allows for further _intrinsic quality_ comparison between the histories of tiles that are currently quantitatively similar on the map.

In [5]:
# SQLAlchemy engine that pandas uses as the connection for the SQL queries below.
DB_URL = 'postgresql://anderstj@127.0.0.1:5432/latest-tile-data-v3'
engine = create_engine(DB_URL)

In [6]:
# Load the study tile definitions. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open until garbage collection.
with open('study_tiles.json', 'r') as study_tiles_file:
    study_tiles = json.load(study_tiles_file)
print("Found {0} tiles.".format(len(study_tiles)))

Found 9 tiles.


In [7]:
# Get study_tile stats
def get_tile_df(quadkey):
    """Fetch the joined road, building and geometry statistics for one tile.

    Parameters
    ----------
    quadkey : str
        Quadkey of the tile to look up.

    Returns
    -------
    pandas.DataFrame
        All columns of the `roads`, `buildings` and `geometry` rows whose
        quadkey equals `quadkey`, plus two derived columns:
        `named_road_ratio`  = (named_edited_km + named_new_km) / total_km
        `more_building_ratio` = (new_buildings_more + edited_buildings_more) / total_buildings
        The frame is empty when no row matches.
    """
    # The quadkey travels as a bound parameter (driver-style %(name)s placeholder)
    # rather than being spliced into the SQL text, so quoting is handled by the driver.
    q_s = """SELECT * FROM roads, buildings, geometry WHERE 
        roads.quadkey = buildings.quadkey AND 
        geometry.quadkey = roads.quadkey AND 
        roads.quadkey = %(quadkey)s"""
    # str() keeps the value a quoted literal, matching the original '...' comparison.
    df = pd.read_sql_query(q_s, con=engine, params={'quadkey': str(quadkey)})

    df['named_road_ratio'] = (df.named_edited_km + df.named_new_km) / df.total_km
    df['more_building_ratio'] = (df.new_buildings_more + df.edited_buildings_more) / df.total_buildings
    return df

# Attach the current statistics to every study tile; names are printed on one
# line as a progress trace.
for tile in study_tiles:
    print(tile['name'], end="...")
    tile.update(characteristics=get_tile_df(tile['quad']))

Port Au Prince, Haiti...Trisuli Bazar, Nepal...Tacloban...Kenema, Sierra Leone...Monrovia, Liberia...Kathmandu, Nepal...Heidelberg, Germany...London...Manhattan, NY...

In [8]:
def get_similar_tiles(tile, tolerance=0.01, ratio_tol=0.01):
    """Given a tile, find similar tiles in the _current database_ within specific thresholds.

    Parameters
    ----------
    tile : dict
        Must hold a DataFrame under 'characteristics' whose first row provides
        `total_km`, `total_buildings`, `named_road_ratio`, `more_building_ratio`
        and `quadkey`.
    tolerance : float
        Relative tolerance applied to `total_km` and `total_buildings`
        (a candidate must lie strictly between value*(1-tolerance) and value*(1+tolerance)).
    ratio_tol : float
        Relative tolerance applied in the same way to the named-road ratio and
        the "more than building=yes" ratio.

    Returns
    -------
    pandas.DataFrame
        Joined `roads`, `buildings`, `geometry` rows of every other tile
        (the input tile's own quadkey is excluded) inside all four windows.
    """
    df = tile['characteristics']

    def _bounds(column, rel_tol):
        """Return (low, high) = value -/+ value*rel_tol for the first row of `column`, as plain floats."""
        value = df[column].values[0]
        # float() turns numpy scalars into plain Python floats so the DB driver can adapt them.
        return float(value - value*rel_tol), float(value + value*rel_tol)

    min_roads, max_roads = _bounds('total_km', tolerance)
    min_buildings, max_buildings = _bounds('total_buildings', tolerance)
    min_roads_ratio, max_roads_ratio = _bounds('named_road_ratio', ratio_tol)
    min_building_ratio, max_building_ratio = _bounds('more_building_ratio', ratio_tol)

    params = {
        'min_roads': min_roads,
        'max_roads': max_roads,
        'min_buildings': min_buildings,
        'max_buildings': max_buildings,
        # `df.quadkey` is indexed [row][column] here, i.e. the first quadkey entry of the first row
        # (the joined SELECT * presumably yields one quadkey column per table).
        'quadkey': str(df.quadkey.values[0][0]),
        'min_roads_ratio': min_roads_ratio,
        'max_roads_ratio': max_roads_ratio,
        'min_building_ratio': min_building_ratio,
        'max_building_ratio': max_building_ratio,
    }

    # Values are passed as bound parameters (driver-style %(name)s placeholders) instead of being
    # formatted into the SQL text. NULLIF(..., 0) makes the ratio NULL (row rejected) for tiles with
    # a zero total, because the database may evaluate the ratio predicates before the range
    # predicates and would otherwise raise a division-by-zero error.
    query_string = """SELECT * FROM roads, buildings, geometry WHERE roads.quadkey = buildings.quadkey AND geometry.quadkey = roads.quadkey AND 
    roads.total_km > %(min_roads)s AND roads.total_km < %(max_roads)s AND roads.quadkey != %(quadkey)s AND 
    buildings.total_buildings > %(min_buildings)s AND buildings.total_buildings < %(max_buildings)s AND
    (roads.named_edited_km + roads.named_new_km)/NULLIF(roads.total_km, 0) > %(min_roads_ratio)s AND 
    (roads.named_edited_km + roads.named_new_km)/NULLIF(roads.total_km, 0) < %(max_roads_ratio)s AND
    (buildings.edited_buildings_more + buildings.new_buildings_more)::float / NULLIF(buildings.total_buildings, 0)::float > %(min_building_ratio)s AND
    (buildings.edited_buildings_more + buildings.new_buildings_more)::float / NULLIF(buildings.total_buildings, 0)::float < %(max_building_ratio)s
    """
    return pd.read_sql_query(query_string, con=engine, params=params)

# Smoke-test the query on a single study tile (index 2) with 25% tolerances.
sample_tile = study_tiles[2]
x = get_similar_tiles(sample_tile, tolerance=0.25, ratio_tol=0.25)
print(len(x))
x.head()

1


Unnamed: 0,quadkey,named_edited_km,named_new_km,total_edited_km,total_km,total_new_km,unnamed_edited_km,unnamed_new_km,quadkey.1,total_buildings,total_new_buildings,total_edited_buildings,new_buildings_more,new_buildings_yes,edited_buildings_more,edited_buildings_yes,quadkey.2,coordinates,type
0,132303033313,99.9365,3.75274,218.701,319.301,100.6,118.764,96.8474,132303033313,27667,3366,24301,1170,2196,16127,8174,132303033313,"[[[120.849609375, 14.093957177836227], [120.84...",Polygon


## Identify Similar Tiles in the Database

For each of the study tiles, count how many other tiles currently have similar values for:
 - Number of Buildings
 - km of roads
 - % of named roads
 - % of buildings with more than just `building=yes`

In [9]:
# Query the database once per study tile with 35% tolerances and keep the
# matching rows on the tile record under 'similar_tiles'.
for tile in study_tiles:
    print(tile['name'])
    sim_tiles = get_similar_tiles(tile, tolerance=0.35, ratio_tol=0.35)
    tile['similar_tiles'] = sim_tiles
    print("Similar Tiles: {0}\n".format(len(sim_tiles)))

Port Au Prince, Haiti
Similar Tiles: 25

Trisuli Bazar, Nepal
Similar Tiles: 4

Tacloban
Similar Tiles: 2

Kenema, Sierra Leone
Similar Tiles: 1

Monrovia, Liberia
Similar Tiles: 1

Kathmandu, Nepal
Similar Tiles: 1

Heidelberg, Germany
Similar Tiles: 66

London
Similar Tiles: 32

Manhattan, NY
Similar Tiles: 16



In [10]:
def convert_to_gpd(df):
    """Build a GeoDataFrame from a pandas dataframe that has a `coordinates` column.

    Each `coordinates` value is a JSON string of polygon coordinates. It is parsed,
    turned into a shapely Polygon, and the envelope of its exterior ring is stored
    in a `geometry` column (the original author notes that going through the
    envelope ensures proper winding order).

    Note: the `geometry` column is written onto `df` itself, so the frame passed in
    is modified in place before being wrapped in the returned GeoDataFrame.
    """
    def _tile_envelope(coords):
        polygon = shape({"type":"Polygon", "coordinates":json.loads(coords)})
        return polygon.exterior.envelope

    df['geometry'] = df.coordinates.apply(_tile_envelope)
    return gpd.GeoDataFrame(df)
# Try the conversion on the sample result `x`; note that convert_to_gpd also
# adds the `geometry` column to `x` itself.
y = convert_to_gpd(x)

### Write GeoJSON feature collections of similar tiles for each study tile

In [11]:
# Output directory for the per-tile GeoJSON files (later cells reuse `directory`,
# so its value, including the trailing slash, is left unchanged).
directory = '/data/www/jennings/iscram/'
# exist_ok avoids the race between a separate os.path.exists() check and makedirs().
os.makedirs(directory, exist_ok=True)
for tile in study_tiles:
    print(tile['name'])
    tile_gpd = convert_to_gpd(tile['similar_tiles'])
    # os.path.join does not double the slash `directory` already ends with; the
    # path is built once and reused for both the cleanup and the write.
    filename = os.path.join(directory, tile['name']+"_sim_tiles.geojson")
    # Remove any previous export first (presumably the GeoJSON driver refuses to
    # overwrite an existing file; TODO confirm).
    if os.path.exists(filename):
        os.remove(filename)
    try:
        tile_gpd.to_file(filename, driver="GeoJSON")
    except Exception as err:
        # Best-effort export: report the failure and its cause, then continue with
        # the next tile. KeyboardInterrupt/SystemExit are no longer swallowed.
        print("\tError: {1}. Length of dataframe: {0}".format(len(tile_gpd), err))

Port Au Prince, Haiti
Trisuli Bazar, Nepal
Tacloban
Kenema, Sierra Leone
Monrovia, Liberia
Kathmandu, Nepal
Heidelberg, Germany
London
Manhattan, NY


### Create GeoJSON file for all features

In [12]:
# Collect one GeoJSON feature per study tile into a single FeatureCollection.
feat_coll = {"type":"FeatureCollection","features":[]}
for tile in study_tiles:
    characteristics = tile['characteristics']
    # carry the human-readable tile name along as a feature property
    characteristics['name'] = tile['name']
    tile_geojson = json.loads(convert_to_gpd(characteristics).to_json())
    # only the first feature of each tile's frame is kept
    feat_coll['features'].append(tile_geojson['features'][0])

with open(directory+"study_tiles.geojson",'w') as oFile:
    json.dump(feat_coll, oFile)

### The study tiles can be [visualized here](http://www.townsendjennings.com/geojson-polygons?geojson=http://epic-analytics.cs.colorado.edu:9000/jennings/iscram/study_tiles.geojson#1.9/14.6/8.5)
(Must be on the CU network)

## Create Single JSON record with study tiles and comparable tiles

In [13]:
# One record per study tile: its name, its quadkey, and the quadkeys of its similar tiles.
output = []
for tile in study_tiles:
    # `.quadkey.values` is indexed per row here; the first entry of each row is the quadkey we keep
    sim_quads = [row[0] for row in tile['similar_tiles'].quadkey.values]
    record = {
        'name': tile['name'],
        'quad': tile['quad'],
        'compare_tiles': [{'quad': quad} for quad in sim_quads],
    }
    output.append(record)

with open("study_tiles_with_compare_tiles.json",'w') as oFile:
    json.dump(output,oFile)

In [18]:
# Inspect the stored characteristics of one study tile (index 4).
inspected_tile = study_tiles[4]
print(inspected_tile['name'])
inspected_tile['characteristics']

Monrovia, Liberia


Unnamed: 0,quadkey,named_edited_km,named_new_km,total_edited_km,total_km,total_new_km,unnamed_edited_km,unnamed_new_km,quadkey.1,total_buildings,...,new_buildings_yes,edited_buildings_more,edited_buildings_yes,quadkey.2,coordinates,type,named_road_ratio,more_building_ratio,name,geometry
0,33330222101,54.9428,1.09467,127.642,174.068,46.4256,72.6993,45.331,33330222101,19193,...,17610,138,514,33330222101,"[[[-10.810546875, 6.227933930268673], [-10.810...",Polygon,0.321929,0.055697,"Monrovia, Liberia","POLYGON ((-10.810546875 6.227933930268673, -10..."
