# Import and Load Data

In [1]:
import os
import json
import cv2
import numpy as np

import shapely
from shapely.geometry import Polygon, Point

import pandas as pd
import geopandas as gpd
# import geoplot




In [2]:
#retrieve the marikina buildings data with their corresponding address
marikina_buildings_data  = gpd.read_file('marikina_polygons_with_address.geojson')

#keep only the necessary columns 
marikina_df = marikina_buildings_data[['id','geometry', 'address']]

#Only keep the building and remove the polygons with type Point
marikina_df = marikina_df[marikina_df['geometry'].apply(lambda x : x.type!='Point' )]

In [6]:
# Retrieve the predicted polygon results from the model
polygon_results = gpd.read_file('polygon_results.geojson')

# create an identifier for each polygon, use its index as its id named segment id
polygon_results['segment_id'] = polygon_results.index

# Overlay Removing rooftop segment duplicates (keep only match with highest intersection measured via Area)


In [11]:
# use the geopandas overlay function with the intersection argument to get the intersection 
# between the building polygons and the predicted polygons
overlay = gpd.overlay(marikina_df, polygon_results, how = 'intersection')

In [12]:
#get the area of intersection and then sort by descending order
overlay['area'] = overlay.area
overlay.sort_values('area', ascending=False, inplace=True)


  overlay['area'] = overlay.area


In [14]:
#Only get the largest intersection between multiple intersections between a rooftop segment and building polygons
overlay_new = overlay[overlay['area'] == overlay.groupby('segment_id')['area'].transform('max')]
overlay_new_df_final = overlay_new.set_index("segment_id").reset_index()

In [17]:
#only get the necessary columns
overlay_new_df_final = overlay_new_df_final.drop(["geometry",'image_id','category', 'file_name'], axis = 1)

In [18]:
#join the dataframes to match the segments to the buildings
overlay_new_df_final = overlay_new_df_final.join(polygon_results.set_index("segment_id"), on = "segment_id")
overlay_new_df_final = overlay_new_df_final.set_index("segment_id").reset_index()

Unnamed: 0,segment_id,id,address,area,image_id,category,file_name,geometry
0,90215,way/874972048,"C.M. Recto Street, 20 Phase 1, Bonanza, Fortun...",1.837312e-07,2986,12,marikinasatellite14.66189491686551_121.1236501...,"POLYGON ((121.12343 14.66166, 121.12343 14.661..."
1,112141,way/874972048,"C.M. Recto Street, 20 Phase 1, Bonanza, Fortun...",1.412839e-07,3707,5,marikinasatellite14.66189491686551_121.1229363...,"POLYGON ((121.12282 14.66165, 121.12281 14.661..."
2,53649,way/871081677,"128 Santan, Marikina, Metro Manila, Philippines",1.221793e-07,1775,13,marikinasatellite14.659822882747488_121.125791...,"POLYGON ((121.12559 14.65968, 121.12559 14.659..."
3,90212,way/874972048,"C.M. Recto Street, 20 Phase 1, Bonanza, Fortun...",1.189207e-07,2986,5,marikinasatellite14.66189491686551_121.1236501...,"POLYGON ((121.12332 14.66190, 121.12332 14.661..."
4,19982,way/16827525,"Unit 2128, Riverbank Center, Riverbanks Avenue...",1.126035e-07,653,3,marikinasatellite14.631505083134517_121.082964...,"POLYGON ((121.08306 14.63121, 121.08306 14.631..."
...,...,...,...,...,...,...,...,...
111502,64936,way/120433106,"5 Shorthorn St, Marikina, 1800 Metro Manila, P...",2.292201e-16,2128,8,marikinasatellite14.64324660980331_121.1229363...,"POLYGON ((121.12277 14.64347, 121.12277 14.643..."
111503,90342,way/160697218,"16 St Bernadette, Marikina, 1803 Metro Manila,...",1.678017e-16,2991,2,marikinasatellite14.632886439213197_121.087961...,"POLYGON ((121.08778 14.63300, 121.08778 14.633..."
111504,23247,way/119756773,"7 Topaz, Marikina, 1800 Rizal, Philippines",9.655710e-17,764,11,marikinasatellite14.636339829409904_121.120795...,"POLYGON ((121.12089 14.63661, 121.12089 14.636..."
111505,83166,way/160697167,"1800 Liverpool, Marikina, 1800 Metro Manila, P...",1.094591e-17,2746,11,marikinasatellite14.631505083134517_121.090816...,"POLYGON ((121.09053 14.63163, 121.09053 14.631..."


In [20]:
overlay_new_df_final.rename(columns={'id': 'building_id', 'category': 'category_id'}, inplace=True)
overlay_new_df_final = overlay_new_df_final.drop(["file_name", "area"], axis=1)
overlay_new_df_final

Unnamed: 0,segment_id,building_id,address,image_id,category_id,geometry
0,90215,way/874972048,"C.M. Recto Street, 20 Phase 1, Bonanza, Fortun...",2986,12,"POLYGON ((121.12343 14.66166, 121.12343 14.661..."
1,112141,way/874972048,"C.M. Recto Street, 20 Phase 1, Bonanza, Fortun...",3707,5,"POLYGON ((121.12282 14.66165, 121.12281 14.661..."
2,53649,way/871081677,"128 Santan, Marikina, Metro Manila, Philippines",1775,13,"POLYGON ((121.12559 14.65968, 121.12559 14.659..."
3,90212,way/874972048,"C.M. Recto Street, 20 Phase 1, Bonanza, Fortun...",2986,5,"POLYGON ((121.12332 14.66190, 121.12332 14.661..."
4,19982,way/16827525,"Unit 2128, Riverbank Center, Riverbanks Avenue...",653,3,"POLYGON ((121.08306 14.63121, 121.08306 14.631..."
...,...,...,...,...,...,...
111502,64936,way/120433106,"5 Shorthorn St, Marikina, 1800 Metro Manila, P...",2128,8,"POLYGON ((121.12277 14.64347, 121.12277 14.643..."
111503,90342,way/160697218,"16 St Bernadette, Marikina, 1803 Metro Manila,...",2991,2,"POLYGON ((121.08778 14.63300, 121.08778 14.633..."
111504,23247,way/119756773,"7 Topaz, Marikina, 1800 Rizal, Philippines",764,11,"POLYGON ((121.12089 14.63661, 121.12089 14.636..."
111505,83166,way/160697167,"1800 Liverpool, Marikina, 1800 Metro Manila, P...",2746,11,"POLYGON ((121.09053 14.63163, 121.09053 14.631..."


In [21]:
#only keep the first match in case of the same area value match
overlay_new_df_final= overlay_new_df_final.drop_duplicates(subset="segment_id", keep='first') 

## Filter out the buildings whose predicted polygons area is greater than 25%

In [24]:
#use the dataframe generated as the predicted polygons dataframe
building_gpd = overlay_new_df_final

# reproject the geodatagrame to get the area in terms of meters
reprojected_building_gpd = building_gpd.to_crs(crs=3857)

# get the area
reprojected_building_gpd['pred_poly_area'] = reprojected_building_gpd['geometry'].area

In [25]:
# get the sum of the area of the predicted polygons for each building
new_building_gpd = reprojected_building_gpd.groupby('building_id')['pred_poly_area'].sum().to_frame().reset_index()


Unnamed: 0,building_id,pred_poly_area
0,way/101673369,469.325851
1,way/102253416,25925.976766
2,way/102524380,800.422618
3,way/102524386,226.878903
4,way/102524393,943.777901
...,...,...
40089,way/97918667,1046.034436
40090,way/97918675,66.205981
40091,way/97918739,612.904012
40092,way/98395128,94.868122


In [26]:
poly_gdf = marikina_buildings_data

#reproject the marikina building polygons to 3857 to get the area in terms of meters
reprojected_poly_gdf = poly_gdf.to_crs(crs=3857)

#Keep only the id and geometry
new_poly_gdf = reprojected_poly_gdf[['id', 'geometry']]

#rename the id column to building id for joining later
new_poly_gdf = new_poly_gdf.rename(columns={"id": "building_id"})

#get the area of the marikina building data
new_poly_gdf['original_poly_area'] = new_poly_gdf['geometry'].area

In [31]:
#merge the two dataframes to create a dataframe that maps the building area to the predicted polygons area
merged_df = pd.merge(new_poly_gdf, new_building_gpd, on='building_id')

Unnamed: 0,building_id,geometry,original_poly_area,pred_poly_area
0,way/4392200,"POLYGON ((13480584.004 1647006.342, 13480552.0...",1598.149600,19.178672
1,way/4947814,"POLYGON ((13478644.040 1646590.871, 13478651.1...",15578.364263,9305.233317
2,way/4947816,"POLYGON ((13478762.216 1646842.037, 13478730.4...",43749.783744,38772.121233
3,way/5105906,"POLYGON ((13482505.457 1650642.085, 13482544.8...",69927.579938,47464.929726
4,way/5106137,"POLYGON ((13482883.286 1650411.731, 13482905.0...",1009.267343,748.235678
...,...,...,...,...
40089,way/930960295,"POLYGON ((13478899.484 1646836.963, 13478898.2...",82.930469,65.871663
40090,way/930960298,"POLYGON ((13478821.962 1646396.035, 13478814.8...",131.353450,136.735799
40091,way/931231232,"POLYGON ((13478556.821 1646268.619, 13478576.4...",229.184983,182.983034
40092,way/949706390,"POLYGON ((13480693.999 1645514.328, 13480703.3...",214.725364,198.317057


In [32]:
#get the buildings where the predicted polygons area is greater than the building polygon area
merged_df[merged_df['pred_poly_area'] > merged_df['original_poly_area']]

Unnamed: 0,building_id,geometry,original_poly_area,pred_poly_area
27,way/16826829,"POLYGON ((13480531.996 1646611.775, 13480562.4...",285.116436,370.267618
39,way/16858403,"POLYGON ((13480593.890 1646733.786, 13480595.3...",770.016070,930.884394
50,way/19718614,"POLYGON ((13483976.444 1650899.310, 13483992.4...",329.952521,519.105705
61,way/29344508,"POLYGON ((13480199.262 1646202.247, 13480200.7...",1146.535839,1162.566323
63,way/29344511,"POLYGON ((13480207.054 1646325.545, 13480207.2...",220.564276,255.340454
...,...,...,...,...
40080,way/907580937,"POLYGON ((13479734.581 1645639.612, 13479740.2...",31.360768,32.395352
40081,way/918409124,"POLYGON ((13481611.595 1651006.877, 13481605.3...",831.439880,834.723579
40083,way/918711338,"POLYGON ((13481802.864 1651570.024, 13481807.0...",136.352656,168.451373
40084,way/920097652,"POLYGON ((13482838.903 1651244.486, 13482840.6...",147.395331,149.417571


In [33]:
#define the percentage difference function to get the percentage difference between the two areas
def percentage_diff(a, b):
    return (a - b) / ((a + b) / 2.0) * 100

#apply the function on the dataframes
merged_df['pct_diff'] = percentage_diff(merged_df['original_poly_area'], merged_df['pred_poly_area'])

In [42]:
# total pred poly area greater than original poly area
overflow_df = merged_df[merged_df['pct_diff'] > 0]
# get the buildings where the percentage area difference is greater than 25% and store it in a dataframe
overflow_over_25 = overflow_df[overflow_df['pct_diff'] > 25]

#convert to list
overflow_over_25_list = overflow_over_25['building_id'].to_list()

#get the buildings not in the list
overlay_new_df_final_25= overlay_new_df_final[~overlay_new_df_final['building_id'].isin(overflow_over_25)]



In [43]:
# save the resulting file
overlay_new_df_final_25.to_file("building_assignment_output_filter25.geojson", driver="GeoJSON")