## **II. Building Footprint analysis**

### **1. Overview of dataset**

In [1]:
import geopandas as gpd

# Read the building-footprint GeoJSON export into `gdf_buildings`.
# NOTE(review): the path contains a doubled slash ("input//"); this presumably
# still resolves to the same file — confirm before normalizing it.
file_path = "../input//Building Footprints_20250131.geojson"
gdf_buildings = gpd.read_file(file_path)

In [2]:
# Peek at the first rows to see which attribute columns the file provides.
print(gdf_buildings.head())

   name    base_bbl shape_area   heightroof  mpluto_bbl cnstrct_yr  \
0  None  3065220021        0.0  29.74985318  3065220021       1925   
1  None  5012640036        0.0        22.63  5012640036       1965   
2  None  5060190091        0.0        35.76  5060190091       1970   
3  None  3086910048        0.0         37.5  3086910048       1928   
4  None  4075020005        0.0  18.01511294  4075020005       1950   

                                 globalid   lststatype feat_code groundelev  \
0  {31298F86-3088-4F53-B3DB-71A9EFA6FA1F}  Constructed      2100         40   
1  {F5F8CDA5-69E2-46F8-8F69-BA95C025B520}  Constructed      2100         39   
2  {9F644794-F72C-4582-9E5E-B337E2B97068}  Constructed      2100         51   
3  {F916B22D-E25B-44AE-9FA9-2A51191B9CDF}  Constructed      2100          6   
4  {525F2C24-616B-4F29-98A3-8FEA5D4B1A7D}  Constructed      2100         93   

   geomsource      bin lstmoddate doitt_id shape_len  \
0  Photogramm  3170958 2017-08-22    96807      

In [3]:
# List the dtype of every column to spot fields that still need casting.
print(gdf_buildings.dtypes)

name                  object
base_bbl              object
shape_area            object
heightroof            object
mpluto_bbl            object
cnstrct_yr            object
globalid              object
lststatype            object
feat_code             object
groundelev            object
geomsource            object
bin                   object
lstmoddate    datetime64[ms]
doitt_id              object
shape_len             object
geometry            geometry
dtype: object


In [5]:
import pandas as pd
# Data Type Conversion
# These attributes hold numbers but arrive as object dtype; cast them column by
# column, letting errors='coerce' turn anything unparseable into NaN.
cols_to_convert = ['shape_area', 'heightroof', 'cnstrct_yr', 'groundelev']
gdf_buildings[cols_to_convert] = gdf_buildings[cols_to_convert].apply(
    pd.to_numeric, errors='coerce'
)

# Parse the last-modified date as well; values that cannot be parsed become NaT.
gdf_buildings['lstmoddate'] = pd.to_datetime(
    gdf_buildings['lstmoddate'],
    errors='coerce',
)

In [7]:
# Descriptive statistics (count, mean, quartiles, std) for the numeric and datetime columns
print(gdf_buildings.describe())

       shape_area    heightroof    cnstrct_yr    groundelev  \
count   1082896.0  1.082894e+06  1.072674e+06  1.082347e+06   
mean          0.0  2.775331e+01  1.939697e+03  5.512668e+01   
min           0.0  0.000000e+00  1.652000e+03 -1.600000e+01   
25%           0.0  1.853000e+01  1.920000e+03  2.500000e+01   
50%           0.0  2.614000e+01  1.930000e+03  4.600000e+01   
75%           0.0  3.116000e+01  1.955000e+03  7.400000e+01   
max           0.0  1.550000e+03  2.025000e+03  1.335000e+03   
std           0.0  2.215500e+01  2.954712e+01  4.154842e+01   

                       lstmoddate  
count                     1082896  
mean   2017-10-01 21:27:30.534000  
min           2005-02-16 00:00:00  
25%           2017-08-22 00:00:00  
50%           2017-08-22 00:00:00  
75%           2017-08-22 00:00:00  
max           2025-01-24 00:00:00  
std                           NaN  


In [9]:
# Missing value check: number of NaN entries in each column
print(gdf_buildings.isna().sum())

name          1080658
base_bbl            1
shape_area          0
heightroof          2
mpluto_bbl          1
cnstrct_yr      10222
globalid            0
lststatype        342
feat_code           0
groundelev        549
geomsource        311
bin                 1
lstmoddate          0
doitt_id            0
shape_len           0
geometry            0
dtype: int64


In [10]:
# CRS Check: show the coordinate reference system currently attached to the layer
print(gdf_buildings.crs)

EPSG:4326


### **2. Feature engineering**

In [14]:
# 1. Project the footprints into EPSG:2263 (NAD83 / New York Long Island, ftUS)
#    so that planar measurements are expressed in feet.
gdf_buildings = gdf_buildings.to_crs("EPSG:2263")

# 2. Footprint area measured in the projected CRS, i.e. in square feet.
gdf_buildings['calculated_area_sqft'] = gdf_buildings.geometry.area

# 3. Same area in square meters, using the rounded factor 1 sq ft = 0.092903 sq m.
gdf_buildings['calculated_area_sqm'] = 0.092903 * gdf_buildings['calculated_area_sqft']

In [15]:
# Sanity-check the first few computed footprint areas (square meters).
print(gdf_buildings["calculated_area_sqm"].head())

0    152.188049
1     70.012084
2     66.792573
3    108.479500
4    163.017750
Name: calculated_area_sqm, dtype: float64


In [16]:
# 4. Outlier handling: cap each column at its own 99.9th percentile.
#    Values above the threshold are pulled down to it; NaN entries are left
#    untouched and everything at or below the threshold is unchanged.
max_height = gdf_buildings['heightroof'].quantile(0.999)
gdf_buildings['heightroof'] = gdf_buildings['heightroof'].clip(upper=max_height)

max_elevation = gdf_buildings['groundelev'].quantile(0.999)
gdf_buildings['groundelev'] = gdf_buildings['groundelev'].clip(upper=max_elevation)

In [17]:
# Summarize the area, roof-height and ground-elevation columns after the capping step.
print(gdf_buildings[['shape_area', 'calculated_area_sqft', 'calculated_area_sqm', 'heightroof', 'groundelev']].describe())

       shape_area  calculated_area_sqft  calculated_area_sqm    heightroof  \
count   1082896.0          1.082896e+06         1.082896e+06  1.082894e+06   
mean          0.0          1.608273e+03         1.494134e+02  2.760884e+01   
std           0.0          5.664603e+03         5.262586e+02  1.917501e+01   
min           0.0          3.247649e+01         3.017164e+00  0.000000e+00   
25%           0.0          6.539575e+02         6.075461e+01  1.853000e+01   
50%           0.0          9.591694e+02         8.910972e+01  2.614000e+01   
75%           0.0          1.323632e+03         1.229694e+02  3.116000e+01   
max           0.0          1.171874e+06         1.088707e+05  2.981297e+02   

         groundelev  
count  1.082347e+06  
mean   5.509129e+01  
std    4.129722e+01  
min   -1.600000e+01  
25%    2.500000e+01  
50%    4.600000e+01  
75%    7.400000e+01  
max    3.100000e+02  


In [18]:
# Re-count NaN entries per column before deciding how to handle missing data.
print(gdf_buildings.isna().sum())

name                    1080658
base_bbl                      1
shape_area                    0
heightroof                    2
mpluto_bbl                    1
cnstrct_yr                10222
globalid                      0
lststatype                  342
feat_code                     0
groundelev                  549
geomsource                  311
bin                           1
lstmoddate                    0
doitt_id                      0
shape_len                     0
geometry                      0
calculated_area               0
calculated_area_sqft          0
calculated_area_sqm           0
dtype: int64


***Observation:***
The following approach is suggested for handling the missing data:

- Remove the following columns: name, base_bbl, mpluto_bbl, cnstrct_yr, doitt_id, geomsource, bin. They will not be used in the final analysis or model building, so their missing values do not need to be imputed.

- Impute the missing values in heightroof and groundelev with the column median.

In [None]:
# Columns that are not needed for the downstream analysis.
# NOTE(review): the recorded outputs further down still list 'lstmoddate' and
# 'calculated_area_sqft', so they appear to predate this list — re-run to confirm.
columns_to_remove = [
    'name', 'base_bbl', 'mpluto_bbl', 'cnstrct_yr', 'doitt_id', 'geomsource',
    'bin', 'lststatype', 'shape_len', 'globalid', 'feat_code', 'lstmoddate',
    'calculated_area_sqft',
]
# errors='ignore' skips names that are already gone, which keeps the cell
# re-runnable but also hides a misspelled column name.
gdf_buildings.drop(columns=columns_to_remove, errors='ignore', inplace=True)

# Medians of the two columns that still contain gaps.
median_height = gdf_buildings['heightroof'].median()
median_elevation = gdf_buildings['groundelev'].median()

# Fill only the missing entries of each column with that column's median.
gdf_buildings.loc[gdf_buildings['heightroof'].isna(), 'heightroof'] = median_height
gdf_buildings.loc[gdf_buildings['groundelev'].isna(), 'groundelev'] = median_elevation

In [23]:
# Check missing values again: after dropping and imputing, every remaining
# column is expected to report 0.
print(gdf_buildings.isna().sum())

shape_area              0
heightroof              0
groundelev              0
lstmoddate              0
geometry                0
calculated_area         0
calculated_area_sqft    0
calculated_area_sqm     0
dtype: int64


In [24]:
# Display the cleaned dataframe info.
# info() writes its report straight to stdout and returns None, so it is called
# on its own; wrapping it in print() would append a stray "None" line.
gdf_buildings.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1082896 entries, 0 to 1082895
Data columns (total 8 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   shape_area            1082896 non-null  float64       
 1   heightroof            1082896 non-null  float64       
 2   groundelev            1082896 non-null  float64       
 3   lstmoddate            1082896 non-null  datetime64[ms]
 4   geometry              1082896 non-null  geometry      
 5   calculated_area       1082896 non-null  float64       
 6   calculated_area_sqft  1082896 non-null  float64       
 7   calculated_area_sqm   1082896 non-null  float64       
dtypes: datetime64[ms](1), float64(6), geometry(1)
memory usage: 66.1 MB
None


In [25]:
# Confirm which CRS the cleaned frame carries.
print(gdf_buildings.crs)

EPSG:2263


**Observation:**

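Here is what we have achieved:
- Converted the building footprint CRS to EPSG:2263.
- Recomputed the footprint area from the geometry in square meters (calculated_area_sqm), because the original shape_area column contains only zeros.
- Handled outliers in heightroof and groundelev by capping them at the 99.9th percentile.
- Imputed missing values in heightroof and groundelev with the median.
- Dropped unnecessary columns.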