In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# This only works when the notebook is executed on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# imports
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import json
from tqdm import tqdm
import math

In [None]:
# helper function to find area of building footprint - adapted for python from mapbox's geojson-area https://github.com/mapbox/geojson-area/tree/master

# WGS84 constants
# ring_area multiplies its angular sum by RADIUS * RADIUS / 2, so the unit
# chosen here determines the unit of every area returned by the helpers.
RADIUS = 6378137  # Earth's radius in meters (from WGS84)

def polygon_area(geo):
    """Return the area of a polygon geometry, holes excluded.

    Args:
        geo: mapping with a 'coordinates' entry holding a sequence of rings.
            The first ring is treated as the outer boundary and every
            following ring as a hole.

    Returns:
        abs(area of first ring) minus abs(area of each remaining ring), in the
        units produced by ring_area. Returns 0 when 'coordinates' is empty.
    """
    rings = geo['coordinates']
    if not rings:
        return 0

    # the outer boundary contributes positively ...
    total = abs(ring_area(rings[0]))
    # ... and each hole is carved out of it
    for hole in rings[1:]:
        total -= abs(ring_area(hole))
    return total

def ring_area(coords):
    """Return the signed area enclosed by a single ring of coordinates.

    Each point is read as point[0], point[1] in degrees (longitude, latitude
    in GeoJSON order). For every consecutive triple of points, taken
    cyclically around the ring, the term

        (rad(next[0]) - rad(prev[0])) * sin(rad(middle[1]))

    is accumulated and the sum is scaled by RADIUS * RADIUS / 2.

    Args:
        coords: sequence of points making up the ring.

    Returns:
        The signed area (callers take abs()), or 0 if the ring has two or
        fewer points.
    """
    n_points = len(coords)
    if n_points <= 2:
        return 0

    total = 0
    for start in range(n_points):
        # modulo wraps the last two triples back to the start of the ring
        prev_pt = coords[start]
        mid_pt = coords[(start + 1) % n_points]
        next_pt = coords[(start + 2) % n_points]
        total += (rad(next_pt[0]) - rad(prev_pt[0])) * math.sin(rad(mid_pt[1]))

    return total * RADIUS * RADIUS / 2

def rad(angle):
    """Convert an angle from degrees to radians."""
    scaled = math.pi * angle
    return scaled / 180

In [None]:
# testing cell
def concatenate_data(folder: str, start: int = 30000, end=None):
  """Build and save the feature matrix and labels for one dataset split.

  Reads ``<folder>_set.geojson`` and, for every building in
  ``features[start:end]``, loads the street-view and aerial embeddings from
  ``.npy`` files named after the building's street address. Buildings for
  which either file cannot be opened are skipped. Every remaining building
  contributes one feature vector

      [streetview embedding, aerial embedding, _count, _mean, _variance, footprint area]

  and one binary label (1 if ``CURRENT_ENERGY_RATING`` is A, B, C or D, else
  0). The stacked arrays are written to
  ``final_<folder>_dataset_<start>-<end>.npz`` (keys ``X`` and ``y``), where
  ``<end>`` is the literal text ``end`` when ``end`` is None.

  Args:
    folder: name of the split (e.g. "train"); used for the geojson file name,
      the embedding directories and the output file name.
    start: index of the first geojson feature to process.
    end: index one past the last feature to process; None means "through the
      last feature".

  Raises:
    ValueError: if no building in the requested slice had both embeddings,
      so there is nothing to stack or save.
  """
  # paths
  BASE_PATH = "drive/MyDrive/semester 4/csci 1470: final project/"
  IMAGES_PATH = BASE_PATH + "datasetimages/" + folder + "/"
  FILTERED_STREETVIEW_EMBEDDINGS = IMAGES_PATH + "filtered_streetview_embeddings/"
  AERIAL_EMBEDDINGS = IMAGES_PATH + "aerial_extracted_npy/"
  GEOJSON_PATH = BASE_PATH + folder + "_set.geojson"
  print(FILTERED_STREETVIEW_EMBEDDINGS)
  print(GEOJSON_PATH)

  # ratings that count as the positive class
  energy_efficient = {"A", "B", "C", "D"}

  # load the geojson features
  with open(GEOJSON_PATH) as f:
    features = json.load(f)["features"]

  feature_vectors = []
  ground_truth_labels = []
  skipped_count = 0

  # for every building in the requested slice of the geojson
  for feature in tqdm(features[start:end]):
    properties = feature["properties"]

    # embedding files are named after the sanitised street address
    address = properties["STREET_ADDRESS"].replace("/", "-").replace(",", "").replace(" ", "_")
    streetview_embedding_path = FILTERED_STREETVIEW_EMBEDDINGS + address + ".npy"
    aerial_embedding_path = AERIAL_EMBEDDINGS + address + ".npy"

    # if either file cannot be opened, skip this building
    try:
      streetview_embedding = np.load(streetview_embedding_path)
      aerial_embedding = np.load(aerial_embedding_path)
    except OSError:
      skipped_count += 1
      continue

    # flatten each embedding to 1-D (expected to be (2048,) each - TODO confirm)
    streetview_embedding = streetview_embedding.reshape(-1)
    aerial_embedding = aerial_embedding.reshape(-1)

    # lst statistics from the geojson, in the order (count, mean, variance)
    # NOTE(review): "lst" is presumably land surface temperature - confirm
    lst = np.array([properties["_count"], properties["_mean"], properties["_variance"]])

    # footprint area of THIS building (previously features[0] was used, which
    # gave every row the area of the first building in the file)
    area = np.array([polygon_area(feature["geometry"])])

    feature_vectors.append(
        np.concatenate((streetview_embedding, aerial_embedding, lst, area)))
    ground_truth_labels.append(
        1 if properties["CURRENT_ENERGY_RATING"] in energy_efficient else 0)

  print(f"len feature vectors: {len(feature_vectors)}")
  print(f"len ground truth: {len(ground_truth_labels)}")
  print(len(feature_vectors), skipped_count)

  if not feature_vectors:
    raise ValueError(
        f"no building in features[{start}:{end}] had both embeddings; nothing to save")

  # save file to drive
  X = np.stack(feature_vectors)
  y = np.array(ground_truth_labels)
  range_tag = f"{start}-{'end' if end is None else end}"
  save_path = BASE_PATH + 'final_' + folder + '_dataset_' + range_tag + '.npz'
  np.savez(save_path, X=X, y=y)

In [None]:
# run concatenation - test complete
folder = "train"
# Uncomment to rebuild a chunk from the raw embeddings.
# NOTE: concatenate_data saves to final_<folder>_dataset_30000-end.npz by
# default, which is a different file from the "_2" chunk inspected below.
# concatenate_data(folder)

final_file = "drive/MyDrive/semester 4/csci 1470: final project/final_" + folder + "_dataset_2.npz"

# np.load on an .npz returns an NpzFile that keeps the archive open; the
# context manager closes it once both arrays have been read into memory.
# Arrays are accessed by the names they were saved with.
with np.load(final_file) as data:
  X_loaded = data['X']
  y_loaded = data['y']

# arrays will be exactly as saved
print(X_loaded.shape, y_loaded.shape)  # Should match original shapes
print(X_loaded[0].shape)
print(y_loaded[0:5])

(4275, 4100) (4275,)
(4100,)
[1 0 0 1 1]


In [None]:
# final concatenation: merge the numbered chunk files into one dataset file
folder = "train"
dataset_prefix = "drive/MyDrive/semester 4/csci 1470: final project/final_" + folder + "_dataset"
final_file = dataset_prefix + ".npz"

# Chunks are stored as final_<folder>_dataset_<i>.npz for i = 1..7.
# Every chunk path is derived from `folder` (the first chunk used to be
# hard-coded to "train"), so switching the split cannot mix data sets.
X_chunks = []
y_chunks = []
for i in range(1, 8):
  file_name = dataset_prefix + "_" + str(i) + ".npz"
  print(file_name)

  # the NpzFile holds the archive open until closed; both arrays are fully
  # read inside the with-block, so they remain valid afterwards
  with np.load(file_name) as data:
    X_loaded = data['X']
    y_loaded = data['y']

  print(X_loaded.shape)
  X_chunks.append(X_loaded)
  y_chunks.append(y_loaded)

# concatenate once instead of re-copying the growing array for every chunk
feature_vectors = np.concatenate(X_chunks)
ground_truth_labels = np.concatenate(y_chunks)

print(feature_vectors.shape)
print(ground_truth_labels.shape)

# every feature row must have exactly one label
assert feature_vectors.shape[0] == ground_truth_labels.shape[0], "X / y row count mismatch"

np.savez(final_file, X=feature_vectors, y=ground_truth_labels)

drive/MyDrive/semester 4/csci 1470: final project/final_train_dataset_2.npz
(4275, 4100)
(8612, 4100)
(8612,)
drive/MyDrive/semester 4/csci 1470: final project/final_train_dataset_3.npz
(4328, 4100)
(12940, 4100)
(12940,)
drive/MyDrive/semester 4/csci 1470: final project/final_train_dataset_4.npz
(4339, 4100)
(17279, 4100)
(17279,)
drive/MyDrive/semester 4/csci 1470: final project/final_train_dataset_5.npz
(4360, 4100)
(21639, 4100)
(21639,)
drive/MyDrive/semester 4/csci 1470: final project/final_train_dataset_6.npz
(4296, 4100)
(25935, 4100)
(25935,)
drive/MyDrive/semester 4/csci 1470: final project/final_train_dataset_7.npz
(1987, 4100)
(27922, 4100)
(27922,)
