## Importing Libraries

In [5]:
import os
import json
import pandas as pd
import numpy as np
import cv2
import boto3
import glob
import matplotlib.pyplot as plt
from skimage import io
from shapely import wkt
from shapely.geometry import mapping
from collections import defaultdict

import warnings
warnings.filterwarnings("ignore")

## Download images from AWS S3 bucket

In [6]:
# Name of the S3 bucket that S3_Download lists and downloads from.
S3_Bucket_Name = 'hurricaneimagebucket'
# Local root folder for the .png image files (relative path).
Image_Directory = '../Images'
# Local root folder for the .json data files (relative path).
Data_Directory = '../Data'

In [7]:
# Initialize S3 client
# Module-level client: S3_Download reads this global for both listing and downloading,
# so this line must run before S3_Download is called.
# No credentials are passed explicitly; boto3 is expected to resolve them itself.
s3_client = boto3.client('s3')

# S3 Bucket Download Function for Images and Data with Pagination
def S3_Download(S3_Bucket_Name, Image_Directory, Data_Directory):
    """Download every .png and .json object of a bucket to local folders.

    The bucket listing is fetched page by page through the module-level
    ``s3_client``. Keys ending in ``.png`` are saved under ``Image_Directory``
    and keys ending in ``.json`` under ``Data_Directory``; all other keys are
    ignored. The full object key is appended to the destination folder, so any
    prefix in the key becomes a local sub-folder.

    Args:
        S3_Bucket_Name: Name of the bucket to list and download from.
        Image_Directory: Local root folder for ``.png`` objects.
        Data_Directory: Local root folder for ``.json`` objects.

    Returns:
        None. Files are written to disk as a side effect.
    """
    # Accepted key suffixes and the local root each one is stored under.
    destinations = (
        ('.png', Image_Directory),   # Image file
        ('.json', Data_Directory),   # Data file
    )
    next_token = None

    while True:
        # Build the listing request; the token is only sent once a previous
        # page has handed one back.
        request = {'Bucket': S3_Bucket_Name}
        if next_token:
            request['ContinuationToken'] = next_token
        page = s3_client.list_objects_v2(**request)

        if 'Contents' in page:
            for entry in page['Contents']:
                key = entry['Key']

                local_root = next(
                    (root for suffix, root in destinations if key.endswith(suffix)),
                    None,
                )
                if local_root is None:
                    # Neither an image nor a data file: skip it.
                    continue

                local_path = os.path.join(local_root, key)

                # Make sure the destination folder exists before downloading.
                os.makedirs(os.path.dirname(local_path), exist_ok=True)
                s3_client.download_file(S3_Bucket_Name, key, local_path)

        # Stop once the listing reports that no further pages remain.
        if not page.get('IsTruncated'):
            break
        next_token = page['NextContinuationToken']

# Uncomment to download the image and JSON files from S3 into the local directories.
#S3_Download(S3_Bucket_Name, Image_Directory, Data_Directory)

In [8]:
# Count every file found beneath a folder, including files in nested sub-folders.
def _count_files(root_dir):
    total = 0
    for _, _, file_names in os.walk(root_dir):
        total += len(file_names)
    return total

image_file_count = _count_files(Image_Directory)
data_file_count = _count_files(Data_Directory)

# Report both totals so the image and data folders can be compared at a glance.
print(f"Number of files in {Image_Directory}: {image_file_count}")
print(f"Number of files in {Data_Directory}: {data_file_count}")

Number of files in ../Images: 2438
Number of files in ../Data: 2438


The xBD dataset provides annotated high-resolution satellite imagery for assessing building damage. It consists of image files and JSON files that hold each image's metadata and annotations. This project focuses on analyzing the pre- and post-disaster imagery related to hurricanes.

## Data Loading

In [1]:
# Function to read JSON files
def load_JSON_data(Data_Directory):
    """Load hurricane JSON files into pre- and post-disaster DataFrames.

    Every regular file directly inside ``Data_Directory`` is parsed as JSON
    (sub-directories are skipped, the scan is not recursive). A file is kept
    only when its ``metadata['disaster']`` value contains "hurricane"
    (case-insensitive). Kept records are routed by their image name: names
    containing "pre" go to the first frame, otherwise names containing "post"
    go to the second; names containing neither are dropped.

    The pre/post routing happens only for files that passed the hurricane
    filter, so a non-hurricane file can neither raise on unbound variables nor
    re-append the record of the previously read hurricane file.

    Args:
        Data_Directory: Folder containing the JSON files.

    Returns:
        tuple: ``(hurricane_pre_df, hurricane_post_df)``, one row per kept
        file. Columns are ``img_name``, ``lng_lat``, ``xy`` followed by the
        copied metadata fields; missing fields are stored as ``None``. A frame
        with no matching files is an empty DataFrame.

    Raises:
        json.JSONDecodeError: If a file in the folder is not valid JSON.
        KeyError: If a file has no ``metadata`` section, or a hurricane file
            has no ``features`` section.
    """
    # Metadata keys copied verbatim into each record, in output-column order.
    metadata_fields = (
        'sensor', 'provider_asset_type', 'gsd', 'capture_date',
        'off_nadir_angle', 'pan_resolution', 'sun_azimuth', 'sun_elevation',
        'target_azimuth', 'disaster', 'disaster_type', 'catalog_id',
        'original_width', 'original_height', 'width', 'height', 'id',
    )

    pre_data = []
    post_data = []

    for filename in os.listdir(Data_Directory):
        file_path = os.path.join(Data_Directory, filename)

        # Ensure the path is a file, not a directory
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r') as file:
            content = json.load(file)

        metadata = content['metadata']

        # Filter by hurricane natural disaster only
        disaster = metadata.get('disaster')
        if not (disaster and "hurricane" in disaster.lower()):
            continue

        img_name = metadata.get('img_name', "")
        features = content['features']
        record = {
            'img_name': img_name,
            'lng_lat': features.get('lng_lat'),
            'xy': features.get('xy'),
        }
        record.update((field, metadata.get(field)) for field in metadata_fields)

        # Separate pre and post records; a null image name matches neither.
        name_lower = (img_name or "").lower()
        if "pre" in name_lower:
            pre_data.append(record)
        elif "post" in name_lower:
            post_data.append(record)

    hurricane_pre_df = pd.DataFrame(pre_data)
    hurricane_post_df = pd.DataFrame(post_data)

    return hurricane_pre_df, hurricane_post_df

In [2]:
# Function to extract pre- and post-disaster images
def extract_images(image_folder) :
    """Collect the paths of pre- and post-disaster PNG images in a folder.

    Only entries directly inside ``image_folder`` are considered (the scan is
    not recursive) and only those ending in ``.png``. An image is classified
    by its file name alone: names containing "pre" (case-insensitive) are
    pre-disaster, otherwise names containing "post" are post-disaster, and
    names containing neither are ignored. The folder part of the path is
    deliberately excluded from the check so that a directory such as
    ``preprocessing/`` cannot push every image into the pre-disaster list.

    Args:
        image_folder: Folder that holds the image files.

    Returns:
        tuple: ``(pre_images, post_images)``, two lists of paths in the form
        produced by ``glob`` (``image_folder`` joined with the file name).
    """
    # List to store pre- and post-hurricane images
    pre_images, post_images = [], []

    print("Retrieving pre and post disaster images from:", image_folder)

    for image in glob.iglob(f'{image_folder}/*') :
        if not image.endswith(".png") :
            continue

        # Classify on the file name only, never on parent directory names.
        file_name = os.path.basename(image).lower()
        if "pre" in file_name:
            pre_images.append(image)
        elif "post" in file_name:
            post_images.append(image)

    return pre_images, post_images

In [3]:
def prepare_data(images_dir, json_dir):
    """Load the JSON metadata and the image path lists in one call.

    Runs ``load_JSON_data`` on ``json_dir`` and then ``extract_images`` on
    ``images_dir``, and prints how many pre- and post-disaster images were
    found.

    Args:
        images_dir: Folder passed to ``extract_images``.
        json_dir: Folder passed to ``load_JSON_data``.

    Returns:
        tuple: ``(pre_df, post_df, pre_image_paths, post_image_paths)``, the
        two DataFrames followed by the two image path lists.
    """
    pre_df, post_df = load_JSON_data(json_dir)
    pre_paths, post_paths = extract_images(images_dir)

    # Report the size of each image list, pre first, then post.
    for stage, paths in (("pre", pre_paths), ("post", post_paths)):
        print(f"\nTotal {stage}-disaster images: {len(paths)}")

    return pre_df, post_df, pre_paths, post_paths