In [5]:
import io
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import folium
import selenium
import h5py

from PIL import Image
import cv2

In [6]:
# One row per mesh cell, restricted to the columns needed to render it.
# NOTE(security): unpickling can execute arbitrary code -- only load pickle
# files that were produced by this project.
all_features = (
    pd.read_pickle(r'../data/all_features.pkl')
    .loc[:, ['mesh_id', 'latitude', 'longitude', 'area', 'prefecture']]
    # A stable sort keeps duplicate mesh_ids in file order, so `keep='first'`
    # below always retains the same row (the default quicksort gives no such
    # guarantee for ties).
    .sort_values(by='mesh_id', kind='stable')
    .drop_duplicates(subset=['mesh_id'], keep='first')
    # Reset last so the final index is contiguous (0..n-1) after de-duplication.
    .reset_index(drop=True)
)

reports_selected = pd.read_csv(r'../data/reports_selected.csv')

In [7]:
# Extent of one mesh cell in degrees. Despite the names, `lat_width` is the
# latitude span (75 / 9000 = 1/120 deg = 30 arc-seconds) and `lon_height` the
# longitude span (0.0125 deg = 45 arc-seconds); preprocessed_image_from_coords
# adds them to a cell's south-west corner to obtain its north-east corner.
lat_width, lon_height = 75 / 9000, 0.0125

# Latitude / longitude of a reference point -- presumably the south-west corner
# of the study area; not used by the cells shown here (TODO confirm).
bottom_lat, bottom_lon = 35.7015082, 139.5221197

In [8]:
def gradient(img):
    """Return the morphological gradient of `img` computed with a 5x5 all-ones kernel."""
    return cv2.morphologyEx(img, cv2.MORPH_GRADIENT, np.ones((5, 5), dtype=np.uint8))

def erosion(img):
    """Erode `img` once (a single iteration) with a 3x3 all-ones structuring element."""
    structuring_element = np.ones((3, 3), dtype=np.uint8)
    return cv2.erode(img, structuring_element, iterations=1)

In [9]:
def _render_mesh_png(south, west, north, east, render_delay):
    """Draw the mesh cell outline on a dark, label-free folium map and return a PNG screenshot as bytes."""
    map_obj = folium.Map(
        location=[(north + south) / 2, (east + west) / 2],
        tiles='Cartodb dark_matter no_labels',
    )

    # Closed ring around the cell. No colour is passed, so folium's default
    # stroke is used; the HSV mask in _crop_inside_blue_box expects it to be blue.
    folium.PolyLine(
        locations=[
            (south, west),
            (north, west),
            (north, east),
            (south, east),
            (south, west),
        ],
        weight=7,
    ).add_to(map_obj)

    map_obj.fit_bounds([(south, west), (north, east)])

    # `_to_png` is a private folium helper; it is understood to render the map
    # in a browser via selenium and wait `render_delay` seconds before the
    # screenshot (TODO confirm against the installed folium version).
    return map_obj._to_png(render_delay)


def _crop_inside_blue_box(image_bgr, border_thickness=8):
    """Locate the largest blue contour in a BGR image and return the pixels inside its outline."""
    hsv = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2HSV)
    blue_mask = cv2.inRange(hsv, np.array([100, 150, 50]), np.array([140, 255, 255]))

    contours, _ = cv2.findContours(blue_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        raise ValueError("No blue box detected in the image.")

    x, y, w, h = cv2.boundingRect(max(contours, key=cv2.contourArea))

    # Step `border_thickness` pixels inwards on every side so the outline itself
    # is excluded; sorting keeps the slice bounds ordered even when the box is
    # narrower than twice the border.
    x_lo, x_hi = sorted((x + border_thickness, x + w - border_thickness))
    y_lo, y_hi = sorted((y + border_thickness, y + h - border_thickness))

    cropped = image_bgr[y_lo:y_hi, x_lo:x_hi]
    if cropped.size == 0:
        # Fail with a readable message instead of an opaque OpenCV assertion
        # from the colour conversion that follows.
        raise ValueError("Detected blue box is too small to crop.")
    return cropped


def _edge_map(cropped_bgr):
    """Boost local contrast with CLAHE, apply the morphological gradient and return Canny edges."""
    # Convert the BGR crop straight to LAB. Going through RGB first and then
    # using COLOR_BGR2LAB would swap red and blue in the lightness channel that
    # CLAHE equalises.
    lab = cv2.cvtColor(cropped_bgr, cv2.COLOR_BGR2LAB)
    l_channel, a, b = cv2.split(lab)

    clahe = cv2.createCLAHE(clipLimit=15.0, tileGridSize=(8, 8))
    equalised = cv2.merge((clahe.apply(l_channel), a, b))

    enhanced = gradient(cv2.cvtColor(equalised, cv2.COLOR_LAB2BGR))
    return cv2.Canny(enhanced, 50, 150)


def preprocessed_image_from_coords(lat, lon, resize=True, render_delay=1):
    """Render one mesh cell on a map and return an edge image of its interior.

    The cell spans from (`lat`, `lon`) to (`lat + lat_width`, `lon + lon_height`),
    using the module-level `lat_width` / `lon_height` constants. The cell is
    outlined on a dark folium map, the map is screenshotted, the area inside the
    outline is cropped, contrast-enhanced and passed through Canny edge detection.

    Parameters
    ----------
    lat, lon : float
        Latitude / longitude of the cell's south-west corner, in degrees.
    resize : bool, default True
        If True, resize the edge image to 256x256 with area interpolation;
        otherwise return it at the cropped resolution.
    render_delay : default 1
        Value forwarded to folium's `_to_png` (the wait before the screenshot is
        taken). Increase it if tiles are not fully loaded in the screenshots.

    Returns
    -------
    numpy.ndarray
        The Canny edge image, resized when `resize` is True.

    Raises
    ------
    ValueError
        If no blue outline is found in the screenshot, or the detected box is
        too small to leave any pixels after removing its border.
    """
    north, east = lat + lat_width, lon + lon_height

    png_bytes = _render_mesh_png(lat, lon, north, east, render_delay)
    image_np = np.array(Image.open(io.BytesIO(png_bytes)))

    # OpenCV works in BGR order; drop the alpha channel when the screenshot has one.
    if image_np.shape[-1] == 4:
        image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGBA2BGR)
    else:
        image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)

    edges = _edge_map(_crop_inside_blue_box(image_bgr))

    if resize:
        return cv2.resize(edges, (256, 256), interpolation=cv2.INTER_AREA)
    return edges

In [10]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [11]:
def process_row(row):
    """Build the edge-image feature for one mesh row.

    `row` must expose `latitude` and `longitude` attributes (for example a
    named tuple produced by `DataFrame.itertuples`).
    """
    return preprocessed_image_from_coords(row.latitude, row.longitude)

def create_hdf5_dataset(df, dataset_name='features', target_shape=(256, 256), batch_size=100,
                        max_workers=None):
    """Render every row of `df` with `process_row` and store the images in an HDF5 file.

    The file is written to ``../datasets/<dataset_name>.h5`` and contains one
    gzip-compressed uint8 dataset, also named `dataset_name`, of shape
    ``(len(df), *target_shape)``. Rows are processed in batches of `batch_size`;
    each batch is rendered by a thread pool and written before the next starts.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to render; `process_row` reads `latitude` and `longitude` from each.
    dataset_name : str, default 'features'
        Used for both the file name and the dataset name inside the file.
    target_shape : tuple of int, default (256, 256)
        Shape of one stored image. It must match what `process_row` returns;
        this function does not resize.
    batch_size : int, default 100
        Number of rows rendered and written per batch.
    max_workers : int or None, default None
        Thread-pool size; None keeps the executor's default. Each task renders a
        map screenshot, so a smaller pool may be needed on limited machines.

    Raises
    ------
    ValueError
        If a rendered batch does not have shape ``(rows_in_batch, *target_shape)``.

    Notes
    -----
    Any exception raised while rendering a row stops the run; batches written
    before that point remain in the file.
    """
    num_samples = len(df)
    target_shape = tuple(target_shape)
    hdf5_filename = fr'../datasets/{dataset_name}.h5'

    # h5py does not create missing parent directories.
    os.makedirs(os.path.dirname(hdf5_filename), exist_ok=True)

    with h5py.File(hdf5_filename, 'w') as hdf5_file:
        dataset = hdf5_file.create_dataset(
            dataset_name,
            shape=(num_samples, *target_shape),
            dtype=np.uint8,
            compression='gzip',
            compression_opts=9
        )

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for batch_start in tqdm(range(0, num_samples, batch_size), desc="Processing Batches"):
                batch_end = min(batch_start + batch_size, num_samples)
                batch_rows = df.iloc[batch_start:batch_end]

                # executor.map preserves input order, so results line up with the rows.
                batch_results = list(executor.map(process_row, batch_rows.itertuples(index=False)))
                batch_array = np.asarray(batch_results, dtype=np.uint8)

                # Check explicitly so a mismatch is reported in terms of
                # `target_shape` rather than as an h5py broadcast failure.
                expected_shape = (batch_end - batch_start, *target_shape)
                if batch_array.shape != expected_shape:
                    raise ValueError(
                        f"Rendered batch has shape {batch_array.shape}, expected {expected_shape}; "
                        "target_shape must match the images returned by process_row."
                    )

                dataset[batch_start:batch_end] = batch_array

    print(f"Features dataset saved as {hdf5_filename}")

In [None]:
# Render one 256x256 edge image per mesh in `all_features` and write them,
# 100 rows per batch, to ../datasets/features1.h5.
create_hdf5_dataset(all_features, dataset_name='features1', target_shape=(256, 256), batch_size=100)

Processing Batches:   0%|          | 0/153 [00:00<?, ?it/s]