In [1]:
import io
import os
from datetime import datetime

import pandas as pd
import numpy as np

import h5py


In [2]:
# Mesh table: sorted by mesh_id, trimmed to the columns used below, and
# de-duplicated so each mesh_id appears once. The index is reset right after
# sorting, so rows dropped as duplicates leave gaps in it.
# NOTE(review): read_pickle can execute arbitrary code — only load trusted files.
all_features = (
    pd.read_pickle(r'../data/all_features.pkl')
    .sort_values(by='mesh_id')
    .reset_index(drop=True)
    .loc[:, ['mesh_id', 'latitude', 'longitude', 'area', 'prefecture']]
    .drop_duplicates(subset=['mesh_id'], keep='first')
)

# Individual reports; each row carries its own latitude/longitude and mesh_id.
reports_selected = pd.read_csv(r'../data/reports_selected.csv')

In [3]:
# Extent of one mesh along latitude / longitude, in the same units as the
# latitude/longitude columns. The rasterisation cell divides a report's offset
# from its mesh origin by these values to get a fraction of the mesh.
# NOTE(review): despite the names, `lat_width` is the latitudinal extent and
# `lon_height` the longitudinal one.
lat_width, lon_height = 75 / 9000, 0.0125

# NOTE(review): not referenced by any other visible cell; presumably a
# reference corner of the study area — TODO confirm or remove.
bottom_lat, bottom_lon = 35.7015082, 139.5221197

In [4]:
# Keep only the meshes whose prefecture label is 'tokyo', then display them.
all_features = all_features.loc[lambda df: df['prefecture'].isin(['tokyo'])]
all_features

Unnamed: 0,mesh_id,latitude,longitude,area,prefecture
2888,53392307,35.50000,139.4625,1047954,tokyo
2889,53392308,35.50000,139.4750,1047946,tokyo
2898,53392317,35.50833,139.4625,1047847,tokyo
2907,53392326,35.51667,139.4500,1047748,tokyo
2908,53392327,35.51667,139.4625,1047740,tokyo
...,...,...,...,...,...
5079,53396107,35.83333,139.2125,1043833,tokyo
5080,53396108,35.83333,139.2250,1043824,tokyo
5081,53396112,35.84167,139.1500,1043773,tokyo
5128,53396201,35.83333,139.2625,1043796,tokyo


In [5]:
# Keep only the reports that fall in a mesh present in the filtered mesh table,
# ordered by mesh_id.
# Some of the entries below aren't in Tokyo, which might be due to a
# mislabeling of the prefectures that the meshes are located in.
reports_selected = (
    reports_selected
    .loc[lambda df: df['mesh_id'].isin(all_features['mesh_id'].tolist())]  # 31191 entries
    .sort_values(by='mesh_id')
)

# Alternative filter on the prefecture-code column (31385 entries):
# reports_selected = reports_selected[reports_selected['都道府県コード'].isin([30])]
reports_selected

Unnamed: 0,mesh_id,latitude,longitude,都道府県コード
16575,53392307,35.506634,139.471010,30
16595,53392307,35.500013,139.468010,45
16596,53392307,35.505952,139.472430,30
16597,53392307,35.503316,139.465335,45
16599,53392307,35.502328,139.469572,30
...,...,...,...,...
60098,53395730,35.777883,139.878444,30
60099,53395730,35.778045,139.887144,30
60100,53395730,35.776147,139.878625,30
60101,53395730,35.783316,139.883573,43


In [6]:
# One 256x256 label raster per row of the mesh table, initialised to zeros.
# NOTE(review): np.zeros defaults to float64; a smaller dtype would cut memory
# but would change the dtype of the saved dataset — confirm with consumers.
labels_shape = (len(all_features), 256, 256)
print(labels_shape)

labels = np.zeros(shape=labels_shape)

(1378, 256, 256)


In [7]:
# Both values are in pixels: each report marks the rectangle reaching
# PADDING_X rows and PADDING_Y columns either side of its own pixel.
PADDING_X = PADDING_Y = 9

In [8]:
# Rasterise every report into the label array of the mesh it belongs to.
#
# `labels` has one raster per ROW of `all_features`, so the raster index must
# be the row position of the mesh in `all_features`. groupby() only yields
# mesh_ids that actually have reports, so enumerating the groups would drift
# out of step with `all_features` as soon as one mesh has no report; look the
# position up by mesh_id instead.
mesh_positions = {
    mesh_id: position
    for position, mesh_id in enumerate(all_features['mesh_id'])
}

for mesh_id, group in reports_selected.groupby('mesh_id'):
    label_idx = mesh_positions[mesh_id]
    mesh_lat = all_features.iloc[label_idx]['latitude']
    mesh_lon = all_features.iloc[label_idx]['longitude']

    for lat, lon in zip(group['latitude'], group['longitude']):
        # Fraction of the mesh extent, scaled to pixels. The latitude axis is
        # flipped (1 - fraction) so that larger latitudes map to smaller rows.
        lat_x = int((1 - ((lat - mesh_lat) / lat_width)) * labels_shape[1])
        lon_y = int(((lon - mesh_lon) / lon_height) * labels_shape[2])

        # Clamp the padded window to the raster.
        row_start = max(lat_x - PADDING_X, 0)
        row_stop = min(lat_x + PADDING_X + 1, labels_shape[1])
        col_start = max(lon_y - PADDING_Y, 0)
        col_stop = min(lon_y + PADDING_Y + 1, labels_shape[2])

        # A report far enough outside the mesh gives a negative stop; used
        # directly in a slice it would wrap around (0:-k) and paint almost the
        # whole axis. Skip windows that do not overlap the raster at all.
        if row_start >= row_stop or col_start >= col_stop:
            continue

        labels[label_idx, row_start:row_stop, col_start:col_stop] = 1

In [9]:
# Write the label rasters to a gzip-compressed HDF5 dataset named 'labels'.
# The padding size in the filename is taken from PADDING_X so the file name
# always matches the value used for rasterisation (it used to be edited by
# hand, e.g. 'padding3' / 'padding9').
# NOTE(review): only PADDING_X is encoded; this assumes PADDING_Y == PADDING_X.
hdf5_filename = f'../datasets/padding/labels_tokyo_padding{PADDING_X}.h5'
with h5py.File(hdf5_filename, 'w') as hdf5_file:
    hdf5_file.create_dataset('labels', data=labels, compression='gzip', compression_opts=9)