In [8]:
import pandas as pd
import numpy as np
import os
import graph

In [9]:
def train_val_test_split(X, partitions, ratio=True):
    """Split a DataFrame into contiguous train, val and test row blocks.

    Rows are taken in their existing order (no shuffling): train is the
    leading block, val the block after it, and test everything that remains.

    Parameters
    ----------
    X : pd.DataFrame
        Frame to split along its rows.
    partitions : sequence of numbers, length >= 2
        Only the first two entries are read; further entries are ignored.

        * ``ratio=True``: cumulative fractions of ``len(X)``.
          ``partitions[0]`` marks where train ends and ``partitions[1]``
          marks where val ends, e.g. ``[0.6, 0.8]`` gives a 60/20/20 split.
        * ``ratio=False``: row counts. ``partitions[0]`` is the number of
          train rows and ``partitions[1]`` the number of val rows.

        Note the asymmetry: the second entry is a cumulative boundary in
        ratio mode but a block size in count mode.
    ratio : bool, default True
        Selects how ``partitions`` is interpreted (see above).

    Returns
    -------
    tuple of pd.DataFrame
        ``(train, val, test)`` slices of ``X``.

    Raises
    ------
    AssertionError
        If ``partitions`` has fewer than two entries or ``X`` is not a
        DataFrame.
    """
    assert len(partitions) >= 2
    assert isinstance(X, pd.DataFrame)
    if ratio:
        n_rows = len(X)
        # .iloc slicing only accepts integer positions; multiplying the row
        # count by a fraction yields a float, so truncate to int first.
        train_end = int(n_rows * partitions[0])
        val_end = int(n_rows * partitions[1])
    else:
        train_end = partitions[0]
        val_end = partitions[0] + partitions[1]
    return X.iloc[:train_end, :], X.iloc[train_end:val_end, :], X.iloc[val_end:, :]
    
def train_val_test_to_file(splitted_data, dirname, index_label=None):
    """Write a (train, val, test) triple to CSV files inside ``dirname``.

    The three frames are saved, in order, as ``train.csv``, ``val.csv`` and
    ``test.csv``.

    Parameters
    ----------
    splitted_data : sequence of length 3
        The train, val and test frames, in that order.
    dirname : str
        Directory the three CSV files are written into.
    index_label : optional
        Forwarded unchanged to ``DataFrame.to_csv`` for every file.

    Raises
    ------
    AssertionError
        If ``splitted_data`` does not contain exactly three items.
    """
    assert len(splitted_data) == 3
    train, val, test = splitted_data
    outputs = ((train, "train.csv"), (val, "val.csv"), (test, "test.csv"))
    for frame, fname in outputs:
        frame.to_csv(os.path.join(dirname, fname), index_label=index_label)

In [10]:
# Both input files live directly under this dataset directory.
datasets_dir = "datasets/japan"
X_fname = os.path.join(datasets_dir, "data.csv")
sensor_metadata_fname = os.path.join(datasets_dir, "amd_master.tsv")

# Main data table: the "datetime" column is parsed as timestamps and used as
# the row index.
X = pd.read_csv(
    X_fname,
    parse_dates=["datetime"],
    index_col="datetime",
)
# The sensor metadata file is tab-separated.
sensor_metadata = pd.read_csv(sensor_metadata_fname, sep="\t")

In [29]:
# Column labels of X are treated as the ids of the sensors that have data.
available_sensors = list(X.columns)
# Keep only the metadata rows whose `aid` appears among those labels.
# NOTE(review): labels read from a CSV header are usually strings — confirm
# they compare equal to the dtype of the `aid` column.
has_data = sensor_metadata["aid"].isin(available_sensors)
sensor_metadata = sensor_metadata[has_data]

In [32]:
# Preview the filtered sensor metadata. The original cell read
# `sensor_metadata[]`, which is a SyntaxError (empty subscript); a 10-row
# head matches the 10 rows shown in the saved output.
sensor_metadata.head(10)

Unnamed: 0,aid,name,lat1,lat2,lng1,lng2,alt
9,11206,浜頓別,45,7.5,142,21.0,18
10,11276,中頓別,44,57.9,142,16.8,25
49,13277,留萌,43,56.7,141,37.9,24
76,15321,美唄,43,21.8,141,49.6,16
86,16091,小樽,43,10.9,141,0.9,25
88,16156,共和,42,58.8,140,36.2,15
91,16252,寿都,42,47.7,140,13.4,33
131,18256,別海,43,22.9,145,7.1,23
132,18273,根室,43,19.8,145,35.1,25
207,23226,北斗,41,53.2,140,39.2,25


In [30]:
# Features handed to the distance routine from the project-local `graph` module.
# NOTE(review): only lat1, lng1 and alt are passed; the lat2/lng2 columns
# present in the metadata table are not used — confirm that is intended.
dist, idx = graph.distance_scipy_spatial(sensor_metadata[["lat1", "lng1", "alt"]])

In [31]:
# Display the index array returned by graph.distance_scipy_spatial
# (presumably per-sensor neighbour indices — confirm against the graph module).
idx

array([[ 3,  5,  7,  2],
       [ 4,  2,  8,  9],
       [ 4,  1,  9, 20],
       [ 5,  0, 11, 19],
       [ 2,  1,  9, 12],
       [ 3, 11, 19,  0],
       [10, 13, 47, 17],
       [ 8,  1,  2,  4],
       [ 7,  1,  4,  2],
       [ 4,  2, 20, 12],
       [13,  6, 47, 22],
       [19,  5, 18,  3],
       [14,  9, 46, 21],
       [10, 47,  6, 17],
       [17, 12, 46,  9],
       [16, 24, 25, 29],
       [15, 24, 25, 29],
       [14, 46, 47, 12],
       [19, 23, 11, 36],
       [11, 18,  5,  3],
       [21,  9, 12, 46],
       [20, 46, 12,  9],
       [38, 32, 30, 27],
       [34, 18, 49, 36],
       [25, 33, 29, 39],
       [24, 39, 33, 29],
       [35, 31, 46, 53],
       [38, 30, 32, 22],
       [29, 33, 39, 24],
       [29, 28, 24, 25],
       [27, 38, 22, 32],
       [26, 35, 46, 21],
       [38, 27, 22, 30],
       [29, 28, 24, 25],
       [23, 36, 49, 54],
       [26, 31, 17, 47],
       [40, 34, 48, 23],
       [24, 33, 28, 29],
       [27, 32, 30, 22],
       [25, 28, 24, 33],


In [4]:
# Split by row counts (ratio=False): the first 30000 rows go to train, the
# next 8000 to val, and all remaining rows to test. The third list entry is
# not read by train_val_test_split.
train, val, test = train_val_test_split(
    X,
    partitions=[30000, 8000, 8000],
    ratio=False,
)

In [5]:
# Persist the three splits as train.csv / val.csv / test.csv next to the
# source data, labelling the index column "datetime".
train_val_test_to_file(
    (train, val, test),
    dirname=datasets_dir,
    index_label="datetime",
)