In [40]:
import pandas as pd
import numpy as np
import os
import graph
import matplotlib.pyplot as plt
%matplotlib inline

In [41]:
def train_val_test_split(X, partitions, ratio=True):
    """Split a DataFrame into contiguous train, validation and test partitions.

    Rows are taken in their existing order (no shuffling): the first block is
    train, the next block is validation, and all remaining rows are test.

    Parameters
    ----------
    X : pd.DataFrame
        Frame to split along its rows.
    partitions : sequence of numbers
        Sizes of the train and validation partitions, in that order. At least
        two entries are required; further entries are ignored because the test
        partition always receives whatever rows are left.
    ratio : bool, default True
        If True, the sizes are fractions of ``len(X)`` (e.g. ``[0.6, 0.2, 0.2]``).
        If False, the sizes are absolute row counts (e.g. ``[30000, 8000, 8000]``).

    Returns
    -------
    tuple of pd.DataFrame
        ``(train, val, test)``.

    Raises
    ------
    AssertionError
        If ``partitions`` has fewer than two entries or ``X`` is not a DataFrame.
    """
    assert len(partitions) >= 2
    assert isinstance(X, pd.DataFrame)
    if ratio:
        input_len = len(X)
        # Positional slicing needs integer bounds; multiplying by a fraction
        # yields floats, so round to the nearest row. The validation boundary
        # is cumulative (train + val) so both modes treat entries as sizes.
        train_end = int(round(input_len * partitions[0]))
        val_end = int(round(input_len * (partitions[0] + partitions[1])))
    else:
        train_end = partitions[0]
        val_end = partitions[0] + partitions[1]
    return X.iloc[:train_end, :], X.iloc[train_end:val_end, :], X.iloc[val_end:, :]
    
def train_val_test_to_file(splitted_data, dirname, index_label=None):
    """Save the three partitions as CSV files inside ``dirname``.

    ``splitted_data`` must hold exactly three DataFrames in the order
    (train, val, test); they are written to ``train.csv``, ``val.csv`` and
    ``test.csv`` respectively. ``index_label`` is forwarded to
    ``DataFrame.to_csv`` for every file.
    """
    assert len(splitted_data) == 3
    filenames = ("train.csv", "val.csv", "test.csv")
    for filename, frame in zip(filenames, splitted_data):
        target = os.path.join(dirname, filename)
        frame.to_csv(target, index_label=index_label)

In [67]:
# Input locations, all under one dataset directory.
datasets_dir = "datasets/japan"
X_fname = os.path.join(datasets_dir, "data.csv")
sensor_metadata_fname = os.path.join(datasets_dir, "amd_master.tsv")

# Readings table, indexed by its parsed "datetime" column.
X = pd.read_csv(X_fname, parse_dates=["datetime"], index_col="datetime")

# Tab-separated sensor metadata, restricted to rows whose `aid` appears among
# the column labels of X, then renumbered 0..n-1.
# NOTE(review): the columns of X come from a CSV header and are presumably
# strings; confirm `aid` is read with a matching dtype, otherwise this filter
# matches nothing.
sensor_metadata = pd.read_csv(sensor_metadata_fname, sep="\t")
sensor_metadata = sensor_metadata[sensor_metadata["aid"].isin(list(X.columns))].reset_index(drop=True)

In [70]:
# Neighbour search over the sensors using their lat1 / lng1 / alt columns, k = 20.
# NOTE(review): presumably `idx[i]` holds the indices of the k nearest sensors
# to sensor i and `dist[i]` the matching distances — confirm against
# graph.distance_scipy_spatial (including whether a sensor is its own neighbour).
# NOTE(review): lat/lng and alt are likely in different units, so a single
# distance over all three may be dominated by one of them — confirm the metric.
dist, idx = graph.distance_scipy_spatial(sensor_metadata.loc[:,["lat1", "lng1", "alt"]], k = 20)

In [71]:
# Row label of the first sensor named "東京" (Tokyo) in sensor_metadata;
# raises IndexError if no sensor has that name.
tokyo_index = sensor_metadata[sensor_metadata.name == "東京"].index.values[0]
# Take Tokyo's row of neighbour indices and drop its last entry ...
extract_indices = idx[tokyo_index][:-1]
# ... then put Tokyo itself at the front, so the result has as many entries
# as one row of `idx` (presumably k = 20 — TODO confirm).
extract_indices = np.insert(extract_indices, 0, tokyo_index)

In [72]:
# Column labels of X are matched as strings, so stringify the selected aids
# before using them to pick (and order) the columns to keep.
extract_aids = list(map(str, sensor_metadata.loc[extract_indices, "aid"]))
X = X[extract_aids]

In [78]:
# NOTE: X is reassigned from itself, so re-running this cell removes another
# 9 rows each time; run it exactly once after the column selection.
# NOTE(review): the divisor behind "even division" is not shown here — TODO confirm.
X = X.iloc[:-9,:] # drop the last 9 rows, which prevent even division

In [79]:
# Display (rows, columns) of X after the column selection and row trimming.
X.shape

(46000, 20)

In [80]:
# Split by absolute row counts (ratio=False): the first 30000 rows are train,
# the next 8000 are val, and the remaining rows are test. The counts sum to
# 46000, the row count displayed by X.shape, so test also gets 8000 rows.
train, val, test = train_val_test_split(X, [30000, 8000, 8000], ratio=False)

In [81]:
# Write train.csv, val.csv and test.csv into datasets_dir, labelling the index
# column "datetime" in each file.
train_val_test_to_file([train, val, test], datasets_dir, index_label="datetime")