# Raw Data Processing

In [1]:
import os
import glob
import joblib
import random
import pathlib
from pathlib import Path
from rich.progress import track

from typing import List, Dict

import numpy as np
import pandas as pd

## すべてのwaypointデータを１つのNumPy配列にまとめる

In [2]:
def get_data_from_pathtxt(filepath: pathlib.Path, data_type: str, is_join_ids: bool = False) -> np.ndarray:
    """Read one tab-separated path file and return the rows of a given type.

    Each line is split on tabs; a row is kept when its second field equals
    ``data_type``. The type column is then removed from the result.

    Args:
        filepath: Path file to read. When ``is_join_ids`` is True the file is
            expected to live at ``<site>/<floor>/<path>.txt`` because the ids
            are taken from the parent directory names and the file name.
        data_type: Value of the second field to select (e.g. "TYPE_WAYPOINT").
        is_join_ids: If True, prepend (site, floor, path) columns to every row.

    Returns:
        2-D string array with one row per matching line, or an empty array
        (``shape[0] == 0``) when no line matches.
    """
    filepath = pathlib.Path(filepath)
    with open(filepath) as f:
        rows = [line.strip().split("\t") for line in f]

    # Blank or single-field lines have no type column; skip them instead of
    # raising IndexError.
    rows = [row for row in rows if len(row) > 1 and row[1] == data_type]

    data = np.array(rows)
    if data.shape[0] == 0:
        # Nothing matched: return the empty array as-is. Joining the ids here
        # would try to concatenate a (0, 3) block with a 1-D array and fail.
        return data

    # Drop data_type column.
    data = np.delete(data, 1, axis=1)

    # Concatenate site, floor and path.
    if is_join_ids:
        site_id = filepath.parent.parent.name
        floor_id = filepath.parent.name
        path_id = filepath.name.split(".")[0]
        site_floor_path = np.tile([site_id, floor_id, path_id], (data.shape[0], 1))
        data = np.concatenate([site_floor_path, data], axis=1)
    return data


def get_data_from_type_in_parallel(src_dir: str, data_type: str, is_join_ids: bool = False):
    """Collect rows of one data type from every path file under ``src_dir``.

    The directory is walked three levels deep (``src_dir/*/*/*``) and each
    file found is parsed by ``get_data_from_pathtxt`` in a joblib worker.

    Args:
        src_dir: Root directory (str or Path) containing the path files.
        data_type: Row type to extract, forwarded to ``get_data_from_pathtxt``.
        is_join_ids: Forwarded to ``get_data_from_pathtxt``.

    Returns:
        The per-file arrays stacked along axis 0, or an empty array when no
        file contains a matching row.

    Raises:
        ValueError: If no files are found under ``src_dir``.
    """
    # Accept plain strings as the annotation promises; str has no .glob().
    src_dir = Path(src_dir)
    path_filepaths = [
        path_filepath
        for site_filepath in src_dir.glob("*")
        for floor_filepath in site_filepath.glob("*")
        for path_filepath in floor_filepath.glob("*")
    ]
    if not path_filepaths:
        # Same exception type np.concatenate would raise, with a usable message.
        raise ValueError(f"No path files found under {src_dir}")

    data = joblib.Parallel(n_jobs=-1)(
        joblib.delayed(get_data_from_pathtxt)(path_filepath, data_type, is_join_ids)
        for path_filepath in path_filepaths
    )
    # Files without matching rows yield empty arrays; skip them before stacking.
    non_empty = [file_data for file_data in data if file_data.shape[0] > 0]
    if not non_empty:
        return np.array([])
    return np.concatenate(non_empty, axis=0)

In [3]:
# NOTE(review): this cell re-defines (shadows) the function of the same name
# from the previous cell; this later definition is the one the notebook uses.
def get_data_from_type_in_parallel(src_dir: str, data_type: str, is_join_ids: bool = False):
    """Collect rows of one data type from every path file under ``src_dir``.

    The directory is walked three levels deep (``src_dir/*/*/*``) and each
    file found is parsed by ``get_data_from_pathtxt`` in a joblib worker.

    Args:
        src_dir: Root directory (str or Path) containing the path files.
        data_type: Row type to extract, forwarded to ``get_data_from_pathtxt``.
        is_join_ids: Forwarded to ``get_data_from_pathtxt``.

    Returns:
        The per-file arrays stacked along axis 0, or an empty array when no
        file contains a matching row.

    Raises:
        ValueError: If no files are found under ``src_dir``.
    """
    # Accept plain strings as the annotation promises; str has no .glob().
    src_dir = Path(src_dir)
    path_filepaths = [
        path_filepath
        for site_filepath in src_dir.glob("*")
        for floor_filepath in site_filepath.glob("*")
        for path_filepath in floor_filepath.glob("*")
    ]
    if not path_filepaths:
        # Same exception type np.concatenate would raise, with a usable message.
        raise ValueError(f"No path files found under {src_dir}")

    data = joblib.Parallel(n_jobs=-1)(
        joblib.delayed(get_data_from_pathtxt)(path_filepath, data_type, is_join_ids)
        for path_filepath in path_filepaths
    )
    # Files without matching rows yield empty arrays; skip them before stacking.
    non_empty = [file_data for file_data in data if file_data.shape[0] > 0]
    if not non_empty:
        return np.array([])
    return np.concatenate(non_empty, axis=0)

In [4]:
%%time
# Parse all training path files in parallel and collect the TYPE_WAYPOINT rows.
# With is_join_ids=True each row is (site, floor, path, timestamp, x, y).
src_dir = Path("../data/raw/train/")
waypoints = get_data_from_type_in_parallel(src_dir, "TYPE_WAYPOINT", is_join_ids=True)

waypoints

CPU times: user 7.61 s, sys: 1.11 s, total: 8.72 s
Wall time: 2min 29s


array([['5da138764db8ce0c98bcaa46', 'F4', '5dabfad918410e00067e70ba',
        '1571551566576', '37.889812', '154.43535'],
       ['5da138764db8ce0c98bcaa46', 'F4', '5dabfad918410e00067e70ba',
        '1571551573569', '27.694906', '153.9801'],
       ['5da138764db8ce0c98bcaa46', 'F4', '5dabfad918410e00067e70ba',
        '1571551581118', '16.998966', '153.33621'],
       ...,
       ['5cd56c17e2acfd2d33b6c161', 'F1', '5cf3a919a25c92000829bd8c',
        '1559470063615', '61.0884', '5.656629'],
       ['5cd56c17e2acfd2d33b6c161', 'F1', '5cf3a919a25c92000829bd8c',
        '1559470073199', '56.777397', '13.8065'],
       ['5cd56c17e2acfd2d33b6c161', 'F1', '5cf3a919a25c92000829bd8c',
        '1559470085099', '64.9779', '18.690739']], dtype='<U24')

In [5]:
# Work on a copy so the floor encoding below does not modify `waypoints`.
_waypoints = waypoints.copy()

In [6]:
# Encode floor names as integer strings (basements negative, "F1" -> 0).
floorNums = {
    "B3": -3,
    "B2": -2,
    "B1": -1,
    "F1": 0,
    "F2": 1,
    "F3": 2,
    "F4": 3,
    "F5": 4,
    "F6": 5,
    "F7": 6,
    "F8": 7,
    "F9": 8,
    "F10": 9,
}
# Match against an untouched copy of the floor column so that values written
# for one key can never be re-matched by a later key.
floor_names = _waypoints[:, 1].copy()
is_encoded_floor = np.zeros(_waypoints.shape[0], dtype=bool)
for key, val in floorNums.items():
    # Exact match only. Substring replacement turned "F10" into "00" while
    # handling "F1", so "F10" rows were never flagged and got dropped.
    is_floor = floor_names == key
    is_encoded_floor |= is_floor
    _waypoints[is_floor, 1] = str(val)
# Keep only rows whose floor name is one of the known keys.
_waypoints = _waypoints[is_encoded_floor]

In [7]:
# Check that the encoded floor (column 1) and x/y (columns 4, 5) all parse as floats.
_waypoints[:, [1, 4, 5]].astype(float)

array([[  3.      ,  37.889812, 154.43535 ],
       [  3.      ,  27.694906, 153.9801  ],
       [  3.      ,  16.998966, 153.33621 ],
       ...,
       [  0.      ,  61.0884  ,   5.656629],
       [  0.      ,  56.777397,  13.8065  ],
       [  0.      ,  64.9779  ,  18.690739]])

In [8]:
# %%time

# src_dir = Path("../data/raw/train/")

# waypoints = []
# for site_filepath in src_dir.glob("*"):
#     for floor_filepath in site_filepath.glob("*"):
#         for path_filepath in floor_filepath.glob("*"):
#             waypoint = get_data_from_pathtxt(path_filepath, "TYPE_WAYPOINT")
#             waypoints.append(waypoint)
        
# np.concatenate(waypoints, axis=0)

# # CPU times: user 5min 57s, sys: 37.3 s, total: 6min 35s
# # Wall time: 8min 23s

In [9]:
%%time

src_dir = Path("../data/raw/train/")

# Data columns is (site, floor, path. timestamp, x, y).
waypoints = get_data_from_type_in_parallel(src_dir, "TYPE_WAYPOINT", is_join_ids=True)
np.save("tmp/train_waypoint.npy", waypoints)

# CPU times: user 2.47 s, sys: 352 ms, total: 2.82 s
# Wall time: 30.1 s

CPU times: user 7.83 s, sys: 1.09 s, total: 8.92 s
Wall time: 2min 33s


In [10]:
# Load the waypoint array saved by the previous cell and preview the first rows.
waypoints = np.load("tmp/train_waypoint.npy")
waypoints[:5]

array([['5da138764db8ce0c98bcaa46', 'F4', '5dabfad918410e00067e70ba',
        '1571551566576', '37.889812', '154.43535'],
       ['5da138764db8ce0c98bcaa46', 'F4', '5dabfad918410e00067e70ba',
        '1571551573569', '27.694906', '153.9801'],
       ['5da138764db8ce0c98bcaa46', 'F4', '5dabfad918410e00067e70ba',
        '1571551581118', '16.998966', '153.33621'],
       ['5da138764db8ce0c98bcaa46', 'F4', '5dac3de918410e00067e7244',
        '1571568619480', '36.867283', '179.88359'],
       ['5da138764db8ce0c98bcaa46', 'F4', '5dac3de918410e00067e7244',
        '1571568624436', '37.37246', '185.84445']], dtype='<U24')

## Wifi Feature

In [11]:
# Reload the saved waypoints for the Wifi section.
# NOTE(review): the same file is already loaded in the cell above; presumably
# repeated so this section can be run on its own — confirm or drop.
waypoints = np.load("tmp/train_waypoint.npy")

In [12]:
%%time
def get_wifi_from_waypoints(waypoint, max_len=100):
    """Build a fixed-size wifi feature block for one training waypoint.

    Reads the TYPE_WIFI rows of the waypoint's path file and keeps the rows
    whose timestamp is earlier than the waypoint's timestamp.

    Args:
        waypoint: Sequence of (site, floor, path, timestamp, x, y) strings.
        max_len: Number of wifi records (columns) in the output.

    Returns:
        String array of shape (5, max_len) with rows
        (bssid, rssi, frequency, ts_diff, last_seen_ts_diff). Unused columns
        keep the fill values "nan", "-999", "-999", "999", "999".
    """
    (site, floor, path, timestamp, x, y) = waypoint
    path_filepath = pathlib.Path(f"../data/raw/train/{site}/{floor}/{path}.txt")
    # wifi is (timestamp, ssid, bssid, rssi, frequency, last_seen_timestamp)
    wifi = get_data_from_pathtxt(path_filepath, "TYPE_WIFI")

    # One fill value per feature row, repeated max_len times. The width
    # follows max_len instead of a hard-coded 100.
    fill_values = np.array(["nan", "-999", "-999", "999", "999"], dtype="<U40")
    data = np.tile(fill_values.reshape(-1, 1), (1, max_len))

    if len(wifi) > 0:
        # int() accepts both NumPy string scalars and plain Python strings.
        waypoint_ts = int(timestamp)
        ts_diff = wifi[:, 0].astype("int64") - waypoint_ts
        last_seen_ts_diff = wifi[:, 5].astype("int64") - waypoint_ts
        # Columns (bssid, rssi, frequency, ts_diff, last_seen_ts_diff). The
        # diffs are cast to str explicitly rather than relying on implicit
        # int -> str promotion inside np.concatenate.
        features = np.concatenate(
            [
                wifi[:, [2, 3, 4]],
                ts_diff.astype(str).reshape(-1, 1),
                last_seen_ts_diff.astype(str).reshape(-1, 1),
            ],
            axis=1,
        )
        # Keep only records observed before the waypoint (no future information).
        features = features[ts_diff < 0].T
        # NOTE(review): this keeps the first `max_len` matching rows in file
        # order; confirm whether the most recent ones were intended instead.
        end_idx = min(max_len, features.shape[1])
        data[:, :end_idx] = features[:, :end_idx]
    return data


# Try the feature extraction on a single waypoint row and display the result.
get_wifi_from_waypoints(waypoints[2])

(400, 6)
CPU times: user 13.8 ms, sys: 4.24 ms, total: 18.1 ms
Wall time: 16.3 ms


array([['d2b9915dc73e4d333a718f8c02edae5e2a4d94f5',
        '5db8a385607a001cae8da5f069e1005f527ae7d6',
        '4c1ab193093f7057e6678f8f12f7ac4c05b95680',
        'f20391acb21826bb8f38243de772b7f3f8301f83',
        '45708a1205fbe53ae5ced9e450e0cedccf96e05a',
        '53eb5bd0a88b708c7a2ce601d221bd7483e73da6',
        'df4b30491488f5b430c156a69e4829400cbde9dc',
        'f513f2d9f3976f02601aa26e5dd46fad70742169',
        '8f4062bc086320b9fcf1b5ed873808b903e1b311',
        'c7189b28c03c50e63aaa72ec49e9572adc4837e9',
        '8d67c7c56c2655867cf665a6af857fb3305c5fe0',
        '3123165047af9eb204cf091e3e61141dc16ff194',
        '8701c3be87dce3f1d42512a3e8ba3ffaa283b8ea',
        '83000082f8f021c6345db980100e8c4e382139d6',
        '8b56227a675cbd21eacb7665252ac7af30082171',
        'f731c6c3b190c25d1deb595d6c4e29f97c2b194f',
        '01f689d3d53c42072e9ee44f5c648f932cf4530a',
        '55671b3896338a58e16317c9bb6e491500c53a0d',
        '1a515734ca09fb53596e421c9057d12b330bf89a',
        'bfe

In [18]:
from utils import load_pickle

def get_test_wifi_from_waypoints(
    waypoint: np.ndarray, max_len: int = 100
) -> np.ndarray:
    """Build a fixed-size wifi feature block for one test waypoint.

    Reads the TYPE_WIFI rows of ``../data/raw/test/<path>.txt`` and keeps the
    rows whose timestamp is earlier than the waypoint's timestamp.

    Args:
        waypoint: Sequence of (site, floor, path, timestamp, x, y) values;
            only ``path`` and ``timestamp`` are used.
        max_len: Number of wifi records (columns) in the output.

    Returns:
        String array of shape (5, max_len) with rows
        (bssid, rssi, frequency, ts_diff, last_seen_ts_diff). Unused columns
        keep the fill values "nan", "-999", "-999", "999", "999".
    """
    (site, floor, path, timestamp, x, y) = waypoint
    path_filepath = pathlib.Path(f"../data/raw/test/{path}.txt")
    # wifi is (timestamp, ssid, bssid, rssi, frequency, last_seen_timestamp)
    wifi = get_data_from_pathtxt(path_filepath, "TYPE_WIFI")

    # One fill value per feature row, repeated max_len times. The width
    # follows max_len instead of a hard-coded 100.
    fill_values = np.array(["nan", "-999", "-999", "999", "999"], dtype="<U40")
    data = np.tile(fill_values.reshape(-1, 1), (1, max_len))

    if len(wifi) > 0:
        waypoint_ts = int(timestamp)
        ts_diff = wifi[:, 0].astype("int64") - waypoint_ts
        last_seen_ts_diff = wifi[:, 5].astype("int64") - waypoint_ts
        # Columns (bssid, rssi, frequency, ts_diff, last_seen_ts_diff). The
        # diffs are cast to str explicitly rather than relying on implicit
        # int -> str promotion inside np.concatenate.
        features = np.concatenate(
            [
                wifi[:, [2, 3, 4]],
                ts_diff.astype(str).reshape(-1, 1),
                last_seen_ts_diff.astype(str).reshape(-1, 1),
            ],
            axis=1,
        )
        # Keep only records observed before the waypoint (no future information).
        features = features[ts_diff < 0].T
        # NOTE(review): this keeps the first `max_len` matching rows in file
        # order; confirm whether the most recent ones were intended instead.
        end_idx = min(max_len, features.shape[1])
        data[:, :end_idx] = features[:, :end_idx]
    return data

# NOTE(review): load_pickle comes from the local `utils` module, whose import
# failed in the recorded run (ModuleNotFoundError) — confirm it is on sys.path.
# NOTE(review): "test_waypint" may be a misspelling of "test_waypoint" —
# confirm the file name on disk before changing it.
waypoints = load_pickle("../data/working/test_waypint.pkl")

ModuleNotFoundError: No module named 'utils'