# Raw Data Processing

In [1]:
import os
import glob
import joblib
import random
import pathlib
from pathlib import Path
from rich.progress import track

from typing import List, Dict

import numpy as np
import pandas as pd

## すべてのwaypointデータを１つのdataframeにまとめる

In [2]:
def get_data_from_pathtxt(filepath: pathlib.PosixPath, data_type: str, is_join_ids: bool = False) -> np.ndarray:
    with open(filepath) as f:
        lines = f.readlines()
    
    data = []
    for line in lines:
        tmp = line.strip().split('\t')
        if tmp[1] == data_type:
            data.append(tmp)

    data = np.array(data)
    # Drop data_type column.
    if data.shape[0] > 0:
        data = np.delete(data, 1, axis=1)
    # Concatenate site, floor and path.
    if is_join_ids:
        site_id = filepath.parent.parent.name
        floor_id = filepath.parent.name
        path_id = filepath.name.split(".")[0]
        site_floor_path = np.tile([site_id, floor_id, path_id], (data.shape[0], 1))
        data = np.concatenate([site_floor_path, data], axis=1)
    return data


def get_data_from_type_in_parallel(src_dir: str, data_type: str, is_join_ids: bool = False):
    data = joblib.Parallel(n_jobs=-1)(
        joblib.delayed(get_data_from_pathtxt)(path_filepath, data_type, is_join_ids)
        for site_filepath in src_dir.glob("*")   
        for floor_filepath in site_filepath.glob("*")
        for path_filepath in floor_filepath.glob("*")
    )
    data = np.concatenate(data, axis=0)
    return data

In [3]:
# %%time

# src_dir = Pathth("../data/raw/train/")

# waypoints = []
# for site_filepath in src_dir.glob("*"):
#     for floor_filepath in site_filepath.glob("*"):
#         for path_filepath in floor_filepath.glob("*"):
#             waypoint = get_data_from_pathtxt(path_filepath, "TYPE_WAYPOINT")
#             waypoints.append(waypoint)
        
# np.concatenate(waypoints, axis=0)

# # CPU times: user 5min 57s, sys: 37.3 s, total: 6min 35s
# # Wall time: 8min 23s

In [4]:
%%time

src_dir = Path("../data/raw/train/")

# Data columns is (site, floor, path. timestamp, x, y).
waypoints = get_data_from_type_in_parallel(src_dir, "TYPE_WAYPOINT", is_join_ids=True)
np.save("tmp/train_waypoint.npy", waypoints)

# CPU times: user 2.47 s, sys: 352 ms, total: 2.82 s
# Wall time: 30.1 s

CPU times: user 5.92 s, sys: 964 ms, total: 6.88 s
Wall time: 2min 8s


In [5]:
waypoints = np.load("tmp/train_waypoint.npy")
waypoints[:5]

array([['5da138764db8ce0c98bcaa46', 'F4', '5dabfad918410e00067e70ba',
        '1571551566576', '37.889812', '154.43535'],
       ['5da138764db8ce0c98bcaa46', 'F4', '5dabfad918410e00067e70ba',
        '1571551573569', '27.694906', '153.9801'],
       ['5da138764db8ce0c98bcaa46', 'F4', '5dabfad918410e00067e70ba',
        '1571551581118', '16.998966', '153.33621'],
       ['5da138764db8ce0c98bcaa46', 'F4', '5dac3de918410e00067e7244',
        '1571568619480', '36.867283', '179.88359'],
       ['5da138764db8ce0c98bcaa46', 'F4', '5dac3de918410e00067e7244',
        '1571568624436', '37.37246', '185.84445']], dtype='<U24')

## １レコードのwaypointごとにwifiデータの特徴量を生成する

In [6]:
waypoints = np.load("tmp/train_waypoint.npy")

In [15]:
data = np.concatenate(
        [
            np.tile("nan", (1, 100)).astype("<U40"),  # bssid
            np.tile("-999", (1, 100)).astype("<U40"),  # rssi
            np.tile("0", (1, 100)).astype("<U40"),  # frequency
        ],
        axis=0,
    )

data

array([['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan'],
       ['-999', '-999', '-999', '-999', '-999', '-999', '-999', '-999',
        '-999', '-999', '-999', '-999', '-999', '-999', '-999', '-999',
        '-999', '-999', '-999', '-999', '-999', '-999', '-9

In [12]:
%%time

from tqdm.notebook import tqdm

max_len = 100

waypoint = waypoints[0]

(site, floor, path, timestamp, x, y) = waypoint
path_filepath = Path(f"../data/raw/train/{site}/{floor}/{path}.txt")
wifi = get_data_from_pathtxt(path_filepath, "TYPE_WIFI")

extract_idx = [2, 3, 4]
data = np.concatenate(
        [
            np.tile("nan", (1, 100)).astype("<U40"),  # bssid
            np.tile("-999", (1, 100)).astype("<U40"),  # rssi
            np.tile("0", (1, 100)).astype("<U40"),  # frequency
        ],
        axis=0,
    )

if len(wifi) > 0:
    ts_diff = wifi[:, 0].astype('int64') - timestamp.astype('int64')
    ts_diff_min =  np.abs(np.min(ts_diff))
    # Extract latest values, except feature information.
    wifi = wifi[(ts_diff <= ts_diff_min)]
    # Extract columns of (bssid, rssi, frequency).a
    wifi = wifi[:, extract_idx]
    # Sort values by rssi.
    sort_idx = np.argsort(wifi[:, 1])
    wifi = wifi[sort_idx]

    data = np.full((len(extract_idx), max_len), np.nan, dtype='<U40')
    data[:, :wifi.T.shape[1]] = wifi.T[:, :max_len]

data

CPU times: user 8.46 ms, sys: 2.01 ms, total: 10.5 ms
Wall time: 9 ms


array([['d2b9915dc73e4d333a718f8c02edae5e2a4d94f5',
        '5db8a385607a001cae8da5f069e1005f527ae7d6',
        '4c1ab193093f7057e6678f8f12f7ac4c05b95680',
        'f20391acb21826bb8f38243de772b7f3f8301f83',
        '45708a1205fbe53ae5ced9e450e0cedccf96e05a',
        '53eb5bd0a88b708c7a2ce601d221bd7483e73da6',
        'df4b30491488f5b430c156a69e4829400cbde9dc',
        'f513f2d9f3976f02601aa26e5dd46fad70742169',
        '8f4062bc086320b9fcf1b5ed873808b903e1b311',
        'c7189b28c03c50e63aaa72ec49e9572adc4837e9',
        '8d67c7c56c2655867cf665a6af857fb3305c5fe0',
        '8701c3be87dce3f1d42512a3e8ba3ffaa283b8ea',
        '3123165047af9eb204cf091e3e61141dc16ff194',
        '83000082f8f021c6345db980100e8c4e382139d6',
        '8b56227a675cbd21eacb7665252ac7af30082171',
        'f731c6c3b190c25d1deb595d6c4e29f97c2b194f',
        '01f689d3d53c42072e9ee44f5c648f932cf4530a',
        '55671b3896338a58e16317c9bb6e491500c53a0d',
        '1a515734ca09fb53596e421c9057d12b330bf89a',
        'bfe

In [7]:
%%time

from tqdm.notebook import tqdm

max_len = 100
wifi_features = []

for waypoint in tqdm(waypoints[:300]):
    (site, floor, path, timestamp, x, y) = waypoint
    path_filepath = Path(f"../data/raw/train/{site}/{floor}/{path}.txt")
    wifi = get_data_from_pathtxt(path_filepath, "TYPE_WIFI")
    
    extract_idx = [2, 3, 4]
    data = np.full((len(extract_idx), max_len), np.nan, dtype='<U40')
    
    if len(wifi) > 0:
        ts_diff = wifi[:, 0].astype('int64') - timestamp.astype('int64')
        ts_diff_min =  np.abs(np.min(ts_diff))
        # Extract latest values, except feature information.
        wifi = wifi[(ts_diff <= ts_diff_min)]
        # Extract columns of (bssid, rssi, frequency).
        wifi = wifi[:, extract_idx]
        # Sort values by rssi.
        sort_idx = np.argsort(wifi[:, 1])
        wifi = wifi[sort_idx]
        
        data = np.full((len(extract_idx), max_len), np.nan, dtype='<U40')
        data[:, :wifi.T.shape[1]] = wifi.T[:, :max_len]
    
    wifi_features.append(data)

  0%|          | 0/300 [00:00<?, ?it/s]

CPU times: user 5.49 s, sys: 474 ms, total: 5.96 s
Wall time: 6.12 s


In [8]:
%%time

max_len = 100


def get_wifi_from_waypoints(waypoint):
    (site, floor, path, timestamp, x, y) = waypoint
    path_filepath = Path(f"../data/raw/train/{site}/{floor}/{path}.txt")
    wifi = get_data_from_pathtxt(path_filepath, "TYPE_WIFI")
    
    extract_idx = [2, 3, 4]
    data = np.full((len(extract_idx), max_len), np.nan, dtype='<U40')
    
    if len(wifi) > 0:
        ts_diff = wifi[:, 0].astype('int64') - timestamp.astype('int64')
        ts_diff_min =  np.abs(np.min(ts_diff))
        # Extract latest values, except feature information.
        wifi = wifi[(ts_diff <= ts_diff_min)]
        # Extract columns of (bssid, rssi, frequency).
        wifi = wifi[:, extract_idx]
        # Sort values by rssi.
        sort_idx = np.argsort(wifi[:, 1])
        wifi = wifi[sort_idx]
        
        data = np.full((len(extract_idx), max_len), np.nan, dtype='<U40')
        data[:, :wifi.T.shape[1]] = wifi.T[:, :max_len]
    return data

def get_wifi_from_waypoints_in_parallel(waypoints: np.ndarray) -> np.ndarray:
    data = joblib.Parallel(n_jobs=-1)(
        joblib.delayed(get_wifi_from_waypoints)(waypoint) for waypoint in tqdm(waypoints)
    )
    data = np.array(data)
    return data


wifi_features = get_wifi_from_waypoints_in_parallel(waypoints)

  0%|          | 0/166683 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
np.save("tmp/train_wifi_features.npy", wifi_features)

### todo

- test data でも同様の処理を行う
- wifiデータにおける全bssidを取得して、label_encode用のマップを用意する
- bssid map を使ってwifiデータのlabel_encodeを行う

In [73]:
test_waypoint = pd.read_csv("../data/raw/sample_submission.csv")
test_waypoint[['site', 'path', 'timestamp']] = test_waypoint['site_path_timestamp'].str.split('_', expand=True)

test_waypoint.drop(['site_path_timestamp'], axis=1, inplace=True)

test_waypoint = test_waypoint[['site', 'floor', 'path', 'timestamp', 'x', 'y']]
test_waypoint = test_waypoint.astype(str)

In [102]:
%%time

max_len = 100

def get_test_wifi_from_waypoints(waypoint):
    (site, floor, path, timestamp, x, y) = waypoint
    path_filepath = Path(f"../data/raw/test/{path}.txt")
    wifi = get_data_from_pathtxt(path_filepath, "TYPE_WIFI")
    
    extract_idx = [2, 3, 4]
    data = np.full((len(extract_idx), max_len), np.nan, dtype='<U40')
    
    if len(wifi) > 0:
        ts_diff = wifi[:, 0].astype('int64') - int(timestamp)
        ts_diff_min =  np.abs(np.min(ts_diff))
        # Extract latest values, except feature information.
        wifi = wifi[(ts_diff <= ts_diff_min)]
        # Extract columns of (bssid, rssi, frequency).
        wifi = wifi[:, extract_idx]
        # Sort values by rssi.
        sort_idx = np.argsort(wifi[:, 1])
        wifi = wifi[sort_idx]
        
        data = np.full((len(extract_idx), max_len), np.nan, dtype='<U40')
        data[:, :wifi.T.shape[1]] = wifi.T[:, :max_len]
    return data

def get_test_wifi_from_waypoints_in_parallel(waypoints: np.ndarray) -> np.ndarray:
    data = joblib.Parallel(n_jobs=-1)(
        joblib.delayed(get_test_wifi_from_waypoints)(waypoint) for waypoint in track(test_waypoint.to_numpy())
    )
    data = np.array(data)
    return data


temp = test_waypoint.to_numpy()
test_wifi_features = get_test_wifi_from_waypoints_in_parallel(temp)

Output()

CPU times: user 47.1 s, sys: 7.54 s, total: 54.7 s
Wall time: 5min 31s


In [103]:
test_wifi_features

array([[['eebf5db207eec2f3e041f92153d789270f346821',
         '7805f319f3f591986effe78c5b41143180278f2d',
         '323607d8444900d64151ee06d164738ac727bbce', ...,
         '7ce07b16d0e006d8959ea36e0415168fac0009f8',
         'a3f167647e4406d59ebf92e99172ae25d3a27240',
         'fe3461438b7a21c85a42ffd76030ece52e11dd7d'],
        ['-45', '-46', '-46', ..., '-61', '-61', '-61'],
        ['2432', '5280', '2417', ..., '2462', '5745', '2437']],

       [['13b7aeaf441f2161481481fe67eace721cff07ab',
         'b4dbb0b30caa1d0f21b7b4185ba061556cada67a',
         'c48db7f3ed1858bb4fc191230e3d79d5eb178604', ...,
         '9f6570acae53f6cdb4e7713fb24e3085c228dffa',
         '8eb7c9848aa1e78b0da3b6f9f4f730885b0cd4f0',
         'c55c9a0ed49b5fd4be47a865f70945690139cd8e'],
        ['-41', '-41', '-41', ..., '-49', '-49', '-49'],
        ['5745', '5745', '5745', ..., '2432', '5785', '5785']],

       [['6bc91b3951089c3a225396608b138ca178479924',
         'b26914599f6d9ba16b43975394e1eeb9d82f4bab',
  