# Raw Data Processing

In [1]:
import os
import glob
import pathlib
import numpy as np
import pandas as pd
from numba import jit
import joblib
from joblib import Parallel, delayed
from rich.progress import track

from typing import Dict, List

import sys
sys.path.append("../src")

import config

from utils.common import timer
from utils.common import load_pickle, dump_pickle, save_cache
from utils.feature import FeatureStore

## Extract the data of each data type from the raw txt files

In [2]:
# Collect every raw training file. The nested globs descend exactly three
# levels below train/ (named site / floor / path by the loop variables).
src_dir = pathlib.Path("../data/raw/train/")
filepaths = [
    path_filepath 
    for site_filepath in src_dir.glob("*") 
    for floor_filepath in site_filepath.glob("*") 
    for path_filepath in floor_filepath.glob("*")
]

In [3]:
# Use the first listed file as a worked example.
filepath = filepaths[0]

# The identifiers come from the file's location: grandparent directory name,
# parent directory name, and the file name up to its first ".".
site_id = filepath.parent.parent.name
floor = filepath.parent.name
path_id = filepath.name.split(".")[0]

# Build the feature store for this one path and load its data.
feature = FeatureStore(
    site_id=site_id, 
    floor=floor, 
    path_id=path_id, 
    input_path="../data/raw/"
)
feature.load_all_data()

In [4]:
# Persist the example feature store.
# NOTE(review): later cells read ../data/working/{path_id}.pkl — presumably
# the file written by save(); confirm against FeatureStore.
feature.save()

In [5]:
# Public names (those not starting with "_") exposed by the site_info object
[c for c in dir(feature.site_info) if c[0]!="_"]

['draw_polygon',
 'features',
 'floor',
 'floor_coordinates',
 'get_site_info',
 'input_path',
 'map_type',
 'show_site_image',
 'site_height',
 'site_id',
 'site_width',
 'store_coordinates']

In [6]:
# Site dimensions reported by site_info, shown as (height, width)
feature.site_info.site_height, feature.site_info.site_width

(124.00990096148014, 195.53674591205103)

In [7]:
# Identifiers stored on the example feature store: (site id, floor, path id)
feature.site_id, feature.floor, feature.path_id

('5cdac61fe403deddaf467fb5', 'F2', '5d099fc50e0fc900086ea6ed')

In [8]:
# Preview the first rows of every data type held by the feature store
for d_type in feature.df_types:
    print(d_type)
    display(feature[d_type].head())

accelerometer


Unnamed: 0,timestamp,x,y,z
0,1560911459421,-0.783646,2.031509,9.347565
1,1560911459440,-0.665207,1.890335,10.077377
2,1560911459459,-0.679565,1.868805,10.302307
3,1560911459478,0.196213,1.519455,10.004395
4,1560911459497,-0.068207,1.537399,10.034302


accelerometer_uncalibrated


Unnamed: 0,timestamp,x,y,z,x2,y2,z2,accuracy


beacon


Unnamed: 0,timestamp,uuid,major_id,minor_id,tx_power,rssi,distance,mac_addr
0,1560911463065,4e9d3569a79dcbd102831d1bd587aa4e868ae797,6c502d1af6f02df0a3cc79c238a470cc6aca3c91,3e65f84cccbd3f2d10608292edf324b7966126ba,-59,-89,21.518471,253e8d541b276ad1490a7f0bc59df9e069b064b1
1,1560911463320,4e9d3569a79dcbd102831d1bd587aa4e868ae797,6c502d1af6f02df0a3cc79c238a470cc6aca3c91,3e65f84cccbd3f2d10608292edf324b7966126ba,-59,-90,23.444276,253e8d541b276ad1490a7f0bc59df9e069b064b1
2,1560911464830,4e9d3569a79dcbd102831d1bd587aa4e868ae797,24a2b1456b2278c02306fbecd5c52e3b72b914f4,44fda320f16c8b2ef198c0000449f9c3cca1d126,-59,-95,35.510998,5044892d3dcfc3f7569fd304b904c998cac1407d
3,1560911465406,4e9d3569a79dcbd102831d1bd587aa4e868ae797,6c502d1af6f02df0a3cc79c238a470cc6aca3c91,3e65f84cccbd3f2d10608292edf324b7966126ba,-59,-97,41.679063,253e8d541b276ad1490a7f0bc59df9e069b064b1
4,1560911465604,4e9d3569a79dcbd102831d1bd587aa4e868ae797,24a2b1456b2278c02306fbecd5c52e3b72b914f4,44fda320f16c8b2ef198c0000449f9c3cca1d126,-59,-96,38.487306,5044892d3dcfc3f7569fd304b904c998cac1407d


gyroscope


Unnamed: 0,timestamp,x,y,z
0,1560911459421,-0.269684,0.128006,0.199692
1,1560911459440,-0.26358,0.217194,0.232681
2,1560911459459,-0.115143,0.290497,0.267502
3,1560911459478,-0.022888,0.145111,0.207031
4,1560911459497,-0.016785,-0.063797,-0.027542


gyroscope_uncalibrated


Unnamed: 0,timestamp,x,y,z
0,1560911459421,-0.215027,0.189972,0.122772
1,1560911459440,-0.356751,0.166153,0.184464
2,1560911459459,-0.350647,0.255341,0.217453
3,1560911459478,-0.202209,0.328644,0.252274
4,1560911459497,-0.109955,0.183258,0.191803


magnetic_field


Unnamed: 0,timestamp,x,y,z
0,1560911459421,-24.42,5.7,-37.86
1,1560911459440,-24.42,5.7,-37.68
2,1560911459459,-24.24,5.7,-37.62
3,1560911459478,-24.119999,5.76,-37.559998
4,1560911459497,-24.119999,5.76,-37.559998


magnetic_field_uncalibrated


Unnamed: 0,timestamp,x,y,z
0,1560911459421,-34.91211,-33.48694,-108.91876
1,1560911459440,-34.199524,-33.48694,-109.61304
2,1560911459459,-34.199524,-32.774353,-109.61304
3,1560911459478,-33.48694,-32.774353,-109.61304
4,1560911459497,-33.48694,-32.774353,-109.61304


rotation_vector


Unnamed: 0,timestamp,x,y,z
0,1560911459421,0.106812,-0.028381,-0.513611
1,1560911459440,0.105286,-0.026245,-0.511902
2,1560911459459,0.104004,-0.023315,-0.509766
3,1560911459478,0.10437,-0.020508,-0.507202
4,1560911459497,0.10437,-0.020508,-0.507202


waypoint


Unnamed: 0,timestamp,x,y
0,1560911459159,116.5598,50.62876
1,1560911468774,121.30447,52.322544
2,1560911474666,126.846664,55.822445
3,1560911488707,140.46562,61.165134
4,1560911503516,137.9615,68.22719


wifi


Unnamed: 0,timestamp,ssid,bssid,rssi,frequency,last_seen_timestamp
0,1560911461428,00b0b27c281d03e6c6875a76f31c7f4a2b9a2e75,22e9a086504ca6eed024d80304b4bb6849d9a24b,-44,5745,1560911461356
1,1560911461428,d563f3a08713c2142086c20a15526e8a390e853e,8fd9e9b86c3f89296fecf99802aa952e79fb3dfe,-43,5745,1560911461356
2,1560911461428,d563f3a08713c2142086c20a15526e8a390e853e,08613abec4fbc20f9bcd6253cf324ef403aa57b6,-59,2437,1560911461356
3,1560911461428,00b0b27c281d03e6c6875a76f31c7f4a2b9a2e75,db6f9a27675f7a08eebd8f416e0b67369fdabe83,-44,2462,1560911461356
4,1560911461428,d563f3a08713c2142086c20a15526e8a390e853e,925dda07a73c868e62a71c23bb5ad6453cdba587,-44,2462,1560911461356


---

## Create maps for label encoding

In [9]:
# Re-list the raw training files (same listing as in the first section), so
# `filepaths` is defined here even if that earlier cell was not run.
src_dir = pathlib.Path("../data/raw/train/")
filepaths = [
    path_filepath 
    for site_filepath in src_dir.glob("*") 
    for floor_filepath in site_filepath.glob("*") 
    for path_filepath in floor_filepath.glob("*")
]

### Site IDs (one per building)

In [10]:
def create_site_map():
    """Assign a 1-based integer code to every site id in the cached feature stores.

    Reads the notebook-level ``filepaths`` list, loads the pickled feature
    store of each path in parallel and numbers the sorted unique site ids
    starting from 1.

    Returns:
        dict: site id -> integer code (1, 2, ...).
    """
    def _read_site_id(raw_file):
        # The cached pickle is keyed by the path id, i.e. the file name up to
        # its first ".".
        path_id = raw_file.name.split(".")[0]
        store = load_pickle(f"../data/working/{path_id}.pkl", verbose=False)
        return store.site_id

    jobs = (delayed(_read_site_id)(raw_file) for raw_file in track(filepaths))
    site_ids = Parallel(n_jobs=-1)(jobs)

    codes = {}
    for code, site_id in enumerate(np.unique(site_ids), start=1):
        codes[site_id] = code
    return codes

In [11]:
%%time
# Build the site id -> integer code map over all listed paths.
siteId_map = create_site_map()

Output()

In [None]:
# Cache the site map; create_build() reads this file back.
# NOTE(review): this cell carries no execution count, so make sure it has been
# run before create_build() is called.
dump_pickle("./tmp/map_site_ids.pkl", siteId_map)

### bssid of wifi

In [109]:
def get_bssid_from_feature_store(filepath):
    """Return the distinct wifi BSSIDs recorded for one path.

    Args:
        filepath: path object of a raw file; the part of its name before the
            first "." selects the cached feature store to load.

    Returns:
        Array of unique ``bssid`` values from the path's wifi table, or an
        empty NumPy array when there are none.
    """
    path_id = filepath.name.split(".")[0]
    store = load_pickle(f"../data/working/{path_id}.pkl", verbose=False)

    seen = store.wifi.bssid.unique()
    # Hand back a plain empty array when the path has no wifi rows.
    return seen if len(seen) > 0 else np.array([])

In [110]:
%%time
# Gather the distinct BSSIDs of every path in parallel (one array per path).
bssid = Parallel(n_jobs=-1)(delayed(get_bssid_from_feature_store)(filepath) for filepath in track(filepaths))

# Flatten to one array, then de-duplicate across paths.
bssid = np.concatenate(bssid, axis=0)
unique_bsid = np.unique(bssid)

# Number the *unique* BSSIDs from 1 (create_wifi pads its sequences with 0).
# Enumerating the concatenated array instead would reuse positions of repeated
# values: codes would be sparse and as large as the total number of rows.
bssid_map = {_bssid: i + 1 for i, _bssid in enumerate(unique_bsid)}
dump_pickle("./tmp/map_bssid.pkl", bssid_map)

Output()

CPU times: user 16.4 s, sys: 3.01 s, total: 19.4 s
Wall time: 1min 33s


---

## Simple feature engineering

In [147]:
# Re-list the raw training files (same listing as in the earlier sections), so
# `filepaths` is defined here even if those cells were not run.
src_dir = pathlib.Path("../data/raw/train/")
filepaths = [
    path_filepath 
    for site_filepath in src_dir.glob("*") 
    for floor_filepath in site_filepath.glob("*") 
    for path_filepath in floor_filepath.glob("*")
]

### Create waypoint

In [164]:
%%time
def create_waypoint(filepaths: List):
    """Combine the waypoint tables of all paths into one sorted DataFrame.

    Args:
        filepaths: path objects of the raw files; the part of each file name
            before the first "." selects the cached feature store to load.

    Returns:
        DataFrame of all waypoints with added ``site``, ``floor`` and ``path``
        columns, ordered by path and then timestamp, with a fresh 0..n-1 index.
    """
    def _load_path_waypoints(raw_file):
        path_id = raw_file.name.split(".")[0]
        store = load_pickle(f"../data/working/{path_id}.pkl", verbose=False)

        frame = store['waypoint']
        # Tag every row with the identifiers of the path it belongs to.
        frame['site'] = store.site_id
        frame['floor'] = store.n_floor
        frame['path'] = store.path_id

        # Paths without waypoints contribute a bare empty frame.
        if len(frame) == 0:
            return pd.DataFrame([])
        return frame

    per_path = Parallel(n_jobs=-1)(
        delayed(_load_path_waypoints)(raw_file) for raw_file in track(filepaths)
    )
    stacked = pd.concat(per_path, axis=0).reset_index(drop=True)
    return stacked.sort_values(by=['path', 'timestamp']).reset_index(drop=True)

# Build the combined waypoint table for all training paths.
waypoint = create_waypoint(filepaths)

Output()

CPU times: user 1min 11s, sys: 3.12 s, total: 1min 14s
Wall time: 1min 47s


In [190]:
# Cache the combined waypoint table; create_wifi() reads this file back.
dump_pickle('./tmp/train_waypoint.pkl', waypoint)

Dump pickle to ./tmp/train_waypoint.pkl


In [191]:
# Preview the first rows of the combined waypoint table.
waypoint.head()

Unnamed: 0,timestamp,x,y,site,floor,path
0,1558318437984,59.747032,244.61037,5cd969ba39e2fc0b4afe6fae,0,5ce215bc2d50640008bf22e2
1,1558318450619,55.36007,256.0339,5cd969ba39e2fc0b4afe6fae,0,5ce215bc2d50640008bf22e2
2,1558318459913,49.93607,254.65387,5cd969ba39e2fc0b4afe6fae,0,5ce215bc2d50640008bf22e2
3,1558318468827,44.06084,252.6999,5cd969ba39e2fc0b4afe6fae,0,5ce215bc2d50640008bf22e2
4,1558318550185,47.224,245.42503,5cd969ba39e2fc0b4afe6fae,0,5ce215be915519000851776a


### Create build info

In [50]:
%%time
def create_build(filepaths: List):
    """Collect the site code and site dimensions for every path.

    Args:
        filepaths: path objects of the raw files; the part of each file name
            before the first "." selects the cached feature store to load.

    Returns:
        list: one ``(site code, site height, site width)`` tuple per entry of
        ``filepaths``; the code is looked up in ./tmp/map_site_ids.pkl.
    """
    site_map = load_pickle("./tmp/map_site_ids.pkl", verbose=False)

    def _read_site_dims(raw_file):
        path_id = raw_file.name.split(".")[0]
        store = load_pickle(f"../data/working/{path_id}.pkl", verbose=False)
        info = store.site_info
        return site_map[store.site_id], info.site_height, info.site_width

    jobs = (delayed(_read_site_dims)(raw_file) for raw_file in track(filepaths))
    return Parallel(n_jobs=-1)(jobs)

# One (site code, height, width) tuple per path ...
results = create_build(filepaths)

# ... transposed into three parallel tuples.
site_id, site_height, site_width = zip(*results)

Output()

CPU times: user 19.1 s, sys: 2.27 s, total: 21.4 s
Wall time: 1min 28s


In [56]:
# Convert the per-path tuples to typed NumPy arrays (int32 codes, float32 sizes).
site_id = np.array(site_id, dtype='int32')
site_height = np.array(site_height, dtype='float32')
site_width = np.array(site_width, dtype='float32')

In [58]:
# Inspect the encoded site codes (one entry per path).
site_id

array([199, 199, 199, ...,  96,  96,  96], dtype=int32)

### Create wifi 

In [16]:
%%time

def create_wifi(seq_len=100, max_waypoints=5000):
    """Build fixed-length wifi sequences for every waypoint.

    For each waypoint, the wifi rows whose timestamp lies strictly between the
    previous and the next waypoint of the same path are ranked by RSSI
    (strongest first). The top ``seq_len`` rows are kept and shorter lists are
    padded (0 for bssid, -999 for rssi and frequency).

    Args:
        seq_len: number of wifi entries kept per waypoint (default 100).
        max_waypoints: only the first ``max_waypoints`` rows of the waypoint
            table are processed (default 5000, as before). Pass ``None`` to
            process every waypoint.

    Returns:
        list: one ``(bssid, rssi, freq)`` tuple per path. Each element is a
        list with one length-``seq_len`` array per waypoint of that path, in
        the order the waypoints appear in the table.
    """
    def get_wifi_feature(path_id, gdf):
        feature = load_pickle(f"../data/working/{path_id}.pkl", verbose=False)
        wifi = feature.wifi.copy()
        wifi["bssid"] = wifi["bssid"].map(bssid_map)

        # Loop-invariant: wifi timestamps and the path's waypoint timestamps.
        ts_wifi = wifi["timestamp"].values
        ts_wp = gdf["timestamp"].to_numpy()
        n_wp = len(ts_wp)

        bssid = []
        rssi = []
        freq = []

        # Walk the waypoints by position, so the neighbour lookup does not
        # depend on the group's index labels being consecutive integers.
        for pos in range(n_wp):
            ts_pre_wp = ts_wp[pos - 1] if pos > 0 else None
            # Every waypoint except the last one has a successor. The previous
            # `(i + 1) < max_idx` test also dropped the bound for the
            # second-to-last waypoint.
            ts_post_wp = ts_wp[pos + 1] if pos + 1 < n_wp else None

            # NOTE: keep the wifi rows lying between the waypoints directly
            # before and after the target waypoint (open on a missing side).
            in_window = np.ones(len(ts_wifi), dtype=bool)
            if ts_pre_wp is not None:
                in_window &= ts_pre_wp < ts_wifi
            if ts_post_wp is not None:
                in_window &= ts_wifi < ts_post_wp

            # Boolean indexing already yields a new frame; no extra copy needed.
            _wifi = wifi[in_window]
            _wifi = _wifi.sort_values(by="rssi", ascending=False)
            _wifi = _wifi.head(seq_len)

            # Padded output buffers (dtypes unchanged from the original code).
            _bssid = np.zeros(seq_len)
            _rssi = np.tile(-999, seq_len)
            _freq = np.tile(-999, seq_len)

            _bssid[:len(_wifi)] = _wifi["bssid"].astype("int32").to_numpy()
            _rssi[:len(_wifi)] = _wifi["rssi"].astype("float32").to_numpy()
            _freq[:len(_wifi)] = _wifi["frequency"].astype("float32").to_numpy()

            bssid.append(_bssid)
            rssi.append(_rssi)
            freq.append(_freq)

        return bssid, rssi, freq

    waypoint = load_pickle("./tmp/train_waypoint.pkl", verbose=False)
    bssid_map = load_pickle("./tmp/map_bssid.pkl", verbose=False)

    # Row cap kept for quick iterations. It can cut the last path short, in
    # which case that path's final visible waypoint gets an open right bound.
    if max_waypoints is not None:
        waypoint = waypoint.head(max_waypoints)

    results = Parallel(n_jobs=-1)(
        delayed(get_wifi_feature)(path_id, gdf)
        for path_id, gdf in track(waypoint.groupby("path"))
    )
    return results

# NOTE(review): create_wifi() loads both pickles itself; these two loads only
# refresh the notebook-level variables of the same names.
waypoint = load_pickle("./tmp/train_waypoint.pkl", verbose=False)
bssid_map = load_pickle("./tmp/map_bssid.pkl", verbose=False)

# One (bssid, rssi, freq) tuple per path.
results = create_wifi()

Output()

In [19]:
# Split the per-path results into three tuples, then stack each along axis 0
# so every array has one row per waypoint (each row being one of the
# fixed-length vectors built in create_wifi).
bssid, rssi, freq = zip(*results)
bssid = np.concatenate(bssid, axis=0)
rssi = np.concatenate(rssi, axis=0)
freq = np.concatenate(freq, axis=0)

In [6]:
# Reload the cached waypoint table.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so
# ./tmp/train_waypoint.pkl must exist relative to the kernel's working
# directory before this cell is run.
waypoint = load_pickle("./tmp/train_waypoint.pkl", verbose=False)

FileNotFoundError: [Errno 2] No such file or directory: './tmp/train_waypoint.pkl'

In [None]:
# Show the first waypoint row. The table has no column labelled 0, so the
# label lookup `waypoint[0]` raises KeyError; select by position instead.
waypoint.iloc[0]

In [None]:
# Reload one cached feature store for ad-hoc inspection.
# NOTE(review): relies on the notebook-level `path_id` set by an earlier cell.
feature = load_pickle(f"../data/working/{path_id}.pkl", verbose=False)