# Raw Data Processing

In [2]:
import os
import glob
import pathlib
import numpy as np
import pandas as pd
from numba import jit
import joblib
from joblib import Parallel, delayed
from rich.progress import track

from typing import Dict

import sys
sys.path.append("../src")

import config

from utils.common import timer
from utils.common import load_pickle, dump_pickle, save_cache
from utils.feature import FeatureStore

## Extract data each data_type from txt file

In [2]:
src_dir = pathlib.Path("../data/raw/train/")
filepaths = [
    path_filepath 
    for site_filepath in src_dir.glob("*") 
    for floor_filepath in site_filepath.glob("*") 
    for path_filepath in floor_filepath.glob("*")
]

In [3]:
filepath = filepaths[0]

site_id = filepath.parent.parent.name
floor = filepath.parent.name
path_id = filepath.name.split(".")[0]

feature = FeatureStore(
    site_id=site_id, 
    floor=floor, 
    path_id=path_id, 
    input_path="../data/raw/"
)
feature.load_all_data()

In [4]:
feature.save()

In [5]:
# attributes of site info
[c for c in dir(feature.site_info) if c[0]!="_"]

['draw_polygon',
 'features',
 'floor',
 'floor_coordinates',
 'get_site_info',
 'input_path',
 'map_type',
 'show_site_image',
 'site_height',
 'site_id',
 'site_width',
 'store_coordinates']

In [6]:
feature.site_info.site_height, feature.site_info.site_width

(212.06273086548222, 146.74902374734265)

In [7]:
feature.site_id, feature.floor, feature.path_id

('5da138764db8ce0c98bcaa46', 'F4', '5dabfad918410e00067e70ba')

In [8]:
# attributes of feature store
for d_type in feature.df_types:
    print(d_type)
    display(feature[d_type].head())

accelerometer


Unnamed: 0,timestamp,x,y,z,accuracy
0,1571551566693,-2.201767,0.078979,11.103226,2
1,1571551566713,-2.435196,-0.066452,9.657715,2
2,1571551566733,-2.476501,-0.052094,9.203415,2
3,1571551566753,-2.560898,-0.108368,9.472763,2
4,1571551566773,-2.646484,-0.138885,9.753494,2


accelerometer_uncalibrated


Unnamed: 0,timestamp,x,y,z,x2,y2,z2,accuracy
0,1571551566693,-2.028198,0.10173,11.459961,0.0,0.0,0.0,3
1,1571551566713,-2.363373,0.041275,10.371185,0.0,0.0,0.0,3
2,1571551566733,-2.49086,-0.076035,9.227966,0.0,0.0,0.0,3
3,1571551566753,-2.508224,-0.073044,9.332108,0.0,0.0,0.0,3
4,1571551566773,-2.547729,-0.161636,9.611633,0.0,0.0,0.0,3


beacon


Unnamed: 0,timestamp,uuid,major_id,minor_id,tx_power,rssi,distance,mac_addr,timestamp2
0,1571551566648,726388c292214e8a10ee4ae58707e9972ef10261,b6589fc6ab0dc82cf12099d1c2d40ab994e8410c,5d17fd19490538517917e53daa91d76ca198a330,-59,-96,38.487306,d8bfaed2678b820e03dbadf6044cb50489546ca4,1571551566648
1,1571551566663,07efd69e3167537492f0ead89fb2779633b04949,b6589fc6ab0dc82cf12099d1c2d40ab994e8410c,e9685cf33b27028baae03480e332e453ace2abfb,-59,-61,1.274439,4e4198d4d7a0591fe48f96866c618e44529d149e,1571551566663
2,1571551566672,07efd69e3167537492f0ead89fb2779633b04949,b6589fc6ab0dc82cf12099d1c2d40ab994e8410c,78b6e7d58093116dc47562f8f18de790420a52be,-59,-96,38.487306,f04318afc2e7152a9ce8a0dc2e3a424e53cc9cd3,1571551566672
3,1571551567914,07efd69e3167537492f0ead89fb2779633b04949,b6589fc6ab0dc82cf12099d1c2d40ab994e8410c,9fa8f48b6d90959002600d2528fc31ef5e005e76,-59,-74,5.270232,6b35396ab63cf1bf0f5ba307abd620d119c5c42a,1571551567914
4,1571551569380,726388c292214e8a10ee4ae58707e9972ef10261,b6589fc6ab0dc82cf12099d1c2d40ab994e8410c,d3ec16d1868edc7da81be9fc203f819281c0e4ba,-59,-80,9.521558,3eee0b39d4d266756b2ee5eae6471fef8f034c03,1571551569380


gyroscope


Unnamed: 0,timestamp,x,y,z,accuracy
0,1571551566693,0.177216,-0.329681,-0.150436,3
1,1571551566713,0.32901,-0.111832,-0.017807,3
2,1571551566733,0.285339,0.205093,0.127609,3
3,1571551566753,0.169754,0.386719,0.215485,3
4,1571551566773,0.027542,0.324402,0.224014,3


gyroscope_uncalibrated


Unnamed: 0,timestamp,x,y,z,x2,y2,z2,accuracy
0,1571551566693,-0.047821,-0.427567,-0.200974,-0.000793,0.000122,6.1e-05,3
1,1571551566713,0.328217,-0.11171,-0.017746,-0.000793,0.000122,6.1e-05,3
2,1571551566733,0.328217,-0.11171,-0.017746,-0.000793,0.000122,6.1e-05,3
3,1571551566753,0.284546,0.205215,0.12767,-0.000793,0.000122,6.1e-05,3
4,1571551566773,0.026749,0.324524,0.224075,-0.000793,0.000122,6.1e-05,3


magnetic_field


Unnamed: 0,timestamp,x,y,z,accuracy
0,1571551566693,35.313416,0.90332,-27.876282,3
1,1571551566713,37.394714,0.90332,-26.535034,3
2,1571551566733,36.00769,2.290344,-27.204895,3
3,1571551566753,36.701965,0.90332,-27.204895,3
4,1571551566773,35.313416,1.596069,-27.204895,3


magnetic_field_uncalibrated


Unnamed: 0,timestamp,x,y,z,x2,y2,z2,accuracy
0,1571551566693,-6.938171,-81.86188,-363.9801,-42.251587,-82.7652,-336.10382,3
1,1571551566713,-4.856873,-81.86188,-362.63885,-42.251587,-82.7652,-336.10382,3
2,1571551566733,-6.243896,-80.47485,-363.30872,-42.251587,-82.7652,-336.10382,3
3,1571551566753,-5.549622,-81.86188,-363.30872,-42.251587,-82.7652,-336.10382,3
4,1571551566773,-6.938171,-81.16913,-363.30872,-42.251587,-82.7652,-336.10382,3


rotation_vector


Unnamed: 0,timestamp,x,y,z,accuracy
0,1571551566693,-0.033759,0.04746,0.686108,3
1,1571551566713,-0.038705,0.053961,0.685716,3
2,1571551566733,-0.045757,0.061953,0.68655,3
3,1571551566753,-0.053042,0.069828,0.688253,3
4,1571551566773,-0.060194,0.075623,0.690391,3


waypoint


Unnamed: 0,timestamp,x,y
0,1571551566576,37.889812,154.43535
1,1571551573569,27.694906,153.9801
2,1571551581118,16.998966,153.33621


wifi


Unnamed: 0,timestamp,ssid,bssid,rssi,frequency,last_seen_timestamp
0,1571551568450,64f1125cb0e5e507a7ba5c32dd76bd506f30ce94,d2b9915dc73e4d333a718f8c02edae5e2a4d94f5,-52,5785,1571551567798
1,1571551568450,b01857e452a84ace381ac545896264c61ac82a57,5db8a385607a001cae8da5f069e1005f527ae7d6,-53,2437,1571551556712
2,1571551568450,da39a3ee5e6b4b0d3255bfef95601890afd80709,4c1ab193093f7057e6678f8f12f7ac4c05b95680,-53,5785,1571551567800
3,1571551568450,da39a3ee5e6b4b0d3255bfef95601890afd80709,f20391acb21826bb8f38243de772b7f3f8301f83,-53,5785,1571551567800
4,1571551568450,da39a3ee5e6b4b0d3255bfef95601890afd80709,45708a1205fbe53ae5ced9e450e0cedccf96e05a,-54,2437,1571551566923


---

## Create map for label encode

In [80]:
src_dir = pathlib.Path("../data/raw/train/")
filepaths = [
    path_filepath 
    for site_filepath in src_dir.glob("*") 
    for floor_filepath in site_filepath.glob("*") 
    for path_filepath in floor_filepath.glob("*")
]

In [109]:
def get_bssid_from_featureStore(filepath):
    site_id = filepath.parent.parent.name
    floor = filepath.parent.name
    path_id = filepath.name.split(".")[0]
    
    feature = load_pickle(f"../data/working/{path_id}.pkl", verbose=False)
    uniques = feature.wifi.bssid.unique()
    if len(uniques) > 0:
        return uniques
    else:
        return np.array([])

In [110]:
%%time
bssid = Parallel(n_jobs=-1)(delayed(get_bssid_from_featureStore)(filepath) for filepath in track(filepaths))

bssid = np.concatenate(bssid, axis=0)
unique_bsid = np.unique(bssid)

bssid_map = {_bssid: i + 1 for i, _bssid in enumerate(bssid)}
dump_pickle("./tmp/map_bssid.pkl", bssid_map)

Output()

CPU times: user 16.4 s, sys: 3.01 s, total: 19.4 s
Wall time: 1min 33s


---

## Simple feature engineering

### Create waypoint

In [147]:
src_dir = pathlib.Path("../data/raw/train/")
filepaths = [
    path_filepath 
    for site_filepath in src_dir.glob("*") 
    for floor_filepath in site_filepath.glob("*") 
    for path_filepath in floor_filepath.glob("*")
]

In [164]:
%%time
def create_waypoint(filepaths: List):
    def get_waypoint_from_featureStore(filepath):
        path_id = filepath.name.split(".")[0]

        feature = load_pickle(f"../data/working/{path_id}.pkl", verbose=False)
        wp = feature['waypoint']
        wp['site'] = feature.site_id
        wp['floor'] = feature.n_floor
        wp['path'] = feature.path_id
        if len(wp) > 0:
            return wp
        else:
            return pd.DataFrame([])

    waypoint = Parallel(n_jobs=-1)(delayed(get_waypoint_from_featureStore)(filepath) for filepath in track(filepaths))
    waypoint = pd.concat(waypoint, axis=0).reset_index(drop=True)
    waypoint = waypoint.sort_values(by=['path', 'timestamp']).reset_index(drop=True)
    return waypoint

waypoint = create_waypoint(filepaths)

Output()

CPU times: user 1min 11s, sys: 3.12 s, total: 1min 14s
Wall time: 1min 47s


In [190]:
dump_pickle('./tmp/train_waypoint.pkl', waypoint)

Dump pickle to ./tmp/train_waypoint.pkl


In [191]:
waypoint.head()

Unnamed: 0,timestamp,x,y,site,floor,path
0,1558318437984,59.747032,244.61037,5cd969ba39e2fc0b4afe6fae,0,5ce215bc2d50640008bf22e2
1,1558318450619,55.36007,256.0339,5cd969ba39e2fc0b4afe6fae,0,5ce215bc2d50640008bf22e2
2,1558318459913,49.93607,254.65387,5cd969ba39e2fc0b4afe6fae,0,5ce215bc2d50640008bf22e2
3,1558318468827,44.06084,252.6999,5cd969ba39e2fc0b4afe6fae,0,5ce215bc2d50640008bf22e2
4,1558318550185,47.224,245.42503,5cd969ba39e2fc0b4afe6fae,0,5ce215be915519000851776a


### Create wifi 

In [16]:
%%time

def create_wifi():
    def get_wifi_feature(path_id, gdf):
        seq_len = 100
        bssid = []
        rssi = []
        freq = []

        feature = load_pickle(f"../data/working/{path_id}.pkl", verbose=False)
        wifi = feature.wifi.copy()
        wifi["bssid"] = wifi["bssid"].map(bssid_map)

        min_idx = gdf.index.min()
        max_idx = gdf.index.max()

        for i, row in gdf.iterrows():
            ts_pre_wp = gdf.loc[i - 1, "timestamp"] if i > min_idx else None
            ts_current_wp = gdf.loc[i, "timestamp"]
            ts_post_wp = gdf.loc[i + 1, "timestamp"] if (i + 1) < max_idx else None

            _wifi = wifi.copy()
            # NOTE: ターゲットとなるwaypointとその前後のwaypointの間にあるデータを取得する。
            ts_wifi = _wifi["timestamp"].values
            pre_flag = (
                np.ones(len(ts_wifi)).astype(bool)
                if ts_pre_wp == None
                else (ts_pre_wp < ts_wifi)
            )
            psot_flag = (
                np.ones(len(ts_wifi)).astype(bool)
                if ts_post_wp == None
                else (ts_wifi < ts_post_wp)
            )
            _wifi = _wifi[pre_flag & psot_flag]

            _wifi = _wifi.sort_values(by="rssi", ascending=False)
            _wifi = _wifi.head(seq_len)
            
            _bssid = np.zeros(seq_len)
            _rssi = np.tile(-999, seq_len)
            _freq = np.tile(-999, seq_len)
            
            _bssid[:len(_wifi)] = _wifi["bssid"].astype("int32").to_numpy()
            _rssi[:len(_wifi)] = _wifi["rssi"].astype("float32").to_numpy()
            _freq[:len(_wifi)] = _wifi["frequency"].astype("float32").to_numpy()
            
            bssid.append(_bssid)
            rssi.append(_rssi)
            freq.append(_freq)

        return bssid, rssi, freq

    waypoint = load_pickle("./tmp/train_waypoint.pkl", verbose=False)
    bssid_map = load_pickle("./tmp/map_bssid.pkl", verbose=False)
    results = Parallel(n_jobs=-1)(
        delayed(get_wifi_feature)(path_id, gdf)
        for path_id, gdf in track(waypoint.head(5000).groupby("path"))
    )
    return results

waypoint = load_pickle("./tmp/train_waypoint.pkl", verbose=False)
bssid_map = load_pickle("./tmp/map_bssid.pkl", verbose=False)

results = create_wifi()

Output()

In [19]:
bssid, rssi, freq = zip(*results)
bssid = np.concatenate(bssid, axis=0)
rssi = np.concatenate(rssi, axis=0)
freq = np.concatenate(freq, axis=0)

In [20]:
freq

array([[2437, 2412, 2437, ..., 5260, 5260, 2437],
       [2437, 2412, 2412, ..., 5260, 2462, 2412],
       [2412, 2412, 2412, ..., 2412, 2412, 2412],
       ...,
       [2437, 2437, 2437, ..., 2462, 2462, 2462],
       [2437, 2437, 2437, ..., 2462, 2462, 2412],
       [2437, 2462, 2462, ..., 2462, 2462, 2462]])