In [1]:

import sys
import pathlib
import numpy as np
import pandas as pd
from rich.progress import track
from joblib import Parallel, delayed

import matplotlib.pyplot as plt
%matplotlib inline


import torch
from torch.utils.data import Dataset, DataLoader

sys.path.append("../src")
from utils.feature import FeatureStore
from utils.common import load_pickle, dump_pickle, save_cache

In [2]:
# NOTE(review): "featfure_dir" looks like a typo for "feature_dir"; the name is
# kept as-is because other cells may reference it.
featfure_dir = pathlib.Path("../data/preprocessing/")

# Encoded BSSID arrays for train (preprocessing dir) and test (submit dir).
wifi_bssid = np.load(featfure_dir / "train_wifi_bssid.npy")
test_wifi_bssid = np.load("../data/submit/test_wifi_bssid.npy")

# Pickled scaler, presumably fitted on Wi-Fi RSSI values (per the file name).
scaler = load_pickle("../data/scaler/scaler_wifi_rssi.pkl")


Load pickle from ../data/scaler/scaler_wifi_rssi.pkl


BSSID のラベルエンコーディングが本当に意図した通りに処理できているのかを確認する

- 前提
    - Trainに存在するbssidはtestに存在するものをすべて含んでいる
- 処理内容
    - train の bssid をすべて取得する
    - bssid の map を作成する
    - 前処理の途中でlabel encodeを行う

## 前提の確認

In [3]:
@save_cache("./tmp/train_unique_bssid.pkl", True)
def create_bssid_map(filepaths):
    """Collect the unique BSSIDs observed across the given training paths.

    For each entry of ``filepaths`` the path id (file name up to the first
    ".") is used to load ``../data/working/{path_id}.pkl``. The unique values
    of ``feature.wifi.bssid`` from every path are concatenated and
    de-duplicated with ``np.unique``.

    Parameters
    ----------
    filepaths : iterable of pathlib.Path
        Path files to process; only the file name of each entry is used.

    Returns
    -------
    numpy.ndarray
        Sorted array of unique BSSIDs over all paths.

    Raises
    ------
    ValueError
        If ``filepaths`` yields no entries. ``np.concatenate`` would raise
        the same exception type with a less helpful message.

    Notes
    -----
    NOTE(review): ``save_cache`` presumably stores the return value in
    ``./tmp/train_unique_bssid.pkl`` and reuses it on later calls -- confirm.
    """

    def get_bssid_from_featureStore(filepath):
        # Only the path id is needed to locate the pickled feature object.
        path_id = filepath.name.split(".")[0]

        feature = load_pickle(f"../data/working/{path_id}.pkl", verbose=False)
        uniques = feature.wifi.bssid.unique()
        if len(uniques) > 0:
            return uniques
        # Paths without any Wi-Fi BSSID contribute an empty array.
        return np.array([])

    # One job per path file; joblib returns the per-path arrays as a list.
    bssid = Parallel(n_jobs=-1)(
        delayed(get_bssid_from_featureStore)(filepath) for filepath in track(filepaths)
    )
    if not bssid:
        # Fail loudly so an empty file listing is not mistaken for a result.
        raise ValueError("create_bssid_map: no path files were given")

    return np.unique(np.concatenate(bssid, axis=0))

In [4]:
# Root of the raw training data.
src_dir = pathlib.Path("../data/raw/train/")

# Collect every entry three levels below src_dir; the names suggest the layout
# is <site>/<floor>/<path file>.
filepaths = [
    path_file
    for site_dir in src_dir.glob("*")
    for floor_dir in site_dir.glob("*")
    for path_file in floor_dir.glob("*")
]

# Unique BSSIDs over all collected training paths.
train_uniques = create_bssid_map(filepaths)

Load pickle from ./tmp/train_unique_bssid.pkl


In [5]:
@save_cache("./tmp/test_unique_bssid.pkl", True)
def create_test_bssid(filepaths):
    """Collect the unique BSSIDs observed across the given test paths.

    For each entry of ``filepaths`` the path id (file name up to the first
    ".") is used to load ``../data/submit/path_data/{path_id}.pkl``. The
    unique values of ``feature.wifi.bssid`` from every path are concatenated
    and de-duplicated with ``np.unique``.

    Parameters
    ----------
    filepaths : iterable of pathlib.Path
        Path files to process; only the file name of each entry is used.

    Returns
    -------
    numpy.ndarray
        Sorted array of unique BSSIDs over all paths.

    Raises
    ------
    ValueError
        If ``filepaths`` yields no entries. ``np.concatenate`` would raise
        the same exception type with a less helpful message.

    Notes
    -----
    NOTE(review): ``save_cache`` presumably stores the return value in
    ``./tmp/test_unique_bssid.pkl`` and reuses it on later calls -- confirm.
    """

    def get_bssid_from_featureStore(filepath):
        # Only the path id is needed to locate the pickled feature object.
        path_id = filepath.name.split(".")[0]

        feature = load_pickle(f"../data/submit/path_data/{path_id}.pkl", verbose=False)
        uniques = feature.wifi.bssid.unique()
        if len(uniques) > 0:
            return uniques
        # Paths without any Wi-Fi BSSID contribute an empty array.
        return np.array([])

    # One job per path file; joblib returns the per-path arrays as a list.
    bssid = Parallel(n_jobs=-1)(
        delayed(get_bssid_from_featureStore)(filepath) for filepath in track(filepaths)
    )
    if not bssid:
        # Fail loudly so an empty file listing is not mistaken for a result.
        raise ValueError("create_test_bssid: no path files were given")

    return np.unique(np.concatenate(bssid, axis=0))

In [6]:
# Test path files sit directly inside this directory.
# Note: this rebinds src_dir and filepaths, which earlier held the train values.
src_dir = pathlib.Path("../data/submit/path_data/")
filepaths = list(src_dir.glob("*"))

# Unique BSSIDs over all collected test paths.
test_uniques = create_test_bssid(filepaths)

Load pickle from ./tmp/test_unique_bssid.pkl


In [20]:
# Peek at the first collected path file. The execution count shows this cell
# was run out of order, so the value depends on which listing last set filepaths.
filepaths[0]

PosixPath('../data/submit/path_data/504b8655852f837f2aca36a7.pkl')

In [7]:
# Display the array of unique train BSSIDs.
train_uniques

array(['00001d7b6fbf0a24da65285b686b03c6e796962a',
       '0000fe40d201cfc6cada502b07f29883cd17fe4a',
       '0001092dd27fe270ab0e2a652e21ea6e8320bf33', ...,
       'ffffae79ecb8e184afbaf0f6f763ebf0bc2d49df',
       'ffffb8116ceb5c0326ec2eb039028ec71ffdfbab',
       'ffffcb329c1c354fb4290a8223fd99d6508ed766'], dtype=object)

In [8]:
# Number of distinct BSSIDs seen in train.
len(set(train_uniques))

237452

In [9]:
# Number of distinct BSSIDs seen in test.
len(set(test_uniques))

37779

In [10]:
# Count the BSSIDs that occur in test but never in train.
print('testにのみ存在するBSSIDの数：')
len(set(test_uniques).difference(train_uniques))

testにのみ存在するBSSIDの数：


1407

前提が間違っていた：test にのみ存在する BSSID が 1407 個あるため、train の BSSID は test の BSSID をすべて含んでいるわけではない。