## local feature dataset for LSTMを作成

[WiFi Fingerprinting Indoor Localization Using LocalFeature-Based Deep LSTM](https://ieeexplore.ieee.org/abstract/document/8733822/)

In [1]:
!pip install dask
!python3 -m pip install "dask[distributed]" 

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [22]:
import pandas as pd
import numpy as np
import glob
import os
import gc
import json 
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
import dask
from dask.distributed import Client, wait, LocalCluster

In [23]:
# Root directory of the competition input data. Other cells build paths from it
# by plain string concatenation (e.g. base_path + 'sample_submission.csv'), so
# the trailing slash is significant.
base_path = '../input/indoor-location-navigation/'

In [24]:
# Load the sample submission; its "site_path_timestamp" column has the form
# "<site>_<path>_<timestamp>".
ssubm = pd.read_csv(base_path + 'sample_submission.csv')

# Split the id into integer-labelled columns 0 (site), 1 (path), 2 (timestamp).
# The vectorised string split replaces a per-row pd.Series construction, which
# yields the same frame but is far slower.
ssubm_df = ssubm["site_path_timestamp"].str.split("_", expand=True)

# Sorted list of the distinct site ids that occur in the submission file.
used_buildings = sorted(ssubm_df[0].dropna().unique().tolist())

# Floor directory name -> integer floor index. Both spellings ("F1" and "1F")
# map to the same value; the first above-ground floor is 0 and basements are
# negative.
floor_map = {"B2":-2, "B1":-1, "F1":0, "F2": 1, "F3":2, "F4":3, "F5":4, "F6":5, "F7":6,"F8":7, "F9":8,
             "1F":0, "2F":1, "3F":2, "4F":3, "5F":4, "6F":5, "7F":6, "8F": 7, "9F":8}

In [25]:
def to_pickle(filename, obj):
    """Serialize ``obj`` with pickle and write it to ``filename`` (binary, overwriting)."""
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(obj)
        
def from_pickle(filename):
    """Read ``filename`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code; only use on trusted files.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

In [8]:
# Maximum |last_seen_timestamp - waypoint timestamp| (milliseconds) for a WiFi
# record to be attached to a waypoint.
TIME_DIFF_THRESHOLD = 5000
# Width of one time bin in milliseconds.
WINDOW_RANGE = 1000
# The sample-building cells refer to the bin width as NUM_WINDOW (it is the step
# of the bin edges and part of the output directory name). It was never
# defined, so define it here as an alias to avoid a NameError on a fresh kernel.
NUM_WINDOW = WINDOW_RANGE
# WIFI_NUM = 100 # number of WiFi records to extract (no longer used)
# Minimum number of WiFi records per waypoint; waypoints with fewer than this
# many records are skipped.
WIFI_MIN_NUM = 10

100%|██████████| 24/24 [20:15<00:00, 50.63s/it]

CPU times: user 19min 56s, sys: 9.71 s, total: 20min 5s
Wall time: 20min 15s





## site内で50回を超えて登場するWiFi（BSSID）を抽出

In [26]:
# %%time

# TODO: filter by how often a BSSID appears near waypoints rather than by its
# overall number of appearances.
# A BSSID is kept only if it appears MORE than this many times within a site.
WIFI_USE_COUT = 50


def _frequent_bssids(txt_files, min_count):
    """Return the BSSIDs that occur more than ``min_count`` times in ``txt_files``.

    Each file is read line by line; a line is split on whitespace and counted
    when its second field is "TYPE_WIFI" (the BSSID is the fourth field).
    The result is ordered by descending frequency (pandas ``value_counts``
    order). An empty list is returned when no WiFi line is found.
    """
    seen = []
    for txt_file in txt_files:
        with open(txt_file, encoding='utf-8') as f:
            for raw_line in f:
                fields = raw_line.strip().split()
                # Skip blank/short lines instead of failing on fields[1].
                if len(fields) > 3 and fields[1] == "TYPE_WIFI":
                    seen.append(fields[3])
    # Count only the BSSID column; no need to hold every full row in memory.
    counts = pd.Series(seen, dtype=object).value_counts()
    return counts[counts > min_count].index.tolist()


# site id -> list of frequent BSSIDs. The sample-building cells index this as
# bssid[site] and pass the list to isin()/reindex(), so the value must be a
# flat list per site (not nested per floor).
bssid = dict()

for building in used_buildings:
    print(building)
    building_files = []
    for folder in sorted(glob.glob(os.path.join(base_path,'train/'+building+'/*'))):
        building_files.extend(glob.glob(os.path.join(folder, "*.txt")))
    bssid[building] = _frequent_bssids(building_files, WIFI_USE_COUT)
    gc.collect()

In [27]:
# One-off step (left disabled): write the `bssid` dict to the JSON cache.
# with open(f"../input/kuto_wifi_dataset_v4/bssid_{WIFI_USE_COUT}.json", "w") as f:
#     json.dump(bssid, f)

# Replace `bssid` with the contents of the JSON cache for this WIFI_USE_COUT.
with open(f"../input/kuto_wifi_dataset_v4/bssid_{WIFI_USE_COUT}.json") as f:
    bssid = json.loads(f.read())

## train dataの作成

In [9]:
# One row per saved training sample, built from the list of metadata dicts.
# NOTE(review): `all_train_list` is filled by the sample-generation loop that
# appears further down in this notebook, so on a fresh "Restart & Run All" this
# cell only works if it is executed after that loop.
all_train_df = pd.DataFrame(data=all_train_list)
all_train_df

Unnamed: 0,site,path,timestamp,file_name,floor,floor_str,x,y
0,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466132778,5a0546857ecc773753327266_5e1580d1f4c3420006d52...,-1,B1,114.33501,156.84224
1,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466149574,5a0546857ecc773753327266_5e1580d1f4c3420006d52...,-1,B1,106.65901,154.62952
2,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466158395,5a0546857ecc773753327266_5e1580d1f4c3420006d52...,-1,B1,102.16824,158.42908
3,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466166621,5a0546857ecc773753327266_5e1580d1f4c3420006d52...,-1,B1,107.85044,161.89262
4,5a0546857ecc773753327266,5e1580bb1506f2000638fc62,1578466886458,5a0546857ecc773753327266_5e1580bb1506f2000638f...,-1,B1,41.316772,180.0171
...,...,...,...,...,...,...,...,...
74636,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892854685,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...,6,F7,117.17671,99.23578
74637,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892859436,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...,6,F7,122.26995,102.66496
74638,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892863738,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...,6,F7,126.63109,107.01164
74639,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892868972,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...,6,F7,131.79286,111.526085


In [8]:
%%time

DUMMY_RECORD = [9999999999999, 'DUMMY', 'NONE', 'NONE', -999, 0, 9999999999999, 99999, 99999] # waypointに紐づくWiFi数が100件に満たない場合に穴埋めするためのダミー

train_output_dir = f"../input/kuto_wifi_dataset_v4/train/{TIME_DIFF_THRESHOLD}_{WIFI_MIN_NUM}_{NUM_WINDOW}/"
os.makedirs(train_output_dir, exist_ok=True)

# train_save_file_name = f'train_{TIME_DIFF_THRESHOLD}_{WIFI_MIN_NUM}_{WIFI_USE_COUT}.pkl'
# test_save_file_name = f'test_{TIME_DIFF_THRESHOLD}_{WIFI_MIN_NUM}_{WIFI_USE_COUT}.pkl'

train_wifi_count = []
rows = []
all_train_list = []
for building in tqdm(used_buildings):
    bssids = bssid[building]
        
    for folder in sorted(glob.glob(os.path.join(base_path,'train', building +'/*'))):

        floor_str = folder.split('/')[-1]
        floor = floor_map[floor_str]
        
        for file in glob.glob(os.path.join(folder, "*.txt")):
            path = file.split('/')[-1].split('.')[0]
            wifi = list()
            waypoint = list()
            
            with open(file, encoding='utf-8') as f:
                txt = f.readlines()
                
            for line in txt:
                line = line.strip().split()
                if line[1] == "TYPE_WAYPOINT":
                    waypoint.append(line)
                if line[1] == "TYPE_WIFI":
                    wifi.append(line)

            if len(wifi) <= 0:
                continue
                    
            wifi_base_df = pd.DataFrame(np.array(wifi), columns=['timestamp', 'data_type', 'ssid', 'bssid', 'rssi', 'frequency', 'last_seen_timestamp'])

            for wp_row in waypoint:
                wifi_df = wifi_base_df.copy()
                
                wifi_df = wifi_df[wifi_df['bssid'].isin(bssids)]
                
                wp_timestamp = wp_row[0]

                wifi_df['time_diff'] = wifi_df['last_seen_timestamp'].astype(np.float) - int(wp_timestamp)
                wifi_df['time_diff_abs'] = wifi_df['time_diff'].abs()
                
                # waypointに近いWiFi(last_seen_timestamp)のみ抽出
                wifi_df = wifi_df[np.abs(wifi_df['time_diff']) < TIME_DIFF_THRESHOLD] 
      
                # 同一 SSID * BSSIDのうちtime_diffが一番小さいもののみを抽出する
#                 wifi_df = wifi_df.sort_values('time_diff_abs')
#                 wifi_df = wifi_df.groupby(['ssid', 'bssid']).head(1).reset_index(drop=True)

                wifi_count = len(wifi_df)
                # train_wifi_count.append(wifi_count)
                
                # WiFiが最低件数に満たない場合はスキップする
                if wifi_count < WIFI_MIN_NUM:
                    continue
                
                # WiFi件数に満たない行を埋める
#                 dummy_count = WIFI_NUM - wifi_count
#                 if dummy_count > 0:
#                     dummy_df = pd.DataFrame(np.tile(DUMMY_RECORD, (dummy_count, 1)), 
#                                             columns=['timestamp', 'data_type', 'ssid', 'bssid', 'rssi', 'frequency', 'last_seen_timestamp', 'time_diff', 'time_diff_abs'])
#                     wifi_df = pd.concat([wifi_df, dummy_df])
                    
                # wifi_df = wifi_df.head(100)
                
#                 row = np.concatenate([wifi_df['ssid'].to_numpy(),
#                                       wifi_df['bssid'].to_numpy(), 
#                                       wifi_df['rssi'].to_numpy(), 
#                                       wifi_df['frequency'].to_numpy(), 
#                                       wifi_df['time_diff'].to_numpy(),
#                                       [wp_row[0], # waypoint timestamp
#                                       wp_row[2], # x
#                                       wp_row[3], # y
#                                       floor, # floor number
#                                       floor_str, # floor string
#                                       file.split('/')[-1].split('.')[0], # path_id
#                                       building, # site_id
#                                       wifi_count]
#                                      ])
                # rows.append(row)

                # 以下追加
                col = [i for i in range(-TIME_DIFF_THRESHOLD, TIME_DIFF_THRESHOLD, NUM_WINDOW)]
                wifi_df["time_cat"] = pd.cut(wifi_df['time_diff'], col, labels=col[1:])
                wifi_df["rssi"] = wifi_df["rssi"].astype(int)
                sample_df = wifi_df.groupby(["bssid", "time_cat"])["rssi"].mean().reset_index().pivot(index="bssid", columns="time_cat", values="rssi").reindex(index=bssids, columns=col[1:])

                # sampleを保存
                file_name = f'{building}_{path}_{wp_row[0]}'

                np.save(train_output_dir + file_name, sample_df.values)
                sample_dict = {'site':building, 'path':path, 'timestamp':wp_row[0], 'file_name':file_name, 'floor':floor, 'floor_str':floor_str, 'x':wp_row[2], 'y':wp_row[3]}
                all_train_list.append(sample_dict)

100%|██████████| 24/24 [20:15<00:00, 50.63s/it]

CPU times: user 19min 56s, sys: 9.71 s, total: 20min 5s
Wall time: 20min 15s





In [10]:
# Write the training-sample metadata next to the .npy files (no index column).
all_train_df.to_csv(os.path.join(train_output_dir, 'train.csv'), index=False)

In [11]:
# sampleを取り出す
sample_file_name = all_train_df['file_name'].sample(1).values[0]
sample_file_path = train_output_dir + sample_file_name + '.npy'
sample = np.load(sample_file_path)
sample.shape

(2838, 9)

In [12]:
# Display the array loaded in the previous cell.
sample

array([[ nan,  nan,  nan, ...,  nan,  nan, -91.],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan, -91.],
       ...,
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan]])

## test dataの作成  
※ testのpathファイルにWiFiが存在しないデータはtimestampが不明であるため除外する  

In [28]:
# Load the submission rows annotated with an `is_real_timestamp` flag and keep
# only the flagged rows; the final expression is the number of rows dropped.
realtime_sample_submission_df = pd.read_csv('real_timestamp_sample_submission_v2.csv')
test_len = len(realtime_sample_submission_df)

realtime_sample_submission_df = realtime_sample_submission_df.loc[
    realtime_sample_submission_df['is_real_timestamp']
]
test_len - len(realtime_sample_submission_df)

493

493件は別のデータセットで学習したモデルで予測する必要あり。

In [29]:
# Split "<site>_<path>_<timestamp>" into three columns. The vectorised string
# split replaces building one pd.Series per row, which gives the same frame but
# is much slower.
site_path_time_df = realtime_sample_submission_df['site_path_timestamp'].str.split('_', expand=True)
site_path_time_df.columns = ['site_id', 'path_id', 'waypoint_timestamp']
# Keep the original id alongside its parts (aligned on the shared index).
site_path_time_df['site_path_timestamp'] = realtime_sample_submission_df['site_path_timestamp']
site_path_time_df.head(3)

Unnamed: 0,site_id,path_id,waypoint_timestamp,site_path_timestamp
0,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05,1573190310863,5da1389e4db8ce0c98bd0547_00ff0c9a71cc37a2ebdd0...
1,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05,1573190314901,5da1389e4db8ce0c98bd0547_00ff0c9a71cc37a2ebdd0...
2,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05,1573190323379,5da1389e4db8ce0c98bd0547_00ff0c9a71cc37a2ebdd0...


In [30]:
def create_wifi_df(path_id, base_dir=None):
    """Return the TYPE_WIFI records of one test path file as a DataFrame.

    Parameters
    ----------
    path_id : str
        Name (without extension) of the file ``<base_dir>/test/<path_id>.txt``.
    base_dir : str, optional
        Directory that contains the ``test`` folder. Defaults to the
        notebook-level ``base_path``.

    Returns
    -------
    pandas.DataFrame
        One row per line whose second whitespace-separated field is
        "TYPE_WIFI", with string-valued columns timestamp, data_type, ssid,
        bssid, rssi, frequency and last_seen_timestamp. A file without WiFi
        lines yields an empty frame that still has these columns.
    """
    if base_dir is None:
        base_dir = base_path

    columns = ['timestamp', 'data_type', 'ssid', 'bssid', 'rssi', 'frequency', 'last_seen_timestamp']
    wifi = list()

    with open(os.path.join(base_dir, 'test', f'{path_id}.txt'), encoding='utf-8') as f:
        for raw_line in f:
            fields = raw_line.strip().split()
            # Skip blank/short lines instead of failing on fields[1].
            if len(fields) > 1 and fields[1] == "TYPE_WIFI":
                wifi.append(fields)

    # Building from the list (not np.array) keeps the 7 columns even when
    # `wifi` is empty.
    return pd.DataFrame(wifi, columns=columns)

In [31]:
%%time


test_output_dir = f"../input/kuto_wifi_dataset_v4/test/{TIME_DIFF_THRESHOLD}_{WIFI_MIN_NUM}_{NUM_WINDOW}/"
os.makedirs(test_output_dir, exist_ok=True)

test_wifi_count = []
# test_rows = []
all_test_list = []
for path_id, path_df in tqdm(site_path_time_df.groupby('path_id')):
    
    site_id = path_df.iloc[0]['site_id']
    bssids = bssid[site_id]
    
    wifi_base_df = create_wifi_df(path_id)
    wifi_base_df = wifi_base_df[wifi_base_df['bssid'].isin(bssids)] # 指定の件数以下の登場回数のWiFiは除外
    
    for i, path_row in path_df.iterrows():
        wifi_df = wifi_base_df.copy()
        site_path_timestamp = path_row['site_path_timestamp']
        waypoint_timestamp = path_row['waypoint_timestamp']
        
        wifi_df['time_diff'] = wifi_df['last_seen_timestamp'].astype(np.float) - int(waypoint_timestamp)
        wifi_df['time_diff_abs'] = wifi_df['time_diff'].abs()
        
        wifi_df = wifi_df[wifi_df['time_diff_abs'] < TIME_DIFF_THRESHOLD]
        

                    
#         wifi_df = wifi_df.sort_values('time_diff_abs')
#         wifi_df = wifi_df.groupby(['ssid', 'bssid']).head(1).reset_index(drop=True)
        
        wifi_count = len(wifi_df)

        # WiFiが最低件数に満たない場合はスキップする  
        # wifiが0件のものがあるのはなぜ？
        if wifi_count < WIFI_MIN_NUM:
            print(path_id, waypoint_timestamp)
            continue

#         test_wifi_count.append(wifi_count)
        
#         dummy_count = WIFI_NUM - wifi_count
#         if dummy_count > 0:
#             dummy_df = pd.DataFrame(np.tile(DUMMY_RECORD, (dummy_count, 1)), 
#                                     columns=['timestamp', 'data_type', 'ssid', 'bssid', 'rssi', 'frequency', 'last_seen_timestamp', 'time_diff', 'time_diff_abs'])

#         wifi_df = pd.concat([wifi_df, dummy_df]).head(WIFI_NUM)

#         test_row = np.concatenate([wifi_df['ssid'].to_numpy(),
#                               wifi_df['bssid'].to_numpy(), 
#                               wifi_df['rssi'].to_numpy(), 
#                               wifi_df['frequency'].to_numpy(), 
#                               wifi_df['time_diff'].to_numpy(),
#                               [waypoint_timestamp, # waypoint timestamp
#                               path_id, # path_id
#                               building, # site_id
#                               site_path_timestamp,
#                               wifi_count]
#                              ])
#         test_rows.append(test_row)

        # 以下追加
        col = [i for i in range(-TIME_DIFF_THRESHOLD, TIME_DIFF_THRESHOLD, int(TIME_DIFF_THRESHOLD/10))]
        wifi_df["time_cat"] = pd.cut(wifi_df['time_diff'], col, labels=col[1:])
        wifi_df["rssi"] = wifi_df["rssi"].astype(int)
        sample_df = wifi_df.groupby(["bssid", "time_cat"])["rssi"].mean().reset_index().pivot(index="bssid", columns="time_cat", values="rssi").reindex(index=bssids, columns=col[1:])

        # sampleを保存
        file_name = f'{building}_{path}_{waypoint_timestamp}'
        np.save(test_output_dir + file_name, sample_df.values)
        sample_dict = {'site':building, 'path':path, 'timestamp':wp_row[0], 'file_name':file_name}
        all_test_list.append(sample_dict)

 15%|█▍        | 86/592 [00:26<03:45,  2.24it/s]

2b4bacedc942ffcb523ff20f 1573792112944
2b4bacedc942ffcb523ff20f 1573792131949
2b4bacedc942ffcb523ff20f 1573792139960


 26%|██▋       | 156/592 [00:48<01:23,  5.22it/s]

472be94f5be907c04c932114 1571300537352


 28%|██▊       | 163/592 [00:51<02:47,  2.56it/s]

4b38e02c59253ce02b52eec5 1573196944334


 70%|███████   | 415/592 [02:10<00:33,  5.25it/s]

b406c5c925f3b64d8972b2c0 1573704597292


 93%|█████████▎| 549/592 [02:53<00:09,  4.43it/s]

eca017e97a25f3e4832fdb91 1573273369889


 99%|█████████▉| 586/592 [03:04<00:01,  4.34it/s]

fc99dca042a1ed67dc80a0fe 1573704836058


100%|██████████| 592/592 [03:06<00:00,  3.18it/s]

CPU times: user 3min 3s, sys: 2.49 s, total: 3min 6s
Wall time: 3min 6s





In [32]:
# One row per saved test sample, built from the list of metadata dicts.
test_df = pd.DataFrame(data=all_test_list)
test_df

Unnamed: 0,site,path,timestamp,file_name
0,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892874306,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...
1,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892874306,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...
2,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892874306,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...
3,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892874306,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...
4,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892874306,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...
...,...,...,...,...
9627,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892874306,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...
9628,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892874306,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...
9629,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892874306,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...
9630,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892874306,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...


In [33]:
# For the most recently built `sample_df`: number of non-NaN time bins per BSSID
# row, tallied by how many rows have each count.
sample_df.notna().sum(axis="columns").value_counts()

0    2710
1      62
3      39
2      27
dtype: int64

In [34]:
# Write the test-sample metadata next to the .npy files (no index column).
test_df.to_csv(os.path.join(test_output_dir, 'test.csv'), index=False)

In [35]:
# sampleを取り出す
sample_file_name = test_df['file_name'].sample(1).values[0]
sample_file_path = test_output_dir + sample_file_name + '.npy'
sample = np.load(sample_file_path)
sample.shape

(2936, 19)

In [36]:
# Display the array loaded in the previous cell.
sample

array([[ nan,  nan, -84., ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       ...,
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan]])