In [1]:
import pandas as pd
import numpy as np
import glob
import os
import gc
import json
from tqdm.notebook import tqdm

import sys 
sys.path.append("../")
from src.io_f import read_data_file

from scipy import interpolate

In [2]:
base_path = '../input/indoor-location-navigation/'

In [3]:
# pull out all the buildings actually used in the test set, given current method we don't need the other ones
ssubm = pd.read_csv(base_path + 'sample_submission.csv')

# only 24 of the total buildings are used in the test set, 
# this allows us to greatly reduce the intial size of the dataset
ssubm_df = ssubm["site_path_timestamp"].apply(lambda x: pd.Series(x.split("_")))
ssubm_df = ssubm_df.rename(columns={0:'site', 1:'path', 2:'timestamp'})
ssubm_df['timestamp'] = ssubm_df['timestamp'].astype(int)
used_buildings = sorted(ssubm_df['site'].value_counts().index.tolist())

# dictionary used to map the floor codes to the values used in the submission file. 
floor_map = {"B2":-2, "B1":-1, "F1":0, "F2": 1, "F3":2, "F4":3, "F5":4, "F6":5, "F7":6,"F8":7, "F9":8,
             "1F":0, "2F":1, "3F":2, "4F":3, "5F":4, "6F":5, "7F":6, "8F": 7, "9F":8}

In [4]:
# # get only the wifi bssid that occur over 1000 times(this number can be experimented with)
# # these will be the only ones used when constructing features
# bssid = dict()
# for building in tqdm(used_buildings):
#     folders = sorted(glob.glob(os.path.join(base_path,'train/'+building+'/*')))
#     wifi = list()
#     for folder in folders:
#         floor = floor_map[folder.split('/')[-1]]
#         files = glob.glob(os.path.join(folder, "*.txt"))
#         for file in files:
#             with open(file, encoding='utf-8') as f:
#                 txt = f.readlines()
#                 for e, line in enumerate(txt):
#                     tmp = line.strip().split()
#                     if tmp[1] == "TYPE_WIFI":
#                         wifi.append(tmp)
#     df = pd.DataFrame(wifi)
#     #top_bssid = df[3].value_counts().iloc[:500].index.tolist()
#     value_counts = df[3].value_counts()
#     top_bssid = value_counts[value_counts > 100].index.tolist()
#     # print(len(top_bssid))
#     bssid[building] = top_bssid
#     del df
#     del wifi
#     gc.collect()

In [5]:
# with open("bssid_1000.json", "w") as f:
#     json.dump(bssid, f)

In [6]:
with open("bssid_1000.json") as f:
    all_bssid = json.load(f)

In [7]:
def wifi_waypoint_by_linear_interpolation(
        trajectory_timestamp, wifi_timestamp,
        x_observed, y_observed, delta_time=500,
        ): 
    """
    arg: wifi_timestampの数
    return: wifi_waypointの数
    これらが必ず一致する
    """
    wifi_waypoint_list = []
    num_interpolation = len(trajectory_timestamp) - 1  # 補完回数 
    # 各waypoint間で線形補完
    for i in range(num_interpolation):
        # 潜在的なtimestampを作成(これのどれかにwifiを当てはめるような形)
        n_split = int((max(trajectory_timestamp[i:i+2]) - min(trajectory_timestamp[i:i+2])) / delta_time) + 2  # delta_time刻みとなるように分割数を指定(+2は始点と終点分)
        timestamp_latent = np.linspace(min(trajectory_timestamp[i:i+2]), max(trajectory_timestamp[i:i+2]), n_split).astype(int) 
        
        # xが昇順の場合はlatentも昇順になるようにする
        if x_observed[i] < x_observed[i+1]:
            x_latent = np.linspace(min(x_observed[i:i+2]), max(x_observed[i:i+2]), n_split)
        # xが降順の場合はlatentも降順になるようにする
        else:
            x_latent = np.linspace(min(x_observed[i:i+2]), max(x_observed[i:i+2]), n_split)[::-1]
        
        # 線形補完関数の適用
        fitted_curve = interpolate.interp1d(x_observed[i:i+2], y_observed[i:i+2])

        # wifiのtimestampに最も近いものをsplit_timestampから取得しそれに対応するwaypointをwifiのwaypointとして取得
        wifi_x = []
        wifi_y = []

        # 区間内のwifiデータのみ考える
        if i == num_interpolation-1:  # 最後の補完の場合
            # 最後のwaypointより未来のwifiデータがあるようなのでこの処理を追加
            target_idx = min(trajectory_timestamp[i:i+2]) <= wifi_timestamp
        else:
            target_idx = (min(trajectory_timestamp[i:i+2]) <= wifi_timestamp) & (wifi_timestamp < max(trajectory_timestamp[i:i+2]))

        target_wifi_timestamp = wifi_timestamp[target_idx]

        # timestampが最も近いものをwifiのwaypointとして取得
        for t in target_wifi_timestamp:
            idx = np.abs(timestamp_latent - t).argmin()  # wifiデータとtimestampが最も近いものをlatentから取得
            wifi_x.append(x_latent[idx])
            wifi_y.append(fitted_curve(x_latent[idx]))

        wifi_waypoint = np.stack([wifi_x, wifi_y], axis=1)
        wifi_waypoint_list.append(wifi_waypoint)

    wifi_waypoint = np.concatenate(wifi_waypoint_list)
    return wifi_waypoint


def get_wifi_waypoint(site, floor, path, wifi_timestamp):

    path_file = f'../input/indoor-location-navigation/train/{site}/{floor}/{path}.txt'
    example = read_data_file(path_file)

    # waypoint
    trajectory = example.waypoint
    trajectory_timestamp = trajectory[:,0]
    trajectory_waypoint = trajectory[:, 1:]
    x_observed = trajectory_waypoint[:, 0]
    y_observed = trajectory_waypoint[:, 1]

    # 線形補完
    wifi_waypoint = wifi_waypoint_by_linear_interpolation(trajectory_timestamp, wifi_timestamp, x_observed, y_observed) 
    return wifi_waypoint


def grouping_last_seen_timestamp(df):
    # last seen timestampでグループを作成する 
    bins = int((df['last_seen_timestamp'].max() - df['last_seen_timestamp'].min()) / BIN_TIME)
    if bins == 0:
        df['last_seen_timestamp'] = int(df['last_seen_timestamp'].mean())  # まとめたもので上書き
    else:
        s_cut = pd.cut(df['last_seen_timestamp'], bins, labels=[str(i) for i in range(bins)])
        df['group'] = s_cut
        gr = df.groupby('group')
        df['last_seen_timestamp'] = gr.transform('mean')['last_seen_timestamp']  # まとめたもので上書き
        df['last_seen_timestamp'] = df['last_seen_timestamp'].astype(int)
    return df

In [8]:
TIME_DIFF_THR = 5000  # 5s以上ズレている場合は消す
BIN_TIME = 3000  # 3sごとにlast seenの塊を作る
NUM_WIFI = 50   # 揃えるwifiデータ数

In [9]:
output_dir = '../input/kuto_wifi_dataset_v4/'
os.makedirs(output_dir, exist_ok=True)

In [10]:
def create_train_dataset(building):
    folders = sorted(glob.glob(os.path.join(base_path,'train', building +'/*')))
    building_dfs = list()
    index = sorted(all_bssid[building])
    print(building)
    for i, folder in enumerate(tqdm(folders)):
        floor_str = folder.split('/')[-1]
        floor = floor_map[folder.split('/')[-1]]
        
        files = glob.glob(os.path.join(folder, "*.txt"))

        # path内の処理
        for file in files:
            wifi = list()
            waypoint = list()
   
            with open(file, encoding='utf-8') as f:
                txt = f.readlines()
            for line in txt:
                line = line.strip().split()
                if line[1] == "TYPE_WAYPOINT":
                    waypoint.append(line)
                if line[1] == "TYPE_WIFI":
                    wifi.append(line)

            # wifiデータが1つもない場合は飛ばす
            if len(wifi) == 0:
                print(f'wifiデータがないpath:{file}')
                break

            df = pd.DataFrame(np.array(wifi)) 
            df = df.rename(columns={0:'timestamp', 1:'type_wifi', 2:'ssid', 3:'bssid', 4:'rssi', 5:'freq', 6:'last_seen_timestamp'})
            df['timestamp'] = df['timestamp'].astype(int)
            df['last_seen_timestamp'] = df['last_seen_timestamp'].astype(int)
            
            # last seen timestampとtimestampの差が小さいもののみ使用
            df = df[(df['last_seen_timestamp'] - df['timestamp']).abs() < TIME_DIFF_THR].reset_index(drop=True)

            # last seen timestampで塊を作る
            df = grouping_last_seen_timestamp(df)

            # last seenのgroupごとに処理
            dfs = []
            for last_seen_timestamp, g in df.groupby('last_seen_timestamp'):
                dists = list()
                # waypointのtimestampに最も近いwifiのindexを取得
                for e, k in enumerate(waypoint):
                    dist = abs(int(last_seen_timestamp) - int(k[0]))  # last_seen_timestamp - waypointのtimestamp
                    dists.append(dist)
                nearest_wp_index = np.argmin(dists)
                
                g = g.drop_duplicates(subset='bssid')  # last seenでまとめた時のbssidの被りを削除  TODO rssiは平均を取った方がいい？
    
                tmp = g.loc[:,'bssid':"rssi"]
                feat = tmp.set_index('bssid').reindex(index).replace(np.nan, -999).T
                # last seen timestampに最も近いwaypointのtimestamp,x,y
                feat["timestamp"] = str(waypoint[nearest_wp_index][0])  
                feat["x"] = float(waypoint[nearest_wp_index][2])
                feat["y"] = float(waypoint[nearest_wp_index][3])
                feat["f"] = floor
                feat["floor_str"] = floor_str
                feat["path"] = file.split('/')[-1].split('.')[0] # useful for crossvalidation
                feat["last_seen_timestamp"] = last_seen_timestamp
                feat["time_diff"] = last_seen_timestamp - int(waypoint[nearest_wp_index][0])  # last seen timestampとwaypointのtime_diff
                dfs.append(feat)

            path_df = pd.concat(dfs).reset_index(drop=True)

            # pathのstart地点のtimestampより過去のwifiデータは使用しない(要検討)
            start_timestamp = int(waypoint[0][0])
            path_df = path_df[path_df['last_seen_timestamp'] > int(waypoint[0][0])].reset_index(drop=True)

            bssid = path_df.columns[:-8].values
            rssi = path_df.iloc[:, :-8].astype(int).values
            targets = path_df.iloc[:,-8:].values

            # wifiのデータ数がNUM_WIFI以上のlast seen timestampのみ使用 
            use_idx = ((path_df.iloc[:, :-8] != -999).sum(axis=1) > NUM_WIFI).values
            targets = targets[use_idx]
            path_df = path_df[use_idx]

            sort_rssi = []
            sort_bssid = []
            for i in range(len(path_df)):
                sort_rssi.append(np.sort(rssi[i])[::-1][:NUM_WIFI])
                idx = np.argsort(rssi[i])[::-1]
                sort_bssid.append(bssid[idx][:NUM_WIFI])
            if len(sort_bssid) == 0:
                # path内でlast seen timestampごとにまとめたときにNUM_WIFI個以上のwifiデータをもつlast seen timestampが1つもない場合
                print(f'使用しないpath:{file}')
                break
            else:
                sort_rssi = np.stack(sort_rssi)
                sort_bssid = np.stack(sort_bssid)

            path = path_df['path'].unique()[0]
            timestamp = path_df['last_seen_timestamp'].values
            # # last seen timestamp(塊)に対応するwaypointを算出したい
            wifi_waypoint = get_wifi_waypoint(site=building, floor=floor_str, path=path, wifi_timestamp=timestamp)  # 1行ずつ処理するのは計算コスト的にもったいない
            path_data = np.concatenate((sort_bssid, sort_rssi, targets, wifi_waypoint), axis=1)
            building_dfs.append(pd.DataFrame(path_data))

    building_df = pd.concat(building_dfs)
    columns = [f'bssid_{str(i)}' for i in range(NUM_WIFI)] + [f'rssi_{str(i)}' for i in range(NUM_WIFI)] + \
            ['timestamp', 'x', 'y', 'floor', 'floor_str', 'path', 'last_seen_timestamp', 'time_diff', 'wifi_x', 'wifi_y']
    building_df.columns = columns
    building_df.to_csv(output_dir + f"{building}_wifi{NUM_WIFI}_bin{BIN_TIME}_train.csv", index=False)

In [11]:
from multiprocessing import Pool
with Pool(processes=24) as pool:
    pool.map(create_train_dataset, used_buildings)

5a0546857ecc7737533272665c3c44b80379370013e0fd2b5d2709b303f801723c327472
5d27097f03f801723c320d975d2709bb03f801723c32852c5da138274db8ce0c98bbd3d25d27096c03f801723c31e5e05d27075f03f801723c2e360f5d2709d403f801723c32bd39
5da138314db8ce0c98bbf3a05da1389e4db8ce0c98bd05475d2709e003f801723c32d8965da138364db8ce0c98bc00f15da1382d4db8ce0c98bbe92e5da138754db8ce0c98bca82f5da138764db8ce0c98bcaa46



5da1383b4db8ce0c98bc11ab5d2709c303f801723c3299ee5da138b74db8ce0c98bd4774

5da958dd46f8266d0737457b


5dc8cea7659e181adb076a3f5dbc1d84c1eb61796cf7c010










5d27099f03f801723c32511d

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]




  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

5d2709a003f801723c3251bf


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

使用しないpath:../input/indoor-location-navigation/train/5d27097f03f801723c320d97/B1/5dd0f21c94e4900006126045.txt
使用しないpath:../input/indoor-location-navigation/train/5da138274db8ce0c98bbd3d2/F1/5dc65d2b17ffdd0006f114dd.txt
使用しないpath:../input/indoor-location-navigation/train/5d2709e003f801723c32d896/B1/5dc78fce1cda3700060310f9.txt
使用しないpath:../input/indoor-location-navigation/train/5da1389e4db8ce0c98bd0547/B1/5dc52ae621dceb00061148b1.txt使用しないpath:../input/indoor-location-navigation/train/5da138274db8ce0c98bbd3d2/F2/5dc521041cda37000602f96b.txt

使用しないpath:../input/indoor-location-navigation/train/5da138274db8ce0c98bbd3d2/F3/5dc5072a21dceb00061145ca.txt
使用しないpath:../input/indoor-location-navigation/train/5d2709bb03f801723c32852c/B1/5dca36345e083300061bc838.txt
使用しないpath:../input/indoor-location-navigation/train/5d27097f03f801723c320d97/B2/5dd280f3878f3300066c836a.txt
使用しないpath:../input/indoor-location-navigation/train/5da1389e4db8ce0c98bd0547/B2/5dc518fd171e610006b5dad6.txt
使用しないpath:../input/

ValueError: No objects to concatenate

In [17]:
all_df = []
for building in used_buildings:
    df = pd.read_csv(output_dir + building + f'_wifi{NUM_WIFI}_bin{BIN_TIME}_train.csv')
    df['site_id'] = building
    all_df.append(df)
all_df = pd.concat(all_df).reset_index(drop=True)
all_df['last_seen_timestamp'] = all_df['last_seen_timestamp'].astype(int)
all_df



Unnamed: 0,bssid_0,bssid_1,bssid_2,bssid_3,bssid_4,bssid_5,bssid_6,bssid_7,bssid_8,bssid_9,...,x,y,floor,floor_str,path,last_seen_timestamp,time_diff,wifi_x,wifi_y,site_id
0,db01605eac3f33540038bd9722aba25774871d43,93e20595eeef175d3aa3c3381f6a22ee792d48d9,0b64e537cc3d1818ec46f94f8dc14043a98d0089,dc4c46287575c45f3e32c022d868d047b485ed4c,8c936564ea4b4300576f53136505527eb5972c07,3f564032c7eebc173b38aee35225e323d4389faf,ce28608c3d091ac0d25d84459ebad253edf83e1f,46c934893439700099d03a6892ea934ecb2729d6,3c89886dd08bb4a24384cf8bc0c6423e4804e6e9,b51107fb094a5127ad76ef502ac03632a8f0d67c,...,114.33501,156.84224,-1,B1,5e1580d1f4c3420006d520e4,1578466133383,605,114.109245,156.777160,5a0546857ecc773753327266
1,db01605eac3f33540038bd9722aba25774871d43,965f254a2e8d05bbb40bd2413ff61de3ad6c4151,3f564032c7eebc173b38aee35225e323d4389faf,0b64e537cc3d1818ec46f94f8dc14043a98d0089,ae54f8552a572ddf81302b56c07f63c6321270be,16374260af7d03b10f167358a4f6a70620e131f4,f26678bbbbd078e242638a0d1fb5ba2e61262f4c,5c10b343d767a30515e6015de25751a2883328f8,46c934893439700099d03a6892ea934ecb2729d6,3c89886dd08bb4a24384cf8bc0c6423e4804e6e9,...,114.33501,156.84224,-1,B1,5e1580d1f4c3420006d520e4,1578466135394,2616,113.206186,156.516840,5a0546857ecc773753327266
2,965f254a2e8d05bbb40bd2413ff61de3ad6c4151,db01605eac3f33540038bd9722aba25774871d43,1f37bbb3f42125f665b83584d0376b21ec3eb43c,3f564032c7eebc173b38aee35225e323d4389faf,61c3aaf1a526f808c05952ea3f098e37354a674a,b2b0ddbb5a2aadfc6ab2f388db584b6c280d3f82,8c936564ea4b4300576f53136505527eb5972c07,599fa96d549ed870671d6bc1927aaa8bbaacca12,16374260af7d03b10f167358a4f6a70620e131f4,4c83a7a1e51bfa8a5fa20e854ab3feec057c52c9,...,114.33501,156.84224,-1,B1,5e1580d1f4c3420006d520e4,1578466137513,4735,112.077363,156.191440,5a0546857ecc773753327266
3,db01605eac3f33540038bd9722aba25774871d43,965f254a2e8d05bbb40bd2413ff61de3ad6c4151,0b64e537cc3d1818ec46f94f8dc14043a98d0089,71d97fb3d6f464d4c26fd061732e58398d053b2e,db85cb1371261046f2b711ac8dcceeb06b7ca724,599fa96d549ed870671d6bc1927aaa8bbaacca12,5944b636243a99749c2114bf57072e1505801e26,46c934893439700099d03a6892ea934ecb2729d6,3c89886dd08bb4a24384cf8bc0c6423e4804e6e9,16374260af7d03b10f167358a4f6a70620e131f4,...,114.33501,156.84224,-1,B1,5e1580d1f4c3420006d520e4,1578466139438,6660,111.400069,155.996200,5a0546857ecc773753327266
4,db01605eac3f33540038bd9722aba25774871d43,0b64e537cc3d1818ec46f94f8dc14043a98d0089,965f254a2e8d05bbb40bd2413ff61de3ad6c4151,3f564032c7eebc173b38aee35225e323d4389faf,b742b3fa0287399e647252d184ee8d58b67c05ae,5c10b343d767a30515e6015de25751a2883328f8,1f37bbb3f42125f665b83584d0376b21ec3eb43c,3c89886dd08bb4a24384cf8bc0c6423e4804e6e9,46c934893439700099d03a6892ea934ecb2729d6,16374260af7d03b10f167358a4f6a70620e131f4,...,106.65901,154.62952,-1,B1,5e1580d1f4c3420006d520e4,1578466141519,-8055,110.271245,155.670800,5a0546857ecc773753327266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164334,cce41299a022ada08aebf3d309acb07d5f00b014,d090a2f7f222fadeeb64e4fbdfe1ca8451116b04,4b5dbdb52b131410ea10b59ea451de62280b41d6,5964a27e0cb3344b0a18540e6b3120c433971c38,346b34a42e801c64e043dbaacbe7fef9b8880774,21310f6a93112e4cb928817e3af33ebb1bb62875,ca69ae425b53d4c2fae3d97ec4ec61897a4a6b73,566e0c6e3bcf2b8b3d310d96f111043d17ace817,f4107af4418d57aacb3542343f7b47768debdc75,bd3fc24710537130e97dc2dab4a6bf70b3884a8b,...,191.95721,143.86253,6,F7,5dcd5c94a4dbe7000630b08e,1573733081235,2225,194.181797,141.904314,5dc8cea7659e181adb076a3f
164335,5964a27e0cb3344b0a18540e6b3120c433971c38,cce41299a022ada08aebf3d309acb07d5f00b014,d090a2f7f222fadeeb64e4fbdfe1ca8451116b04,346b34a42e801c64e043dbaacbe7fef9b8880774,ca69ae425b53d4c2fae3d97ec4ec61897a4a6b73,f4107af4418d57aacb3542343f7b47768debdc75,04ac5e0b34d9646b79ad606e53c1b9b95e526dd7,4aa0b7ce4c79dd7e6c76fb06a25510068d66fbd0,180a351ec58c07d60949862c534373c43f548a9a,4d2e5639041b40b0df2ee258aa504bd904133d80,...,200.41064,136.42131,6,F7,5dcd5c94a4dbe7000630b08e,1573733083592,-4545,196.406384,139.946098,5dc8cea7659e181adb076a3f
164336,5964a27e0cb3344b0a18540e6b3120c433971c38,d090a2f7f222fadeeb64e4fbdfe1ca8451116b04,cce41299a022ada08aebf3d309acb07d5f00b014,346b34a42e801c64e043dbaacbe7fef9b8880774,180a351ec58c07d60949862c534373c43f548a9a,4d2e5639041b40b0df2ee258aa504bd904133d80,89395d0ee75307b3beb30aef2f19fc680095d514,04ac5e0b34d9646b79ad606e53c1b9b95e526dd7,566e0c6e3bcf2b8b3d310d96f111043d17ace817,bd3fc24710537130e97dc2dab4a6bf70b3884a8b,...,200.41064,136.42131,6,F7,5dcd5c94a4dbe7000630b08e,1573733085807,-2330,198.186053,138.379526,5dc8cea7659e181adb076a3f
164337,5964a27e0cb3344b0a18540e6b3120c433971c38,346b34a42e801c64e043dbaacbe7fef9b8880774,d090a2f7f222fadeeb64e4fbdfe1ca8451116b04,5953d0b2247e16447d327eb2a8a9c1abe24ff425,180a351ec58c07d60949862c534373c43f548a9a,4d2e5639041b40b0df2ee258aa504bd904133d80,04ac5e0b34d9646b79ad606e53c1b9b95e526dd7,a4a2b67312dcabc5e0ea28a693474b41d24f811c,0ef775dc8eaf20d8012be5ab00323a16694353bf,d36299eab3bb6785995f934cf5f9adfe7bab7b6c,...,200.41064,136.42131,6,F7,5dcd5c94a4dbe7000630b08e,1573733088251,114,200.410640,136.421310,5dc8cea7659e181adb076a3f


In [18]:
import pickle
def to_pickle(filename, obj):
    with open(filename, mode='wb') as f:
        pickle.dump(obj, f)
        
to_pickle(output_dir + f'train_all_wifi{NUM_WIFI}_bin{BIN_TIME}.pkl', all_df)

In [160]:
with open(output_dir + 'test_timestamp_df.pkl', 'rb') as f:
    test_timestamp_df = pickle.load(f)
test_timestamp_df
# test_timestamp_dict = test_timestamp_df.set_index('timestamp')['real_timestamp'].to_dict()

Unnamed: 0,timestamp,site_path_timestamp,real_timestamp,path,no_ibeacon
0,12,5d2709d403f801723c32bd39_52ad8c760ff9978d0949d...,1573789217961,52ad8c760ff9978d0949deed,False
1,4857,5d2709d403f801723c32bd39_52ad8c760ff9978d0949d...,1573789222806,52ad8c760ff9978d0949deed,False
2,13438,5d2709d403f801723c32bd39_52ad8c760ff9978d0949d...,1573789231387,52ad8c760ff9978d0949deed,False
3,21959,5d2709d403f801723c32bd39_52ad8c760ff9978d0949d...,1573789239908,52ad8c760ff9978d0949deed,False
4,28409,5d2709d403f801723c32bd39_52ad8c760ff9978d0949d...,1573789246358,52ad8c760ff9978d0949deed,False
...,...,...,...,...,...
10128,32051,5d2709a003f801723c3251bf_89f1ea2d1e3a876af40da...,1572577164212,89f1ea2d1e3a876af40dadac,False
10129,39259,5d2709a003f801723c3251bf_89f1ea2d1e3a876af40da...,1572577171420,89f1ea2d1e3a876af40dadac,False
10130,47253,5d2709a003f801723c3251bf_89f1ea2d1e3a876af40da...,1572577179414,89f1ea2d1e3a876af40dadac,False
10131,57917,5d2709a003f801723c3251bf_89f1ea2d1e3a876af40da...,1572577190078,89f1ea2d1e3a876af40dadac,False


In [150]:
test_timestamp_df['no_ibeacon'].sum()/len(test_timestamp_df)

0.048652916214349154

In [151]:
# pathとibeaconの有無を表すdict
path_no_ibeacon_dict = test_timestamp_df.groupby('path')['no_ibeacon'].mean().to_dict()

In [163]:
test_timestamp_df.groupby('path')['no_ibeacon'].mean().sum() / len(test_timestamp_df.groupby('path')['no_ibeacon'].mean())

0.054313099041533544

いくつかは補完できていないtimestampも存在する

In [157]:
# 変換
ssubm_df['timestamp'] = ssubm_df['timestamp'].map(test_timestamp_dict)
ssubm_df

Unnamed: 0,site,path,timestamp
0,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9
1,5a0546857ecc773753327266,046cfa46be49fc10834815c6,1578474573154
2,5a0546857ecc773753327266,046cfa46be49fc10834815c6,1578474579463
3,5a0546857ecc773753327266,046cfa46be49fc10834815c6,1578474582900
4,5a0546857ecc773753327266,046cfa46be49fc10834815c6,1578474586465
...,...,...,...
10128,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,1571887924952
10129,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,1573731146476
10130,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,1573731151613
10131,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,1573731157617


In [158]:
# Generate the features for the test set
feature_dict = dict()

for site, df in ssubm_df.groupby('site'):
    index = sorted(all_bssid[df.iloc[0,0]])
    feats = list()

    # path
    for path, g in df.groupby('path'):

        # get all wifi time locations, 
        with open(os.path.join(base_path, 'test/' + g.iloc[0,1] + '.txt'), encoding='utf-8') as f:
            txt = f.readlines()

        # wifiデータを取り出す
        wifi = list()
        for line in txt:
            line = line.strip().split()
            if line[1] == "TYPE_WIFI":
                wifi.append(line)

        # wifiデータが1つもないケースがあるか確認
        if len(wifi) == 0:
            print(f'wifiデータがないpath:{file}') 
        
        wifi_df = pd.DataFrame(np.array(wifi))
        wifi_df = wifi_df.rename(columns={0:'timestamp', 1:'type_wifi', 2:'ssid', 3:'bssid', 4:'rssi', 5:'freq', 6:'last_seen_timestamp'})
        wifi_df['last_seen_timestamp'] = wifi_df['last_seen_timestamp'].astype(int)

        # # last seen timestampで塊を作る
        wifi_df = grouping_last_seen_timestamp(wifi_df)
        wifi_last_seen_timestamp = pd.DataFrame(wifi_df.groupby('last_seen_timestamp').count().index.tolist())
        wifi_timestamp = pd.DataFrame(wifi_df.groupby('timestamp').count().index.tolist())
        
        # 予測対象のtimestamp
        if path_no_ibeacon_dict[path]:  
            # beaconデータでtimestampを補正できていない場合はlast seenの代わりにtimestampを使用
            for timestamp in g['timestamp'].tolist():

                deltas = (wifi_timestamp.astype(int) - int(timestamp)).abs()  # wifiのlast seenと求めるtimestampの時間差
                min_delta_idx = deltas.values.argmin()
                wifi_block_timestamp = wifi_timestamp[min_delta_idx].values[0]
                timestamp = wifi_timestamp[min_delta_idx].values[0]
                print('True')
                break
                
                wifi_block = wifi_df[wifi_df[0] == wifi_block_timestamp].drop_duplicates(subset=3)
                feat = wifi_block.set_index(3)[4].reindex(index).fillna(-999)

                feat['site_path_timestamp'] = g.iloc[0,0] + "_" + g.iloc[0,1] + "_" + timepoint
                feat['time_diff'] = int(timepoint) - int(wifi_block_timestamp)
                feats.append(feat)
        else:
            # beaconデータでtimestampを補正できている場合はlast seen timestampが近いものを取得
            for timestamp in g['timestamp'].tolist():
                    
                deltas = (wifi_last_seen_timestamp.astype(int) - int(timestamp)).abs()  # wifiのlast seenと求めるtimestampの時間差
                min_delta_idx = deltas.values.argmin()
                wifi_block_timestamp = wifi_last_seen_timestamp[min_delta_idx].values[0]
                last_seen_timestamp = wifi_last_seen_timestamp[min_delta_idx].values[0]
                print('False')
                break
                
                wifi_block = wifi_df[wifi_df[0] == wifi_block_timestamp].drop_duplicates(subset=3)
                feat = wifi_block.set_index(3)[4].reindex(index).fillna(-999)

                feat['site_path_timestamp'] = g.iloc[0,0] + "_" + g.iloc[0,1] + "_" + timepoint
                feat['time_diff'] = int(timepoint) - int(wifi_block_timestamp)
                feats.append(feat)
        break
    break
    feature_df = pd.concat(feats, axis=1).T
    feature_df.to_csv(output_dir + gid0 + "_test.csv")

False


In [159]:
g

Unnamed: 0,site,path,timestamp
0,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9
1,5a0546857ecc773753327266,046cfa46be49fc10834815c6,1578474573154
2,5a0546857ecc773753327266,046cfa46be49fc10834815c6,1578474579463
3,5a0546857ecc773753327266,046cfa46be49fc10834815c6,1578474582900
4,5a0546857ecc773753327266,046cfa46be49fc10834815c6,1578474586465
5,5a0546857ecc773753327266,046cfa46be49fc10834815c6,1578474594083
6,5a0546857ecc773753327266,046cfa46be49fc10834815c6,1578474604420
7,5a0546857ecc773753327266,046cfa46be49fc10834815c6,1578474615480
8,5a0546857ecc773753327266,046cfa46be49fc10834815c6,1578474628151
