## wifi feature by kuto

[yukiさんのdataset](https://www.kaggle.com/dataset/951fe0fd675e58937311e936e941b01d517c227ce30192c8477bbe2bddeec602)を参考に、waypointを補正したwifi featureを作成する。  
trainにwifi_x, wifi_yが加わっただけ。  

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import glob
import os

In [None]:
# Locate the per-site wifi feature CSVs; train and test files are told apart
# by their file-name suffix and each list is sorted by path.
base_path = '..'
feature_dir = f"{base_path}/input/wifi-feature-with-timestamp"
train_files, test_files = (
    sorted(glob.glob(os.path.join(feature_dir, suffix_pattern)))
    for suffix_pattern in ('*_train.csv', '*_test.csv')
)

In [None]:
# Load the first site's train feature CSV for a quick look. The first CSV
# column is read as the index and then discarded, and a 'foor_str' column
# (presumably a typo in the upstream dataset) is renamed to 'floor_str'.
train_file = train_files[0]
train = pd.read_csv(train_file, index_col=0).reset_index(drop=True).rename(columns={'foor_str':'floor_str'})
train  # notebook display of the loaded frame

In [None]:
# Notebook display: list the train feature CSV paths found above.
train_files

In [None]:
import sys 
sys.path.append("../")
from multiprocessing import Pool

# io_f, compute_fはコンペのgithubから持ってきたファイル
from src.io_f import read_data_file
import src.compute_f as compute_f

# Taken from the "cost minimization" notebook.
def compute_rel_positions(acce_datas, ahrs_datas):
    """Turn accelerometer and AHRS readings into per-step relative positions.

    Chains the ``compute_f`` helpers in this order: step detection on
    ``acce_datas``, heading estimation on ``ahrs_datas``, stride length per
    step, heading per step, and finally ``compute_f.compute_rel_positions``,
    whose result is returned unchanged.
    """
    # The step indices returned by compute_steps are not needed here.
    step_times, _step_indices, step_extrema = compute_f.compute_steps(acce_datas)
    heading_series = compute_f.compute_headings(ahrs_datas)
    strides = compute_f.compute_stride_length(step_extrema)
    return compute_f.compute_rel_positions(
        strides,
        compute_f.compute_step_heading(step_times, heading_series),
    )


def complement_sensor_waypoint(trajectory_timestamp, trajectory_waypoint, sensor):
    """Convert relative sensor displacements into absolute positions.

    The sensor samples are split into segments, one per waypoint except the
    last. Within a segment the relative displacements are cumulatively summed
    and offset by the waypoint that starts the segment. The final segment is
    open-ended, so samples recorded after the last waypoint are kept.

    Args:
        trajectory_timestamp: 1-D array of waypoint timestamps.
        trajectory_waypoint: array with one waypoint position per row.
        sensor: 2-D array; column 0 is the timestamp, the remaining columns
            are the relative displacement of that sample.

    Returns:
        ``(sensor_timestamp, sensor_waypoint)``, row-aligned: entry ``k`` of
        the timestamps belongs to row ``k`` of the positions. Samples earlier
        than the first waypoint belong to no segment and are dropped from
        both arrays.

    Raises:
        ValueError: if the trajectory contains no waypoint.
    """
    sensor_timestamp = sensor[:, 0]
    sensor_rel_waypoint = sensor[:, 1:]

    n_waypoints = len(trajectory_timestamp)
    if n_waypoints == 0:
        raise ValueError("trajectory_timestamp must contain at least one waypoint")

    # The segment starting at the second-to-last waypoint is open-ended because
    # sensor data can exist after the last waypoint. With a single waypoint,
    # that waypoint starts the only (open-ended) segment.
    last_segment = max(n_waypoints - 2, 0)

    kept_timestamp = []
    sensor_waypoint = []
    for i in range(last_segment + 1):
        # Select the sensor samples that fall between this waypoint and the next.
        target_idx = sensor_timestamp >= trajectory_timestamp[i]
        if i != last_segment:
            target_idx = target_idx & (sensor_timestamp < trajectory_timestamp[i + 1])
        # Keep the timestamps of exactly the samples that get a position, so
        # both outputs stay aligned even when early samples are skipped.
        kept_timestamp.append(sensor_timestamp[target_idx])
        # Cumulative displacement inside the segment plus its starting waypoint.
        sensor_waypoint.append(
            trajectory_waypoint[i] + np.cumsum(sensor_rel_waypoint[target_idx], axis=0)
        )

    return np.concatenate(kept_timestamp), np.concatenate(sensor_waypoint)


# For each wifi timestamp, find the closest entry in sensor_timestamp and use
# the position of that sensor sample as the wifi position.
def complement_wifi_waypoint(wifi_timestamp, sensor_timestamp, sensor_waypoint):
    """Assign a sensor-derived (x, y) position to every wifi timestamp.

    Args:
        wifi_timestamp: iterable of wifi scan timestamps.
        sensor_timestamp: array of sensor timestamps.
        sensor_waypoint: 2-D array whose columns 0 and 1 are the x and y
            position for the sensor sample at the same index.

    Returns:
        Array with one ``[x, y]`` row per wifi timestamp, taken from the
        sensor sample whose timestamp is nearest (the first one on ties).
    """
    xs, ys = sensor_waypoint[:, 0], sensor_waypoint[:, 1]
    # Index of the nearest sensor sample for each wifi timestamp.
    nearest = [np.abs(sensor_timestamp - t).argmin() for t in wifi_timestamp]
    return np.stack(
        [[xs[k] for k in nearest], [ys[k] for k in nearest]],
        axis=1,
    )


def get_wifi_waypoint(site, floor, path, timestamp):
    """Return a sensor-corrected position for each wifi scan of one path.

    Reads the raw path file for ``site``/``floor``/``path``, derives relative
    step positions from its accelerometer and AHRS data, anchors them to the
    recorded waypoints, and looks up a position for every distinct wifi
    timestamp in the file.

    ``timestamp`` is accepted but not used by this function.

    Returns:
        The array produced by ``complement_wifi_waypoint``: one row per
        distinct wifi timestamp.
    """
    record = read_data_file(
        f'../input/indoor-location-navigation/train/{site}/{floor}/{path}.txt'
    )
    rel_steps = compute_rel_positions(record.acce, record.ahrs)

    # Column 0 of the waypoint array is the timestamp, the rest is the position.
    waypoint_times = record.waypoint[:, 0]
    waypoint_xy = record.waypoint[:, 1:]
    # Column 0 of the wifi array is the scan timestamp; keep each value once.
    wifi_times = np.unique(record.wifi[:, 0]).astype(int)

    step_times, step_xy = complement_sensor_waypoint(waypoint_times, waypoint_xy, rel_steps)
    return complement_wifi_waypoint(wifi_times, step_times, step_xy)

In [None]:

def make_dataset_for_train(train_file):
    """Build the per-site train dataset with sensor-corrected wifi positions.

    Reads ``train_file``, and for every ``path`` group keeps, per row, the 100
    strongest RSSI values and the column labels (BSSIDs) they came from. The
    last 7 columns are carried over unchanged, and ``wifi_x``/``wifi_y`` from
    ``get_wifi_waypoint`` are appended. The result is written to
    ``../input/kuto_wifi_dataset_v1/<file name>``.

    Args:
        train_file: path of a ``<site>_..._train.csv`` feature file; the site
            id is the part of the file name before the first underscore.

    NOTE(review): the concatenation assumes ``get_wifi_waypoint`` returns
    exactly one row per row of the path group, and that every row has at least
    100 BSSID columns -- confirm against the feature dataset.
    """
    train = pd.read_csv(train_file, index_col=0).reset_index(drop=True).rename(columns={'foor_str':'floor_str'})
    file_name = train_file.split('/')[-1]
    num_of_lines = train.shape[0]
    print(f'{file_name} : {num_of_lines}')
    site = file_name.split('_')[0]

    # Trailing columns: timestamp, x, y, floor, floor_str, path, time_diff.
    n_meta = 7
    top_k = 100

    data_list = []
    for path, df in tqdm(train.groupby('path')):

        bssid = df.columns[:-n_meta].values
        rssi = df.iloc[:, :-n_meta].values
        targets = df.iloc[:, -n_meta:].values

        # For every row keep the strongest RSSI values and their BSSIDs; one
        # argsort per row keeps both outputs in the same order.
        sort_rssi = []
        sort_bssid = []
        for row in rssi:
            order = np.argsort(row)[::-1][:top_k]
            sort_rssi.append(row[order])
            sort_bssid.append(bssid[order])

        sort_rssi = np.stack(sort_rssi)
        sort_bssid = np.stack(sort_bssid)

        # The raw path file lives under <site>/<floor>/<path>.txt.
        # Presumably floor_str is that directory name and is constant within
        # a path -- TODO confirm.
        floor = df['floor_str'].iloc[0]
        # Timestamp of the last row of the path (get_wifi_waypoint ignores it).
        timestamp = targets[-1][0]

        # Positions are computed once per path; doing it per row would waste work.
        wifi_waypoint = get_wifi_waypoint(site, floor, path, timestamp)
        path_data = np.concatenate((sort_bssid, sort_rssi, targets, wifi_waypoint), axis=1)
        data_list.append(pd.DataFrame(path_data))

    data_df = pd.concat(data_list)
    columns = [f'bssid_{i}' for i in range(top_k)] + [f'rssi_{i}' for i in range(top_k)] + \
            ['timestamp', 'x', 'y', 'floor', 'floor_str', 'path', 'time_diff', 'wifi_x', 'wifi_y']
    data_df.columns = columns
    data_df.to_csv(f'../input/kuto_wifi_dataset_v1/{file_name}', index=False)


In [None]:
# Build the train dataset for every feature CSV in parallel, using a pool of
# 24 worker processes (one task per file).
with Pool(processes=24) as pool:
    pool.map(make_dataset_for_train, train_files)

In [None]:
# Only this site failed, so it is run again here (the waypoint line breaks
# are misaligned in the following files):
# indoor-location-navigation/train/5c3c44b80379370013e0fd2b/F1/5d077e040e86b60008036270.txt
# indoor-location-navigation/train/5c3c44b80379370013e0fd2b/F2/5d0795110e86b600080363bc.txt
make_dataset_for_train('../input/wifi-feature-with-timestamp/5c3c44b80379370013e0fd2b_timediff_1000_train.csv')

In [None]:
def make_dataset_for_test(test_file):
    """Build the per-site test dataset of the 100 strongest wifi signals.

    Reads ``test_file``. For every row, the columns between the first and the
    last two are cast to int, and the 100 largest values are kept together
    with their column labels (BSSIDs). The last two columns are carried over
    as ``site_path_timestamp`` and ``time_diff``. The result is written to
    ``../input/kuto_wifi_dataset_v1/<file name>``.

    Rows are collected in a list and turned into a DataFrame once, instead of
    growing a DataFrame with ``DataFrame.append`` (removed in pandas 2.0 and
    quadratic in the number of rows). An input without rows now produces a
    header-only CSV.

    Args:
        test_file: path of a ``<site>_..._test.csv`` feature file.
    """
    test = pd.read_csv(test_file)

    file_name = test_file.split('/')[-1]

    num_of_lines = test.shape[0]
    print(f'{file_name} : {num_of_lines}')

    top_k = 100
    columns = [f'bssid_{i}' for i in range(top_k)] + [f'rssi_{i}' for i in range(top_k)] + ['site_path_timestamp', 'time_diff']

    rows = []
    for i in tqdm(range(num_of_lines)):
        # Column 0 and the last two columns are not RSSI values.
        top = test.iloc[i, 1:-2].astype(int).sort_values(ascending=False).head(top_k)
        target = test.iloc[i, -2:]
        # Row layout: BSSID labels, then their RSSI values, then the two targets.
        rows.append(top.index.astype(str).tolist() + top.tolist() + target.tolist())

    data = pd.DataFrame(rows, columns=columns)
    data.to_csv(f'../input/kuto_wifi_dataset_v1/{file_name}', index=False)

In [None]:
# Build the test dataset for every feature CSV in parallel, using a pool of
# 24 worker processes (one task per file).
with Pool(processes=24) as pool:
    pool.map(make_dataset_for_test, test_files)

In [None]:
import pickle
# Load every generated train CSV and tag its rows with the site id, which is
# the part of the file name before the first underscore.
dfs = [
    pd.read_csv(csv_path).assign(site_id=csv_path.split('/')[-1].split('_')[0])
    for csv_path in glob.glob('../input/kuto_wifi_dataset_v1/*train.csv')
]

In [None]:
def to_pickle(filename, obj):
    """Serialize ``obj`` with pickle into the file at ``filename``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(obj, sink)
        
# Concatenate the per-site train frames into one frame with a fresh index and
# pickle it.
to_pickle('../input/kuto_wifi_dataset_v1/train_all.pkl', pd.concat(dfs).reset_index(drop=True))

In [None]:
# Load every generated test CSV and tag its rows with the site id, which is
# the part of the file name before the first underscore.
dfs = [
    pd.read_csv(csv_path).assign(site_id=csv_path.split('/')[-1].split('_')[0])
    for csv_path in glob.glob('../input/kuto_wifi_dataset_v1/*test.csv')
]

In [None]:
# Concatenate the per-site test frames into one frame with a fresh index and
# pickle it.
to_pickle('../input/kuto_wifi_dataset_v1/test_all.pkl', pd.concat(dfs).reset_index(drop=True))