## create dataset 

v3:delta学習用のデータセットを作成する 

In [1]:
import pandas as pd
import numpy as np
import os
import glob

from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Base directory of the input data. Every path below is built by plain string
# concatenation onto this value, so the trailing slash is required.
root_dir = '../input/'

In [3]:
import pickle
def to_pickle(filename, obj):
    """Pickle ``obj`` into the file at ``filename``, replacing any existing file."""
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(obj)


def from_pickle(filename):
    """Return the object stored in the pickle file at ``filename``.

    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.
    """
    with open(filename, 'rb') as source:
        return pickle.Unpickler(source).load()

## 2段階目(delta用のデータセット作成)

In [4]:
# Baseline location tables for the two splits. Further down they are grouped
# by (collectionName, phoneName) and their millisSinceGpsEpoch values are used
# as the edges of the sensor windows.
train_df, test_df = (
    pd.read_csv(root_dir + csv_name)
    for csv_name in ('baseline_locations_train.csv', 'baseline_locations_test.csv')
)

In [5]:
# Sensor tables of the imu_dataset_v0 dataset: one pickle per sensor
# (acc / gyro / mag) and per split, loaded in that order.
input_train_dir = root_dir + 'imu_dataset_v0/train/'
train_acc_df, train_gyro_df, train_mag_df = (
    from_pickle(input_train_dir + pkl_name)
    for pkl_name in ('acc.pkl', 'gyro.pkl', 'mag.pkl')
)

input_test_dir = root_dir + 'imu_dataset_v0/test/'
test_acc_df, test_gyro_df, test_mag_df = (
    from_pickle(input_test_dir + pkl_name)
    for pkl_name in ('acc.pkl', 'gyro.pkl', 'mag.pkl')
)

In [6]:
def normalized(df, col_list):
    """Standardize the given columns of ``df`` in place and return ``df``.

    Each column named in ``col_list`` is replaced by
    ``(column - column.mean()) / column.std()``, using that column's own
    statistics. Columns not listed are left untouched. A column whose
    standard deviation is zero or undefined ends up non-finite, because
    nothing guards the division.
    """
    for name in col_list:
        values = df[name]
        center = values.mean()
        spread = values.std()
        df[name] = (values - center) / spread
    return df

In [7]:
# Only the three uncalibrated acceleration axes are standardized; an earlier
# variant of this list also contained BiasXMps2 / BiasYMps2 / BiasZMps2.
# NOTE(review): each split is scaled with its own mean/std, so train and test
# do not share a scale -- confirm this is intended.
acc_col_list = ['UncalAccelXMps2','UncalAccelYMps2','UncalAccelZMps2']
train_acc_df, test_acc_df = (
    normalized(frame, acc_col_list) for frame in (train_acc_df, test_acc_df)
)

In [8]:
# Standardize the three uncalibrated gyroscope axes of both splits.
# NOTE(review): each split is scaled with its own mean/std -- confirm this is
# intended.
gyro_col_list = ['UncalGyroXRadPerSec','UncalGyroYRadPerSec','UncalGyroZRadPerSec']
train_gyro_df, test_gyro_df = (
    normalized(frame, gyro_col_list) for frame in (train_gyro_df, test_gyro_df)
)

In [9]:
# Standardize the three uncalibrated magnetometer axes of both splits.
# NOTE(review): each split is scaled with its own mean/std -- confirm this is
# intended.
mag_col_list = ['UncalMagXMicroT','UncalMagYMicroT','UncalMagZMicroT']
train_mag_df, test_mag_df = (
    normalized(frame, mag_col_list) for frame in (train_mag_df, test_mag_df)
)

In [10]:
def utc2gps(acc_df, mag_df, gyro_df):
    """Add a ``millisSinceGpsEpoch`` column to each of the three frames, in place.

    The new column is ``utcTimeMillis`` minus 315964800000 ms, which is the
    Unix timestamp of 1980-01-06 00:00:00 UTC in milliseconds.

    Returns the same three frame objects as ``(acc_df, mag_df, gyro_df)``.
    """
    # TODO: leap seconds are not applied. The original note mentions an extra
    # offset ("-27") that may depend on the satellite system -- to be confirmed.
    gps_epoch_offset_ms = 315964800000
    for frame in (acc_df, mag_df, gyro_df):
        frame['millisSinceGpsEpoch'] = frame['utcTimeMillis'] - gps_epoch_offset_ms
    return acc_df, mag_df, gyro_df

In [11]:
# utc2gps takes and returns the frames in (acc, mag, gyro) order, which differs
# from the (acc, gyro, mag) order used when loading; keywords make the pairing
# explicit.
train_acc_df, train_mag_df, train_gyro_df = utc2gps(
    acc_df=train_acc_df, mag_df=train_mag_df, gyro_df=train_gyro_df
)
test_acc_df, test_mag_df, test_gyro_df = utc2gps(
    acc_df=test_acc_df, mag_df=test_mag_df, gyro_df=test_gyro_df
)

In [12]:
# Destination directories for the per-trace pickles of the v2 dataset; created
# up front so the writer functions can simply append a file name.
output_train_dir = root_dir + 'imu_dataset_v2/train/'
output_test_dir = root_dir + 'imu_dataset_v2/test/'

os.makedirs(output_train_dir, exist_ok=True)
os.makedirs(output_test_dir, exist_ok=True)

In [13]:
# Number of time bins that the sensor samples of one inter-epoch window are
# averaged into (the second dimension of every stored array).
NUM_DATA = 100

In [14]:
def create_train_imu_dataset(args):
    """Build and pickle the binned IMU windows of one training trace.

    Parameters
    ----------
    args : tuple
        ``((collection_name, phone_name), target_df)``, i.e. one item of
        ``train_df.groupby(["collectionName", "phoneName"])``. The
        ``millisSinceGpsEpoch`` values of ``target_df`` are taken in row
        order as window edges (assumed ascending -- TODO confirm).

    Returns
    -------
    int
        Always 0. The actual result is a dict pickled to
        ``output_train_dir + f'{collection_name}_{phone_name}.pkl'``.

    Notes
    -----
    The dict maps ``str(epoch)`` to a float array of shape ``(9, NUM_DATA)``
    whose rows are the 3 accelerometer, 3 magnetometer and 3 gyroscope
    columns. The entry for an epoch holds the samples with
    ``previous_epoch <= t < epoch``, averaged into ``NUM_DATA`` time bins;
    bins without samples, and sensors without any sample in the window, are
    zero. The first epoch has no preceding window and is stored as all zeros
    (also when the trace consists of that single epoch).

    Reads the module-level ``train_acc_df``, ``train_mag_df``,
    ``train_gyro_df``, ``NUM_DATA`` and ``output_train_dir``.
    """
    (collection_name, phone_name), target_df = args

    # (sensor table, value columns), listed in the row order of the output
    # array: accelerometer, magnetometer, gyroscope.
    sensor_specs = (
        (train_acc_df, ['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2']),
        (train_mag_df, ['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT']),
        (train_gyro_df, ['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec']),
    )
    n_rows = sum(len(value_cols) for _, value_cols in sensor_specs)

    # Restrict each sensor table to this trace once, keeping only the columns
    # that are used below.
    sensors = []
    for sensor_df, value_cols in sensor_specs:
        in_trace = (sensor_df["collectionName"] == collection_name) & (sensor_df["phoneName"] == phone_name)
        trace_df = sensor_df.loc[in_trace, ['millisSinceGpsEpoch'] + value_cols]
        sensors.append((trace_df.sort_values('millisSinceGpsEpoch'), value_cols))

    imu_dict = {}
    epoch_list = target_df["millisSinceGpsEpoch"].to_list()
    if epoch_list:
        # No samples precede the first epoch, so it gets an all-zero window.
        imu_dict[str(epoch_list[0])] = np.zeros((n_rows, NUM_DATA))

    for epoch_min, epoch_max in zip(epoch_list, epoch_list[1:]):
        blocks = []
        for trace_df, value_cols in sensors:
            times = trace_df['millisSinceGpsEpoch']
            window = trace_df[(epoch_min <= times) & (times < epoch_max)]
            if window.empty:
                blocks.append(np.zeros((len(value_cols), NUM_DATA)))
                continue

            # NOTE(review): with an integer bin count pd.cut spans the min..max
            # of the samples actually present, not [epoch_min, epoch_max), so
            # bins are not aligned to the window or across sensors -- confirm
            # this is intended.
            bins = pd.cut(window['millisSinceGpsEpoch'].to_numpy(), NUM_DATA)
            # observed=False keeps empty bins (as NaN rows) so that exactly
            # NUM_DATA rows come back; only the value columns are averaged,
            # the string columns of the sensor table cannot be.
            binned = window.groupby(bins, observed=False)[value_cols].mean()
            blocks.append(binned.to_numpy().T)

        target_np = np.concatenate(blocks, axis=0)
        assert target_np.shape == (n_rows, NUM_DATA)

        # Empty bins come back as NaN; store them as 0.
        target_np[np.isnan(target_np)] = 0
        imu_dict[str(epoch_max)] = target_np

    filename = output_train_dir + f'{collection_name}_{phone_name}.pkl'
    to_pickle(filename, imu_dict)
    return 0


In [15]:
# train
import multiprocessing

# Group once and reuse: iterating the groupby yields
# ((collectionName, phoneName), sub_frame) items, which is the argument shape
# create_train_imu_dataset unpacks. ngroups gives the number of items without
# aggregating any column.
gr = train_df.groupby(["collectionName", "phoneName"])
len_gr = gr.ngroups
processes = multiprocessing.cpu_count()
# NOTE(review): the workers read the train_*_df tables as module globals, so
# this presumably relies on a fork start method -- confirm on platforms that
# spawn worker processes.
with multiprocessing.Pool(processes=processes) as pool:
    zero = pool.imap_unordered(create_train_imu_dataset, gr)
    # list() drains the iterator inside the with-block so all work finishes
    # before the pool is torn down; tqdm shows the progress.
    zero = list(tqdm(zero, total=len_gr))

  0%|          | 0/73 [00:00<?, ?it/s]

In [16]:
def create_test_imu_dataset(args):
    """Build and pickle the binned IMU windows of one test trace.

    Parameters
    ----------
    args : tuple
        ``((collection_name, phone_name), target_df)``, i.e. one item of
        ``test_df.groupby(["collectionName", "phoneName"])``. The
        ``millisSinceGpsEpoch`` values of ``target_df`` are taken in row
        order as window edges (assumed ascending -- TODO confirm).

    Returns
    -------
    int
        Always 0. The actual result is a dict pickled to
        ``output_test_dir + f'{collection_name}_{phone_name}.pkl'``.

    Notes
    -----
    The dict maps ``str(epoch)`` to a float array of shape ``(9, NUM_DATA)``
    whose rows are the 3 accelerometer, 3 magnetometer and 3 gyroscope
    columns. The entry for an epoch holds the samples with
    ``previous_epoch <= t < epoch``, averaged into ``NUM_DATA`` time bins;
    bins without samples, and sensors without any sample in the window, are
    zero. The first epoch has no preceding window and is stored as all zeros
    (also when the trace consists of that single epoch).

    Reads the module-level ``test_acc_df``, ``test_mag_df``,
    ``test_gyro_df``, ``NUM_DATA`` and ``output_test_dir``.
    """
    (collection_name, phone_name), target_df = args

    # (sensor table, value columns), listed in the row order of the output
    # array: accelerometer, magnetometer, gyroscope.
    sensor_specs = (
        (test_acc_df, ['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2']),
        (test_mag_df, ['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT']),
        (test_gyro_df, ['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec']),
    )
    n_rows = sum(len(value_cols) for _, value_cols in sensor_specs)

    # Restrict each sensor table to this trace once, keeping only the columns
    # that are used below.
    sensors = []
    for sensor_df, value_cols in sensor_specs:
        in_trace = (sensor_df["collectionName"] == collection_name) & (sensor_df["phoneName"] == phone_name)
        trace_df = sensor_df.loc[in_trace, ['millisSinceGpsEpoch'] + value_cols]
        sensors.append((trace_df.sort_values('millisSinceGpsEpoch'), value_cols))

    imu_dict = {}
    epoch_list = target_df["millisSinceGpsEpoch"].to_list()
    if epoch_list:
        # No samples precede the first epoch, so it gets an all-zero window.
        imu_dict[str(epoch_list[0])] = np.zeros((n_rows, NUM_DATA))

    for epoch_min, epoch_max in zip(epoch_list, epoch_list[1:]):
        blocks = []
        for trace_df, value_cols in sensors:
            times = trace_df['millisSinceGpsEpoch']
            window = trace_df[(epoch_min <= times) & (times < epoch_max)]
            if window.empty:
                blocks.append(np.zeros((len(value_cols), NUM_DATA)))
                continue

            # NOTE(review): with an integer bin count pd.cut spans the min..max
            # of the samples actually present, not [epoch_min, epoch_max), so
            # bins are not aligned to the window or across sensors -- confirm
            # this is intended.
            bins = pd.cut(window['millisSinceGpsEpoch'].to_numpy(), NUM_DATA)
            # observed=False keeps empty bins (as NaN rows) so that exactly
            # NUM_DATA rows come back; only the value columns are averaged,
            # the string columns of the sensor table cannot be.
            binned = window.groupby(bins, observed=False)[value_cols].mean()
            blocks.append(binned.to_numpy().T)

        target_np = np.concatenate(blocks, axis=0)
        assert target_np.shape == (n_rows, NUM_DATA)

        # Empty bins come back as NaN; store them as 0.
        target_np[np.isnan(target_np)] = 0
        imu_dict[str(epoch_max)] = target_np

    filename = output_test_dir + f'{collection_name}_{phone_name}.pkl'
    to_pickle(filename, imu_dict)
    return 0


In [17]:
# test
import time

# The test traces are processed one at a time (no worker pool), printing the
# trace name and the seconds spent on it.
for (collection_name, phone_name), df in tqdm(test_df.groupby(["collectionName", "phoneName"])):
    print(collection_name, phone_name)
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by adjustments of the system clock.
    start = time.perf_counter()
    zero = create_test_imu_dataset(((collection_name, phone_name), df))
    elapsed_time = time.perf_counter() - start
    print(elapsed_time)

  0%|          | 0/48 [00:00<?, ?it/s]

2020-05-15-US-MTV-1 Pixel4
59.47429585456848
2020-05-15-US-MTV-1 Pixel4XL
59.54811334609985
2020-05-28-US-MTV-1 Pixel4
33.25453495979309
2020-05-28-US-MTV-1 Pixel4XL
35.652923583984375
2020-05-28-US-MTV-2 Pixel4
34.13585901260376
2020-05-28-US-MTV-2 Pixel4XL
34.24012589454651
2020-05-28-US-MTV-2 Pixel4XLModded
22.36473536491394
2020-06-04-US-MTV-2 Pixel4
27.191681623458862
2020-06-04-US-MTV-2 Pixel4XL
27.070902109146118
2020-06-04-US-MTV-2 Pixel4XLModded
27.178017377853394
2020-06-10-US-MTV-1 Pixel4
26.720977544784546
2020-06-10-US-MTV-1 Pixel4XL
26.159168004989624
2020-06-10-US-MTV-1 Pixel4XLModded
26.803060054779053
2020-06-10-US-MTV-2 Pixel4
28.661839723587036
2020-06-10-US-MTV-2 Pixel4XL
28.666541576385498
2020-06-10-US-MTV-2 Pixel4XLModded
29.214104175567627
2020-08-03-US-MTV-2 Mi8
4.856034994125366
2020-08-03-US-MTV-2 Pixel4
4.817697286605835
2020-08-03-US-MTV-2 Pixel4XL
4.8301050662994385
2020-08-13-US-MTV-1 Mi8
5.678229331970215
2020-08-13-US-MTV-1 Pixel4
5.5143961906433105
202

In [18]:
# # test
# import multiprocessing
# processes = multiprocessing.cpu_count()
# len_gr = len(test_df.groupby(["collectionName", "phoneName"]).mean())
# with multiprocessing.Pool(processes=processes) as pool:
#     gr = test_df.groupby(["collectionName", "phoneName"])
#     zero = pool.imap_unordered(create_test_imu_dataset, gr)
#     zero = list(tqdm(zero, total=len_gr))

### IMUデータを使えなさそうな端末(train)
- Mi8すべて
- 2020-08-06-US-MTV-2-Pixel4
- 2020-08-03-US-MTV-1-Pixel4
- 2020-08-06-US-MTV-2-Pixel4XL

以下のcollectionは、すべての端末でIMUが使えなさそう:
- 2020-08-03-US-MTV-1
- 2020-08-06-US-MTV-2
- 2020-07-17-US-MTV-1
- 2020-07-17-US-MTV-2

ひとまずこれらは放置する。