## create dataset 

v3:delta学習用のデータセットを作成する 

In [1]:
import pandas as pd
import numpy as np
import os
import glob

from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
root_dir = '../input/'

In [3]:
# def mean_data_by_epoch(df,delta=250):
#     if len(df)==0:
#         pass
#     else:
#         df = df.astype(float)
#         bins = round((df['utcTimeMillis'].max() - df['utcTimeMillis'].min())/delta)
#         df['bin'] = pd.cut(df['utcTimeMillis'], bins)
#         df = df.groupby('bin').mean().reset_index(drop=True)
#     return df

# def add_name_columns(df, collection_name, phone_name):
#     df['collectionName'] = collection_name
#     df['phoneName'] = phone_name
#     return df

# def postprocess(df, collection_name, phone_name):
#     df = mean_data_by_epoch(df)
#     df = add_name_columns(df, collection_name, phone_name)
#     return df

In [4]:
import pickle
def to_pickle(filename, obj):
    with open(filename, mode='wb') as f:
        pickle.dump(obj, f)
        
def from_pickle(filename):
    with open(filename, mode='rb') as f:
        obj = pickle.load(f)
    return obj

## 1段階目(ホストのコードでtxt2dfする)

In [5]:
output_train_dir = root_dir + 'imu_dataset_v0/train/'
os.makedirs(output_train_dir, exist_ok=True)

In [6]:
output_test_dir = root_dir + 'imu_dataset_v0/test/'
os.makedirs(output_test_dir, exist_ok=True)

In [12]:
def gnss_log_to_dataframes(path):
    # print('\nLoading ' + path, flush=True)
    gnss_section_names = {'Raw','UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
    with open(path) as f_open:
        datalines = f_open.readlines()

    datas = {k: [] for k in gnss_section_names}
    gnss_map = {k: [] for k in gnss_section_names}
    for i, dataline in enumerate(datalines):
        is_header = dataline.startswith('#')
        dataline = dataline.strip('#').strip().split(',')
        # skip over notes, version numbers, etc
        if is_header and dataline[0] in gnss_section_names:
            gnss_map[dataline[0]] = dataline[1:]
        elif not is_header:
            datas[dataline[0]].append(dataline[1:])
    
    results = dict()
    for k, v in datas.items():
        results[k] = pd.DataFrame(v, columns=gnss_map[k])
    # pandas doesn't properly infer types from these lists by default
    for k, df in results.items():
        for col in df.columns:
            if col == 'CodeType':
                continue
            results[k][col] = pd.to_numeric(results[k][col])
    print(results)
    raise
    
    return results

In [13]:
def create_imu_base_dataset(collection_path):
    phase = collection_path.split('/')[2]
    for phone_path in glob.glob(os.path.join(collection_path, '*')):
        collection_name = phone_path.split('/')[3]
        phone_name = phone_path.split('/')[4]
        # get GnssLog file 
        for file_path in glob.glob(os.path.join(phone_path, f"{phone_name}_GnssLog.txt")):
            if phase == 'train':
                filename = output_train_dir + f'{collection_name}_{phone_name}.pkl'
            else:
                filename = output_test_dir + f'{collection_name}_{phone_name}.pkl'
            result_dict = gnss_log_to_dataframes(file_path)
            to_pickle(filename, result_dict)
    return 0

In [14]:
# train
import multiprocessing
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    path_list = glob.glob(os.path.join(root_dir, 'train/*'))
    zero = pool.imap_unordered(create_imu_base_dataset, path_list)
    zero = list(tqdm(zero, total=len(path_list)))

  0%|          | 0/29 [00:00<?, ?it/s]

{'UncalAccel': Empty DataFrame
Columns: [utcTimeMillis, elapsedRealtimeNanos, UncalAccelXMps2, UncalAccelYMps2, UncalAccelZMps2, BiasXMps2, BiasYMps2, BiasZMps2]
Index: [], 'Status': Empty DataFrame
Columns: [UnixTimeMillis, SignalCount, SignalIndex, ConstellationType, Svid, CarrierFrequencyHz, Cn0DbHz, AzimuthDegrees, ElevationDegrees, UsedInFix, HasAlmanacData, HasEphemerisData]
Index: [], 'Raw':        utcTimeMillis      TimeNanos  LeapSecond  TimeUncertaintyNanos  \
0      1596754334000  3277067000000         NaN                   NaN   
1      1596754334000  3277067000000         NaN                   NaN   
2      1596754334000  3277067000000         NaN                   NaN   
3      1596754334000  3277067000000         NaN                   NaN   
4      1596754334000  3277067000000         NaN                   NaN   
...              ...            ...         ...                   ...   
67082  1596756077000  5020067000000         NaN                   NaN   
67083  1596756

RuntimeError: No active exception to reraise

In [10]:
# test
import multiprocessing
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    path_list = glob.glob(os.path.join(root_dir, 'test/*'))
    zero = pool.imap_unordered(create_imu_base_dataset, path_list)
    zero = list(tqdm(zero, total=len(path_list)))

  0%|          | 0/19 [00:00<?, ?it/s]

## 2段階目(delta用のデータセット作成)

In [11]:
def get_ground_truth(args):
    (collectionName, phoneName), df = args
    
    path = root_dir + f"train/{collectionName}/{phoneName}/ground_truth.csv"
    target_df = pd.read_csv(path)
    output_df = pd.DataFrame()
    # merge derived and target by 'millisSinceGpsEpoch'
    for epoch, epoch_df in df.groupby('millisSinceGpsEpoch'):
        idx = (target_df['millisSinceGpsEpoch'] - epoch).abs().argmin()
        epoch_diff = epoch - target_df.loc[idx, 'millisSinceGpsEpoch']
        epoch_df['epoch_diff'] = epoch_diff
        epoch_df['target_latDeg'] = target_df.loc[idx, 'latDeg']
        epoch_df['target_lngDeg'] = target_df.loc[idx, 'lngDeg']
        output_df = pd.concat([output_df, epoch_df]).reset_index(drop=True)    
    return output_df

In [12]:
train_df = pd.read_csv(root_dir + 'baseline_locations_train.csv')
test_df = pd.read_csv(root_dir + 'baseline_locations_test.csv')

In [13]:
# import multiprocessing

# processes = multiprocessing.cpu_count()
# with multiprocessing.Pool(processes=processes) as pool:
#     gr = train_df.groupby(['collectionName','phoneName'])
#     dfs = pool.imap_unordered(get_ground_truth, gr)
#     dfs = tqdm(dfs, total=len(gr))
#     dfs = list(dfs)
# train_df = pd.concat(dfs).sort_values(['collectionName', 'phoneName', 'millisSinceGpsEpoch']).reset_index(drop=True)
# truth_df = train_df[['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'target_latDeg', 'target_lngDeg']]
# truth_df = truth_df.rename(columns={'target_latDeg':'latDeg','target_lngDeg':'lngDeg'})
# truth_df

In [14]:
def create_imu_dataset(sample_path):
    sample_dict = from_pickle(sample_path)
    
    phase = sample_path.split('/')[3]
    collection_name = sample_path.split('/')[4].split('_')[0]
    phone_name = sample_path.split('/')[4].split('_')[1].split('.')[0]
    
    acc_df = sample_dict['UncalAccel']
    mag_df = sample_dict['UncalMag']
    gyro_df = sample_dict['UncalGyro']

    if 'BiasXMps2' not in acc_df.columns:
        acc_df[['BiasXMps2','BiasYMps2','BiasZMps2']] = np.nan
    if 'BiasXMicroT' not in mag_df.columns:
        mag_df[['BiasXMicroT','BiasYMicroT','BiasZMicroT']] = np.nan
    if 'DriftXRadPerSec' not in gyro_df.columns:
        gyro_df[['DriftXRadPerSec','DriftYRadPerSec','DriftZRadPerSec']] = np.nan
    
    acc_df['millisSinceGpsEpoch'] = acc_df['utcTimeMillis'] - 315964800000  # -27 TODO 衛星によってうるう秒を考慮
    mag_df['millisSinceGpsEpoch'] = mag_df['utcTimeMillis'] - 315964800000  # -27 TODO 衛星によってうるう秒を考慮
    gyro_df['millisSinceGpsEpoch'] = gyro_df['utcTimeMillis'] - 315964800000  # -27 TODO 衛星によってうるう秒を考慮

    imu_dict = {}
    
    if phase == 'train':
        tmp_df = train_df[(train_df['collectionName']==collection_name)&(train_df['phoneName']==phone_name)].reset_index(drop=True)
    else:
        tmp_df = test_df[(test_df['collectionName']==collection_name)&(test_df['phoneName']==phone_name)].reset_index(drop=True)
    epoch_list = tmp_df["millisSinceGpsEpoch"].to_list()
    
    acc_nan_count = 0
    mag_nan_count = 0
    gyro_nan_count = 0
    
    NUM_DATA = 100
    # for i in tqdm(range(len(epoch_list)-1)):
    for i in range(len(epoch_list)-1):
        epoch_min = epoch_list[i]
        epoch_max = epoch_list[i + 1]
        
        if i==0:
            imu_dict[str(epoch_min)] = np.full((18, NUM_DATA), np.nan)

        target_acc_df = acc_df[(epoch_min <= acc_df['millisSinceGpsEpoch'])&(acc_df['millisSinceGpsEpoch'] < epoch_max)]
        target_mag_df = mag_df[(epoch_min <= mag_df['millisSinceGpsEpoch'])&(mag_df['millisSinceGpsEpoch'] < epoch_max)]
        target_gyro_df = gyro_df[(epoch_min <= gyro_df['millisSinceGpsEpoch'])&(gyro_df['millisSinceGpsEpoch'] < epoch_max)]

        if len(target_acc_df)==0:
            target_acc_np = np.full((6, NUM_DATA), np.nan)
            acc_nan_count += 1
        else:
            target_acc_df['bin'] = pd.cut(target_acc_df['millisSinceGpsEpoch'], NUM_DATA)
            target_acc_df = target_acc_df.groupby('bin').mean().reset_index(drop=True)
            target_acc_np = target_acc_df[['UncalAccelXMps2','UncalAccelYMps2','UncalAccelZMps2','BiasXMps2','BiasYMps2','BiasZMps2']].to_numpy().T

        if len(target_mag_df)==0:
            target_mag_np = np.full((6, NUM_DATA), np.nan)
            mag_nan_count += 1
        else:
            target_mag_df['bin'] = pd.cut(target_mag_df['millisSinceGpsEpoch'], NUM_DATA)
            target_mag_df = target_mag_df.groupby('bin').mean().reset_index(drop=True)
            target_mag_np = target_mag_df[['UncalMagXMicroT','UncalMagYMicroT','UncalMagZMicroT','BiasXMicroT','BiasYMicroT','BiasZMicroT']].to_numpy().T

        if len(target_gyro_df)==0:
            target_gyro_np = np.full((6, NUM_DATA), np.nan)
            gyro_nan_count += 1
        else:
            target_gyro_df['bin'] = pd.cut(target_gyro_df['millisSinceGpsEpoch'], NUM_DATA)
            target_gyro_df = target_gyro_df.groupby('bin').mean().reset_index(drop=True)
            target_gyro_np = target_gyro_df[['UncalGyroXRadPerSec','UncalGyroYRadPerSec','UncalGyroZRadPerSec','DriftXRadPerSec','DriftYRadPerSec','DriftZRadPerSec']].to_numpy().T
        target_np = np.concatenate([target_acc_np, target_mag_np, target_gyro_np], axis=0)
        assert target_np.shape == (18,100)
        
        target_np[np.isnan(target_np)] = 99999
        imu_dict[str(epoch_max)] = target_np
    
    print(f"\n{collection_name}-{phone_name}: (acc){acc_nan_count/len(epoch_list)}")
    print(f"{collection_name}-{phone_name}: (mag){mag_nan_count/len(epoch_list)}")
    print(f"{collection_name}-{phone_name}: (gyro){gyro_nan_count/len(epoch_list)}")
    
    if phase == 'train':
        filename = output_train_dir2 + f'{collection_name}_{phone_name}.pkl'
    else:
        filename = output_test_dir2 + f'{collection_name}_{phone_name}.pkl'
    to_pickle(filename, imu_dict)
    return 0

In [15]:
output_train_dir2 = root_dir + 'imu_dataset_v1/train/'
os.makedirs(output_train_dir2, exist_ok=True)

output_test_dir2 = root_dir + 'imu_dataset_v1/test/'
os.makedirs(output_test_dir2, exist_ok=True)

In [16]:
# train
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    path_list = glob.glob(output_train_dir + '*')
    zero = pool.imap_unordered(create_imu_dataset, path_list)
    zero = list(tqdm(zero, total=len(path_list)))

  0%|          | 0/73 [00:00<?, ?it/s]


2020-08-06-US-MTV-2-Pixel4: (acc)0.9994416527079844
2020-08-06-US-MTV-2-Pixel4: (mag)0.9994416527079844
2020-08-06-US-MTV-2-Pixel4: (gyro)0.9994416527079844

2020-08-06-US-MTV-2-Mi8: (acc)0.9994266055045872
2020-08-06-US-MTV-2-Mi8: (mag)0.9994266055045872
2020-08-06-US-MTV-2-Mi8: (gyro)0.9994266055045872

2020-08-03-US-MTV-1-Mi8: (acc)0.999493670886076
2020-08-03-US-MTV-1-Mi8: (mag)0.999493670886076
2020-08-03-US-MTV-1-Mi8: (gyro)0.999493670886076

2020-08-03-US-MTV-1-Pixel4: (acc)0.9995022399203584
2020-08-03-US-MTV-1-Pixel4: (mag)0.9995022399203584
2020-08-03-US-MTV-1-Pixel4: (gyro)0.9995022399203584

2020-09-04-US-SF-2-Mi8: (acc)0.918
2020-09-04-US-SF-2-Mi8: (mag)0.918
2020-09-04-US-SF-2-Mi8: (gyro)0.918

2020-09-04-US-SF-1-Mi8: (acc)0.8862586605080831
2020-09-04-US-SF-1-Mi8: (mag)0.8862586605080831
2020-09-04-US-SF-1-Mi8: (gyro)0.8862586605080831

2021-04-26-US-SVL-1-Pixel5: (acc)0.01644100580270793
2021-04-26-US-SVL-1-Pixel5: (mag)0.01644100580270793
2021-04-26-US-SVL-1-Pixel5: (

In [17]:
# test
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    path_list = glob.glob(output_test_dir + '*')
    zero = pool.imap_unordered(create_imu_dataset, path_list)
    zero = list(tqdm(zero, total=len(path_list)))

  0%|          | 0/48 [00:00<?, ?it/s]


2020-08-03-US-MTV-2-Pixel4XL: (acc)0.9994103773584906
2020-08-03-US-MTV-2-Pixel4XL: (mag)0.9994103773584906
2020-08-03-US-MTV-2-Pixel4XL: (gyro)0.9994103773584906

2020-08-03-US-MTV-2-Mi8: (acc)0.9994262765347103
2020-08-03-US-MTV-2-Mi8: (mag)0.9994262765347103
2020-08-03-US-MTV-2-Mi8: (gyro)0.9994262765347103

2020-08-13-US-MTV-1-Pixel4: (acc)0.999554565701559
2020-08-13-US-MTV-1-Pixel4: (mag)0.999554565701559
2020-08-13-US-MTV-1-Pixel4: (gyro)0.999554565701559

2020-08-13-US-MTV-1-Mi8: (acc)0.9995730145175064
2020-08-13-US-MTV-1-Mi8: (mag)0.9995730145175064
2020-08-13-US-MTV-1-Mi8: (gyro)0.9995730145175064

2020-08-03-US-MTV-2-Pixel4: (acc)0.9994269340974212
2020-08-03-US-MTV-2-Pixel4: (mag)0.9994269340974212
2020-08-03-US-MTV-2-Pixel4: (gyro)0.9994269340974212

2021-04-08-US-MTV-1-SamsungS20Ultra: (acc)0.0
2021-04-08-US-MTV-1-SamsungS20Ultra: (mag)0.0
2021-04-08-US-MTV-1-SamsungS20Ultra: (gyro)0.0

2021-04-08-US-MTV-1-Pixel5: (acc)0.014049586776859505
2021-04-08-US-MTV-1-Pixel5: (m

### IMUデータを使えなさそうな端末(train)
- Mi8すべて
- 2020-08-06-US-MTV-2-Pixel4
- 2020-08-03-US-MTV-1-Pixel4
- 2020-08-06-US-MTV-2-Pixel4XL

以下collectionはすべての端末でIMUが使えなそう
- 2020-08-03-US-MTV-1
- 2020-08-06-US-MTV-2
- 2020-07-17-US-MTV-1
- 2020-07-17-US-MTV-2
ひとまずこれらは放置

## test

In [18]:
sub = pd.read_csv(root_dir + 'sample_submission.csv')
sub['phone'].nunique(), sub['phone'].unique()

(48,
 array(['2020-05-15-US-MTV-1_Pixel4', '2020-05-15-US-MTV-1_Pixel4XL',
        '2020-05-28-US-MTV-1_Pixel4', '2020-05-28-US-MTV-1_Pixel4XL',
        '2020-05-28-US-MTV-2_Pixel4', '2020-05-28-US-MTV-2_Pixel4XL',
        '2020-05-28-US-MTV-2_Pixel4XLModded', '2020-06-04-US-MTV-2_Pixel4',
        '2020-06-04-US-MTV-2_Pixel4XL',
        '2020-06-04-US-MTV-2_Pixel4XLModded', '2020-06-10-US-MTV-1_Pixel4',
        '2020-06-10-US-MTV-1_Pixel4XL',
        '2020-06-10-US-MTV-1_Pixel4XLModded', '2020-06-10-US-MTV-2_Pixel4',
        '2020-06-10-US-MTV-2_Pixel4XL',
        '2020-06-10-US-MTV-2_Pixel4XLModded', '2020-08-03-US-MTV-2_Mi8',
        '2020-08-03-US-MTV-2_Pixel4', '2020-08-03-US-MTV-2_Pixel4XL',
        '2020-08-13-US-MTV-1_Mi8', '2020-08-13-US-MTV-1_Pixel4',
        '2021-03-16-US-MTV-2_Pixel4Modded',
        '2021-03-16-US-MTV-2_SamsungS20Ultra',
        '2021-03-16-US-RWC-2_Pixel4XL', '2021-03-16-US-RWC-2_Pixel5',
        '2021-03-16-US-RWC-2_SamsungS20Ultra', '2021-03-25-US-PAO-1_