## create dataset 

v2:lightGBM用delta学習用のデータセットを作成する 

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
root_dir = '../input/'

In [3]:
import pickle
def to_pickle(filename, obj):
    """Serialize ``obj`` with pickle into the file at ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(filename, 'wb') as sink:
        # Stream straight into the file handle (no intermediate bytes object).
        pickle.Pickler(sink).dump(obj)
        
def from_pickle(filename):
    """Load and return the object pickled in the file at ``filename``.

    NOTE: unpickling executes code embedded in the file; only use this on
    files produced by a trusted source.
    """
    with open(filename, 'rb') as source:
        return pickle.Unpickler(source).load()

## 1段階目(ホストのコードでtxt2dfする)

In [4]:
output_train_dir = root_dir + 'imu_dataset_v0/train/'
os.makedirs(output_train_dir, exist_ok=True)

In [5]:
output_test_dir = root_dir + 'imu_dataset_v0/test/'
os.makedirs(output_test_dir, exist_ok=True)

In [6]:
def gnss_log_to_dataframes(path):
    """Parse a GNSS log text file into one DataFrame per log section.

    Each line is comma separated and its first field names the section it
    belongs to. A line starting with ``#`` whose first field is a known
    section name supplies the column names for that section; every other
    ``#`` line (notes, version numbers, ...) is ignored. Non-header lines are
    data rows for the section named by their first field.

    Args:
        path: Path of the log file to read.

    Returns:
        dict mapping each of the section names ``'Raw'``, ``'UncalAccel'``,
        ``'UncalGyro'``, ``'UncalMag'``, ``'Fix'``, ``'Status'`` and
        ``'OrientationDeg'`` to a DataFrame. A section with no data rows
        yields an empty DataFrame. All columns except ``'CodeType'`` are
        converted with ``pd.to_numeric``.

    Raises:
        ValueError: If a column other than ``'CodeType'`` holds a value that
            ``pd.to_numeric`` cannot parse, or if a data row does not match
            the number of header columns of its section.
    """
    # A tuple (rather than a set) keeps the key order of the result stable.
    gnss_section_names = (
        'Raw', 'UncalAccel', 'UncalGyro', 'UncalMag',
        'Fix', 'Status', 'OrientationDeg',
    )
    datas = {k: [] for k in gnss_section_names}
    gnss_map = {k: [] for k in gnss_section_names}

    with open(path) as f_open:
        # Iterate the handle directly instead of loading every line up front.
        for dataline in f_open:
            is_header = dataline.startswith('#')
            fields = dataline.strip('#').strip().split(',')
            section = fields[0]
            if section not in datas:
                # Notes, version numbers, blank lines and record types we do
                # not collect. Previously a non-header line of this kind
                # raised KeyError.
                continue
            if is_header:
                gnss_map[section] = fields[1:]
            else:
                datas[section].append(fields[1:])

    results = {
        k: pd.DataFrame(rows, columns=gnss_map[k]) for k, rows in datas.items()
    }
    # pandas doesn't properly infer types from these lists by default
    for df in results.values():
        for col in df.columns:
            if col == 'CodeType':
                # CodeType is textual; leave it as parsed.
                continue
            df[col] = pd.to_numeric(df[col])

    return results

In [7]:
def add_name(df, collection_name, phone_name):
    """Tag every row of ``df`` with its collection and phone name.

    The columns ``collectionName`` and ``phoneName`` are written into ``df``
    in place; the same DataFrame object is returned for convenience.
    """
    labels = {'collectionName': collection_name, 'phoneName': phone_name}
    for column, value in labels.items():
        df[column] = value
    return df

In [8]:
def create_imu_base_dataset(path):
    """Load the GNSS log of one phone directory as labelled DataFrames.

    Looks for ``<phone>_GnssLog.txt`` inside ``path``, parses it with
    ``gnss_log_to_dataframes`` and adds ``collectionName`` / ``phoneName``
    columns to every section via ``add_name``.

    Args:
        path: Directory of a single recording, laid out as
            ``.../<collection>/<phone>``. The two names are taken from the
            last two path components.

    Returns:
        Tuple ``(raw_df, acc_df, gyro_df, mag_df, fix_df, status_df,
        orient_df)`` built from the sections ``Raw``, ``UncalAccel``,
        ``UncalGyro``, ``UncalMag``, ``Fix``, ``Status`` and
        ``OrientationDeg``. When no log file is found, each frame has no rows
        and only the two name columns.
    """
    # Order matters: it defines the order of the returned tuple.
    section_order = (
        'Raw', 'UncalAccel', 'UncalGyro', 'UncalMag',
        'Fix', 'Status', 'OrientationDeg',
    )

    # Derive the names from the tail of the path rather than from fixed
    # split('/') indices, which only worked for one specific root depth and
    # for '/'-separated paths.
    normalized = os.path.normpath(path)
    phone_name = os.path.basename(normalized)
    collection_name = os.path.basename(os.path.dirname(normalized))

    # get GnssLog file
    frames_by_section = {section: [] for section in section_order}
    for file_path in glob.glob(os.path.join(path, f"{phone_name}_GnssLog.txt")):
        result_dict = gnss_log_to_dataframes(file_path)
        for section in section_order:
            frames_by_section[section].append(result_dict[section])

    # Concatenate once per section; fall back to an empty frame when no log
    # file matched (pd.concat rejects an empty list).
    return tuple(
        add_name(
            pd.concat(frames) if frames else pd.DataFrame(),
            collection_name,
            phone_name,
        )
        for frames in frames_by_section.values()
    )


In [9]:
# train: parse every train/<collection>/<phone> directory in parallel and
# pickle one combined DataFrame per log section.
import multiprocessing

# Sorted paths + ordered imap make the row order of the pickles reproducible;
# imap_unordered delivered the per-phone frames in a run-dependent order.
path_list = sorted(glob.glob(os.path.join(root_dir, 'train/*/*')))
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    results = pool.imap(create_imu_base_dataset, path_list)
    results = list(tqdm(results, total=len(path_list)))

# create_imu_base_dataset returns a 7-tuple:
# (raw, acc, gyro, mag, fix, status, orient)
section_count = 7
if results:
    combined = []
    # zip(*results) regroups the per-phone tuples into one tuple of frames
    # per section, so each section is concatenated exactly once instead of
    # being re-copied on every loop iteration.
    for frames in tqdm(zip(*results), total=section_count):
        # Leave out row-less frames so they play no part in the result
        # dtypes; keep the first frame as a fallback so the column names
        # survive when every frame of a section is empty.
        non_empty = [frame for frame in frames if not frame.empty]
        combined.append(
            pd.concat(non_empty or list(frames[:1]), ignore_index=True)
        )
else:
    # No input directories found: still produce (empty) outputs.
    combined = [pd.DataFrame() for _ in range(section_count)]

(all_raw_df, all_acc_df, all_gyro_df, all_mag_df,
 all_fix_df, all_status_df, all_orient_df) = combined

to_pickle(output_train_dir + 'raw.pkl', all_raw_df)
to_pickle(output_train_dir + 'acc.pkl', all_acc_df)
to_pickle(output_train_dir + 'gyro.pkl', all_gyro_df)
to_pickle(output_train_dir + 'mag.pkl', all_mag_df)
to_pickle(output_train_dir + 'fix.pkl', all_fix_df)
to_pickle(output_train_dir + 'status.pkl', all_status_df)
# The orientation frame was previously built but never written out.
to_pickle(output_train_dir + 'orient.pkl', all_orient_df)

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

In [10]:
# test: parse every test/<collection>/<phone> directory in parallel and
# pickle one combined DataFrame per log section.
import multiprocessing

# Sorted paths + ordered imap make the row order of the pickles reproducible;
# imap_unordered delivered the per-phone frames in a run-dependent order.
path_list = sorted(glob.glob(os.path.join(root_dir, 'test/*/*')))
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    results = pool.imap(create_imu_base_dataset, path_list)
    results = list(tqdm(results, total=len(path_list)))

# create_imu_base_dataset returns a 7-tuple:
# (raw, acc, gyro, mag, fix, status, orient)
section_count = 7
if results:
    combined = []
    # zip(*results) regroups the per-phone tuples into one tuple of frames
    # per section, so each section is concatenated exactly once instead of
    # being re-copied on every loop iteration.
    for frames in tqdm(zip(*results), total=section_count):
        # Leave out row-less frames so they play no part in the result
        # dtypes; keep the first frame as a fallback so the column names
        # survive when every frame of a section is empty.
        non_empty = [frame for frame in frames if not frame.empty]
        combined.append(
            pd.concat(non_empty or list(frames[:1]), ignore_index=True)
        )
else:
    # No input directories found: still produce (empty) outputs.
    combined = [pd.DataFrame() for _ in range(section_count)]

(all_raw_df, all_acc_df, all_gyro_df, all_mag_df,
 all_fix_df, all_status_df, all_orient_df) = combined

to_pickle(output_test_dir + 'raw.pkl', all_raw_df)
to_pickle(output_test_dir + 'acc.pkl', all_acc_df)
to_pickle(output_test_dir + 'gyro.pkl', all_gyro_df)
to_pickle(output_test_dir + 'mag.pkl', all_mag_df)
to_pickle(output_test_dir + 'fix.pkl', all_fix_df)
to_pickle(output_test_dir + 'status.pkl', all_status_df)
# The orientation frame was previously built but never written out.
to_pickle(output_test_dir + 'orient.pkl', all_orient_df)

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]