# 데이터 전처리 알고리즘
- 본 연구는 2019년 라이프로그 데이터를 대상으로 실험을 진행함
- 데이터 구성은 아래와 같음
    - 유저명 디렉토리 
        - 타임스탬프 디렉토리 
            - 유저 라벨 정보 파일 (`<타임스탬프>_label.csv`)
            - 센서별 디렉토리 (e4Acc, e4Bvp, e4Eda, e4Hr, e4Temp, mAcc, mGps, mGyr, mMag)
                - 라벨 타임스탬프별 센서 측정값 파일 (`<타임스탬프>.csv`)

# 필요 라이브러리 로드

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

import warnings
# Install a blanket 'ignore' filter so that no warning of any category is
# shown for the rest of the session.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings raised by later cells -- consider narrowing it to specific
# categories while debugging.
warnings.filterwarnings( 'ignore' )

# Data load

## 2018년 사용자와 2019년 사용자의 개인정보 파일 로드
- 필요한 2019년 사용자의 정보만 추출

In [4]:
# Path to user_info_2019_2018_updated.csv (personal information of the 2018
# and 2019 users).
# userId is read explicitly as str: the IDs are later concatenated into
# directory paths, so they must not be inferred as integers.
user_info_df = pd.read_csv(
    '/workspace/data/user_info_2019_2018_updated.csv',
    dtype={'userId': str},
)
# Index the table by user ID so per-user attributes can be looked up by label.
user_info_df.set_index('userId', inplace=True)
user_info_df.head()

Unnamed: 0_level_0,gender,age,height,weight,startDt,endDt
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
101,F,23,161.0,52.0,2020-01-01,2020-01-13
102,F,22,161.0,50.0,2019-12-09,2019-12-23
103,F,19,171.0,61.0,2020-01-04,2020-01-16
104,F,24,165.0,52.0,2019-12-04,2019-12-23
105,F,19,167.0,68.0,2019-12-04,2019-12-19


In [7]:
# Show the first 20 user IDs; these are the users processed in the next cell.
user_info_df.index[:20].tolist()

['101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120']

# 2019년 유저 정보를 추출하여 유저별로 데이터프레임을 생성 후 pickle로 저장

In [9]:
# Build one DataFrame per user and pickle it.
#
# For every user directory under the 2019 dataset root:
#   1. concatenate all `<label>/<label>_label.csv` files into one frame,
#   2. add the user's gender / height / weight from `user_info_df`,
#   3. attach, per label timestamp (`ts`), the raw values of each sensor CSV
#      found at `<label>/<sensor>/<ts>.csv` (first CSV column dropped),
#   4. keep only the rows that have data for every sensor,
#   5. dump the result to `<user_name>_prpDf.pickle` in the working directory.

# Root directory that holds the 2019 per-user data.
dataset_path = '/workspace/data/dataset_2019/'

# Columns kept from the label files plus the user attributes added below.
label_columns = ['userName', 'ts', 'emotionPositive', 'emotionTension',
                 'action', 'gender', 'height', 'weight']

# Sensor sub-directory names; each also becomes a column of the same name.
sensor_names = ['e4Acc', 'e4Bvp', 'e4Eda', 'e4Hr', 'e4Temp',
                'mAcc', 'mGps', 'mGyr', 'mMag']

for user_name in tqdm_notebook(list(user_info_df.index[:20])):
    temp_user_path = os.path.join(dataset_path, user_name)

    # Only directories are label folders. Sorting makes the row order
    # reproducible, because os.listdir returns entries in arbitrary order.
    temp_user_labels = sorted(
        entry for entry in os.listdir(temp_user_path)
        if os.path.isdir(os.path.join(temp_user_path, entry))
    )

    # `axis` / `ignore_index` are passed by keyword: pandas >= 2.0 no longer
    # accepts them positionally.
    prpDf = pd.concat(
        [pd.read_csv(os.path.join(temp_user_path, label, label + '_label.csv'))
         for label in temp_user_labels],
        axis=0,
        ignore_index=True,
    )

    # Per-user constants are broadcast to every row.
    prpDf['userName'] = user_name
    for attribute in ('gender', 'height', 'weight'):
        prpDf[attribute] = user_info_df.loc[user_name, attribute]

    # .copy() so the column assignments below modify an independent frame
    # rather than a slice of the concatenated one.
    prpDf = prpDf[label_columns].copy()

    # Empty placeholders; rows whose placeholder is never replaced are
    # dropped at the end.
    for sensor in sensor_names:
        prpDf[sensor] = [[] for _ in range(len(prpDf))]

    prpDf.set_index('ts', inplace=True)

    for label in tqdm_notebook(temp_user_labels):
        for sensor in sensor_names:
            sensor_dir = os.path.join(temp_user_path, label, sensor)
            if not os.path.isdir(sensor_dir):
                # No recordings for this sensor in this label folder; the
                # affected rows stay empty and are filtered out below.
                continue

            for file_name in os.listdir(sensor_dir):
                # The file stem is the label timestamp the recording belongs to.
                try:
                    ts = int(os.path.splitext(file_name)[0])
                except ValueError:
                    continue  # not a `<timestamp>.csv` file

                # Skip recordings without a label row before reading them.
                if ts not in prpDf.index:
                    continue

                # Drop the first CSV column and keep the remaining values.
                # .at writes into the frame itself; chained indexing
                # (df[col][key] = ...) may write into a temporary copy.
                prpDf.at[ts, sensor] = pd.read_csv(
                    os.path.join(sensor_dir, file_name)
                ).values[:, 1:]

    # Record the number of samples per sensor, then keep only the rows that
    # have at least one sample for every sensor.
    length_columns = []
    for sensor in sensor_names:
        prpDf[sensor + 'Len'] = prpDf[sensor].map(len)
        length_columns.append(sensor + 'Len')
    prpDf = prpDf[(prpDf[length_columns] > 0).all(axis=1)].copy()

    with open(user_name + '_prpDf.pickle', 'wb') as f:
        pickle.dump(prpDf, f, pickle.HIGHEST_PROTOCOL)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]