# 데이터 전처리 알고리즘
- 본 연구는 2019년 라이프로그 데이터를 대상으로 연구 및 실험 진행
- 데이터 구성은 아래와 같음
    - 유저명 디렉토리 
        - 타임스탬프 디렉토리 
            - 유저 라벨 정보.csv
            - 타임스탬프에서 측정된 센서값이 있는 디렉토리
                - 타임스탬프 별 센서 정보

## User별 Time-Series data 전처리


### 필요 라이브러리 로드

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import re 
import warnings
warnings.filterwarnings( 'ignore' )

### 시간대 별 특징을 추출하기 위한 함수 정의

지정한 주기(`freq`, 아래 전처리에서는 5초) 단위로 구간을 나누고 구간별 Max값을 추출하여 정리하였음.
센서마다 제각각인 측정 시간대를 동일한 간격의 시간축으로 통일하고, 값이 없는 구간은 보간(interpolate)으로 채움.


In [None]:
import datetime
def group_sec(data, freq):
    """Down-sample sensor readings onto a regular time grid.

    Rows are bucketed into consecutive ``freq``-wide windows according to
    their ``timestamp`` column (read as seconds since the Unix epoch). Each
    window keeps the column-wise maximum, and windows that received no
    reading are filled with ``DataFrame.interpolate()`` (linear by default).

    Args:
        data: DataFrame with a numeric ``timestamp`` column in seconds.
            The frame is left untouched.
        freq: pandas offset alias giving the window width, e.g. ``'5s'``.

    Returns:
        DataFrame with one row per window: a ``datetime`` column holding the
        window start, followed by the aggregated input columns.
    """
    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain a 'datetime' column.
    indexed = data.assign(
        datetime=pd.to_datetime(data['timestamp'], unit='s')
    ).set_index('datetime')

    resampled = indexed.groupby(pd.Grouper(freq=freq)).agg('max')
    # Windows without samples come out as NaN; fill them from their neighbours.
    resampled = resampled.interpolate()

    return resampled.reset_index()

In [None]:
# Collapse every raw sensor directory into a single resampled CSV.
#
# Directory layout walked below:
#   <pre_pre_dir>/<user>/<session timestamp>/<sensor>/<chunk timestamp>.csv
# For each <sensor> directory a sibling file "<sensor>.csv" is written into the
# session directory.
pre_pre_dir = '/workspace/data/dataset_2019/'

for d in os.listdir(pre_pre_dir):
    pre_dir = f'{pre_pre_dir}{d}/'

    for ts in os.listdir(pre_dir):
        time_pre_dir = f'{pre_dir}{ts}/'

        for ma in os.listdir(time_pre_dir):
            sensor_dir = f'{time_pre_dir}{ma}/'
            # Only sensor directories are processed. Plain files (the label
            # CSV, or "<sensor>.csv" outputs from an earlier run) are skipped.
            # Checking the file system avoids the IndexError that `ma[-4]`
            # raised for names shorter than four characters.
            if not os.path.isdir(sensor_dir):
                continue

            frames = []
            # Sorted so the output row order does not depend on listdir order.
            for detail_timestamp in sorted(os.listdir(sensor_dir)):
                tmp_data = pd.read_csv(sensor_dir + detail_timestamp)
                # The chunk's file name (an integer) is added to its timestamp
                # column -- presumably the column stores offsets relative to
                # the chunk start; TODO confirm against the dataset spec.
                tmp_data['timestamp'] += int(detail_timestamp.replace('.csv', ''))

                tmp_data = group_sec(tmp_data, '5s')
                # Prefix columns with the sensor name so they stay unique
                # once several sensors are merged side by side.
                tmp_data.rename(columns=lambda x: ma + "__" + x, inplace=True)
                frames.append(tmp_data)

            # Concatenate once instead of growing a DataFrame inside the loop.
            # An empty sensor directory still yields an (empty) CSV, as before.
            df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
            df.to_csv(time_pre_dir + ma + '.csv')


### 전처리한 데이터를 바탕으로 모델에 맞는 데이터셋 구축

In [None]:
import os
import pandas as pd 
import numpy as np

# Build one merged CSV per user: all per-sensor CSVs of every session joined
# on their 5-second 'datetime' grid, then joined with the session labels.
ppre_dir = '/workspace/data/dataset_2019/'

# DataFrame.to_csv does not create missing directories.
os.makedirs('processed_data', exist_ok=True)


def _load_modality(csv_path, modality):
    """Read one "<modality>.csv" file and normalise its time columns.

    Drops the "<modality>__timestamp" column and renames
    "<modality>__datetime" to a string-typed "datetime" column used as the
    merge key.

    Returns:
        The prepared DataFrame, or None when the file contains no rows.
    """
    frame = pd.read_csv(csv_path, index_col=0)
    if frame.shape[0] <= 0:
        return None
    frame = frame.drop(columns=[modality + '__timestamp'])
    frame = frame.rename(columns={modality + '__datetime': 'datetime'})
    frame['datetime'] = pd.to_datetime(frame['datetime']).astype(str)
    return frame


for filename in os.listdir(ppre_dir):
    _pre_dir = ppre_dir + filename + '/'
    session_names = os.listdir(_pre_dir)

    # --- sensor data: inner-join all modalities within each session --------
    session_frames = []
    for d in session_names:
        pre_dir = _pre_dir + d + '/'
        # Reset per session so a session without usable CSVs can never
        # re-append the previous session's frame.
        t_df = None
        # Sorted so the merged column order is the same for every user.
        for file in sorted(os.listdir(pre_dir)):
            # Skip sensor directories (no extension) and the label file.
            if ('.' not in file) or ('label' in file):
                continue
            tmp_df = _load_modality(pre_dir + file, file.replace('.csv', ''))
            if tmp_df is None:
                # NOTE(review): as in the original, an empty modality stops
                # the session but keeps whatever was merged so far -- confirm
                # this is intended rather than dropping the whole session.
                break
            if t_df is None:
                t_df = tmp_df
            else:
                t_df = pd.merge(t_df, tmp_df, on=['datetime'], how='inner')
        if t_df is not None:
            session_frames.append(t_df)

    df = pd.concat(session_frames) if session_frames else pd.DataFrame()
    df = df.set_index(['datetime'])
    df = df.sort_index()

    # --- labels: forward-fill each session's labels onto a 5-second grid ---
    label_frames = []
    for f in session_names:
        file_path = f'{_pre_dir}{f}/{f}_label.csv'
        label = pd.read_csv(file_path)
        if label.shape[0] <= 0:
            continue
        label['datetime'] = pd.to_datetime(label['ts'], unit='s')

        label = label.set_index(['datetime'])
        # resample() requires unique index values.
        label = label[~label.index.duplicated()]
        label_frames.append(label.resample('5s').ffill())

    concat_label = pd.concat(label_frames) if label_frames else pd.DataFrame()
    concat_label = concat_label.sort_index()
    concat_label = concat_label.drop(columns=['ts'])

    # --- join sensors with labels on the shared timestamp ------------------
    df = df.reset_index()
    concat_label = concat_label.reset_index()
    df['datetime'] = pd.to_datetime(df['datetime'])
    concat_label['datetime'] = pd.to_datetime(concat_label['datetime'])
    merged_df = pd.merge(df, concat_label, on='datetime', how='inner')
    # fillna returns a new frame; the result must be kept or NaNs are written.
    merged_df = merged_df.fillna(0)
    merged_df = merged_df.set_index('datetime')
    merged_df.to_csv(f'processed_data/tmp_merged_{filename}.csv')

## Data load

### 2019년 사용자 개인정보 파일 로드
- 이전 과정에서 전처리한 데이터를 Load
- 필요한 2019년 사용자의 정보만 추출

In [4]:
# Replace 'data_dir' with the path to user_info_2019_2018_updated.csv.
# The table is keyed by userId so per-user attributes can be looked up by name.
user_info_df = pd.read_csv('data_dir').set_index('userId')
user_info_df.head()

userId,gender,age,height,weight,startDt,endDt
101,F,23,161.0,52.0,2020-01-01,2020-01-13
102,F,22,161.0,50.0,2019-12-09,2019-12-23
103,F,19,171.0,61.0,2020-01-04,2020-01-16
104,F,24,165.0,52.0,2019-12-04,2019-12-23
105,F,19,167.0,68.0,2019-12-04,2019-12-19


#### 전처리된 Time-Series 데이터를 바탕으로 Model 입력에 맞도록 모달리티별 차원 조절

In [9]:
# Sensor sub-directories read from every session directory. Each becomes an
# object column holding one sample array per label row.
SENSOR_NAMES = ['e4Acc', 'e4Bvp', 'e4Eda', 'e4Hr', 'e4Temp',
                'mAcc', 'mGps', 'mGyr', 'mMag']

# Root directory holding the per-user 2019 data.
dataset_path = '/workspace/data/processed_data/'


def _read_sensor_chunks(sensor_dir):
    """Load every chunk CSV of one sensor directory.

    Args:
        sensor_dir: directory path ending in '/', containing files named
            "<integer timestamp>.csv".

    Returns:
        dict mapping the integer file stem to the file's values with the
        first column removed.
    """
    chunks = {}
    for csv_name in os.listdir(sensor_dir):
        try:
            chunk_ts = int(csv_name[:-4])
        except ValueError:
            # A stem that is not an integer cannot match any label row.
            continue
        chunks[chunk_ts] = pd.read_csv(sensor_dir + csv_name).values[:, 1:]
    return chunks


for user_name in tqdm_notebook(list(user_info_df.index[:20])):
    temp_user_path = f'{dataset_path}{user_name}/'
    temp_user_labels = os.listdir(temp_user_path)

    # One label file per session directory, stacked into a single table.
    # `axis` is passed by keyword: pandas 2.x rejects the positional form.
    prpDf = pd.concat(
        [pd.read_csv(f'{temp_user_path}{session}/{session}_label.csv')
         for session in temp_user_labels],
        axis=0,
    )
    prpDf.reset_index(drop=True, inplace=True)

    # Per-user constants are broadcast to every label row.
    prpDf['userName'] = user_name
    for attr in ('gender', 'height', 'weight'):
        prpDf[attr] = user_info_df[attr][user_name]

    prpDf = prpDf[['userName', 'ts', 'emotionPositive', 'emotionTension',
                   'action', 'gender', 'height', 'weight']].set_index('ts')

    # Gather all chunks first (timestamp -> array, per sensor). Sessions are
    # visited in listing order, so a later session overrides an earlier one
    # for the same timestamp, as in the original per-cell assignment.
    sensor_chunks = {name: {} for name in SENSOR_NAMES}
    for session in tqdm_notebook(temp_user_labels):
        for name in SENSOR_NAMES:
            sensor_chunks[name].update(
                _read_sensor_chunks(f'{temp_user_path}{session}/{name}/'))

    # Attach each sensor as a whole column. This replaces the chained
    # `prpDf[col][ts] = arr` writes, which are unreliable (and a no-op under
    # pandas Copy-on-Write), and the bare `except:` that hid their failures.
    no_data = np.array([])
    for name in SENSOR_NAMES:
        chunks = sensor_chunks[name]
        # Filled element by element so NumPy stores each array as one object
        # instead of trying to broadcast it.
        column = np.empty(len(prpDf), dtype=object)
        for row, ts in enumerate(prpDf.index):
            column[row] = chunks.get(ts, no_data)
        prpDf[name] = column

    # Keep only label rows that have data for every sensor; the "<sensor>Len"
    # columns record the number of samples per row.
    for name in SENSOR_NAMES:
        length_col = f'{name}Len'
        prpDf[length_col] = prpDf[name].map(len)
        prpDf = prpDf[prpDf[length_col] != 0].copy()

    with open(f'{user_name}_prpDf.pickle', 'wb') as f:
        pickle.dump(prpDf, f, pickle.HIGHEST_PROTOCOL)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]