In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/Parkinson

/content/drive/MyDrive/Parkinson


### Settings

In [3]:
import os
import pandas as pd
import numpy as np
import random, re

import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

In [31]:
# 데이터 경로 (Prep 수행 후)
DATASETPATH = './dataset'
DATA_CONTROL = DATASETPATH + '/Controls'  # '/content/drive/MyDrive/Parkinson/dataset/Controls'
DATA_PD = DATASETPATH + '/PD'             # '/content/drive/MyDrive/Parkinson/dataset/Controls'

# NULL 정보 저장 경로
SAVEPATH = os.path.join(os.getcwd(), 'NULL_info.csv')

In [24]:
MARKERS = ['LFHD', 'RFHD', 'LBHD', 'RBHD', 'C7', 'LSHO', 'RSHO', 'CLAV', 'RBAK',
           'LUPA', 'RUPA', 'STRN', 'T10', 'LELB', 'RELB', 'LFRM', 'RFRM',
           'LWRA', 'RWRA', 'LWRB', 'RWRB', 'LFIN', 'RFIN', 'LASI', 'RASI',
           'LPSI', 'RPSI', 'LTHI', 'RTHI', 'LKNE', 'RKNE', 'LTIB', 'RTIB',
           'LANK', 'RANK', 'LTOE', 'RTOE', 'LHEE', 'RHEE']
len(MARKERS)

39

### Find NULL data

In [34]:
def findNull(target_path):
    # target_path = './dataset/PD' or './dataset/Controls'
    
    # NULL 발견 시 저장할 정보 목록
    df_null = pd.DataFrame()
    paths, names, categories, trials = [], [], [], []  # 파일경로, 환자명, 소속, Trial
    markers, null_frames, frame_len, ratios, remove = [], [], [], [], []

    category = target_path.split('/')[-1]
    for filename in sorted(os.listdir(target_path)):
        # Exception
        if category == 'PD' and filename == 'KMS_FW3.csv': continue
        # if 'BHY' not in filename: continue

        path = os.path.join(target_path, filename)
        _, name, trial = filename[:-4].split('_') # PREP_{이니셜}_{FW/BW}{1~3}.csv

        df = pd.read_csv(path, index_col=0)
        for col in df.columns[1:]:
            if col[:-2] in markers: continue  # X,Y,Z 컬럼이 똑같이 결측이므로 셋 중 하나만 보면 됨
            
            if df[col].isna().sum() > 0:  # NULL 존재
                # 해당 csv 파일 정보 저장
                paths.append(path)
                names.append(name)
                categories.append(category)
                trials.append(trial)
                
                # NULL이 존재하는 마커 저장
                markers.append(col[:-2])
                
                # NULL인 부분의 Frame 저장
                null = df.loc[df[col].isna()].index.values

                # NULL인 부분의 프레임 범위를 이중리스트로 저장: [[시작, 끝], ..., [시작, 끝]]
                null = np.append(null, np.array([-1])) 
                # 마지막에 -1 원소 추가하는 이유: 밑에서 while문 돌릴 때 index error 방지

                def min_max_scaling(v: list):  # v = [start, end] or [point]
                    id_min, id_max = min(df.index), max(df.index)
                    return [ np.round((e - id_min) / (id_max - id_min), 4) for e in v]
                
                null_range = []
                i,j = 0,1
                while j < len(null):
                    while null[j] - null[j-1] == 1 and j < len(null)-1:
                        j += 1
                    if i == j-1: 
                        null_range.append(min_max_scaling([null[i]]))
                    else: 
                        null_range.append( min_max_scaling([null[i], null[j-1]]) )
                    i = j
                    j = i+1
                # sum(null_range, []) : 이중리스트 -> 1차원으로 변환
                

                # 해당 csv의 프레임 범위, 길이, 결측비율, 제거여부 저장
                # frame_ranges.append([min(df.index), max(df.index)])
                frame_len.append(len(df))
                ratio = round((len(null) - 1) / len(df), 4)
                null_frames.append(null_range)
                ratios.append( ratio )
                remove.append( ratio > 0.5 )


    df_null['path'] = paths
    df_null['category'] = categories
    df_null['name'] = names
    df_null['trial'] = trials
    df_null['marker'] = markers
    df_null['NULL frames'] = null_frames

    # df_null['full frame range'] = frame_ranges
    df_null['frame length'] = frame_len
    df_null['ratio'] = ratios
    df_null['remove'] = remove

    return df_null

In [35]:
df_null_Ctrl = findNull(DATA_CONTROL)
print(df_null_Ctrl.shape)
df_null_Ctrl.head()

(39, 9)


Unnamed: 0,path,category,name,trial,marker,NULL frames,frame length,ratio,remove
0,./dataset/Controls/PREP_BHY_BW3.csv,Controls,BHY,BW3,RBAK,"[[0.9608, 1.0]]",486,0.0412,False
1,./dataset/Controls/PREP_BHY_BW3.csv,Controls,BHY,BW3,RFRM,"[[0.9918, 0.9979]]",486,0.0082,False
2,./dataset/Controls/PREP_BHY_FW1.csv,Controls,BHY,FW1,LFRM,"[[0.917, 0.9965]]",290,0.0828,False
3,./dataset/Controls/PREP_BHY_FW2.csv,Controls,BHY,FW2,LUPA,[[0.9968]],316,0.0032,False
4,./dataset/Controls/PREP_CHH_BW3.csv,Controls,CHH,BW3,LWRA,"[[0.0, 0.003]]",662,0.0045,False


In [36]:
df_null_PD = findNull(DATA_PD)
print(df_null_PD.shape)
df_null_PD.head()

(39, 9)


Unnamed: 0,path,category,name,trial,marker,NULL frames,frame length,ratio,remove
0,./dataset/PD/PREP_AMJ_BW1.csv,PD,AMJ,BW1,LTIB,"[[0.9988, 1.0]]",847,0.0024,False
1,./dataset/PD/PREP_AMJ_BW3.csv,PD,AMJ,BW3,LWRB,"[[0.0, 0.0219]]",867,0.0231,False
2,./dataset/PD/PREP_AMJ_FW1.csv,PD,AMJ,FW1,LUPA,"[[0.0, 0.0045]]",441,0.0068,False
3,./dataset/PD/PREP_AMJ_FW2.csv,PD,AMJ,FW2,LTHI,"[[0.0, 0.0266]]",339,0.0295,False
4,./dataset/PD/PREP_AMJ_FW3.csv,PD,AMJ,FW3,LWRA,"[[0.9944, 1.0]]",355,0.0085,False


In [37]:
df_null = pd.concat([df_null_PD, df_null_Ctrl]).reset_index(drop=True)
df_null.to_csv(SAVEPATH)

### NULL Data Analysis

In [38]:
df_null['name'].value_counts()

BGH       33
ODS        9
SHI        9
AMJ        5
BHY        4
NTH        4
KES        3
ParkSD     3
HSH        2
JHY        2
BDY        1
CHH        1
JJG        1
KimYC      1
Name: name, dtype: int64

In [42]:
df_null.groupby(by=['name','trial'])[['remove']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,remove
name,trial,Unnamed: 2_level_1
AMJ,BW1,0
AMJ,BW3,0
AMJ,FW1,0
AMJ,FW2,0
AMJ,FW3,0
BDY,FW3,0
BGH,BW1,1
BGH,FW1,23
BHY,BW3,0
BHY,FW1,0


In [None]:
# df_null.groupby(by=['marker']).apply(lambda x: x)

- null 부분 제거하는 함수
- x,y,z 채널(3d tensor) 변환 -> 이미지로 저장
   
- flow_from_directory \
  target_size = (통일한 frame 수, 39)
- batch size
- CNN 모델 관련 \
  Conv2D layer 수, filter size, stride, optimizer, lr, loss

- 코드 템플릿: TF 자격증 - Part 1 - Week 3,4