## 1. Raw data Info

In [1]:
import os
import glob

# Root folder of the raw recordings and its two per-group sub-folders.
RAWDATAPATH = "./FW&BW_Rawdata/"
RAW_CONTROL, RAW_PD = (
    os.path.join(RAWDATAPATH, group) for group in ("Controls", "PD")
)

In [2]:
# Check the raw data: count the distinct identifiers per group.
# The identifier is the first "_"-separated token of the file *name*.
# Splitting the basename (instead of the whole path) keeps the result
# independent of underscores that happen to appear in directory names.

files = glob.glob(os.path.join(RAW_CONTROL, "*.csv"))
fnames = {os.path.basename(f).split('_')[0] for f in files}
print("Controls:", len(fnames))

files = glob.glob(os.path.join(RAW_PD, "*.csv"))
fnames = {os.path.basename(f).split('_')[0] for f in files}
print("      PD:", len(fnames))

Controls: 22
      PD: 83


## 2. Prep

In [3]:
import os
import glob
import pandas as pd

# When True, the preprocessing functions print progress messages.
DEBUG = False

# Output root for the preprocessed CSV files and its per-group sub-folders.
DATASETPATH = './dataset_/'
DATA_CONTROL = os.path.join(DATASETPATH, 'Controls')
DATA_PD = os.path.join(DATASETPATH, 'PD')

# exist_ok=True makes re-running this cell safe and avoids the
# check-then-create race of "if not exists: mkdir".
os.makedirs(DATASETPATH, exist_ok=True)

In [4]:
def generatePrepData(rawdatapath):
    """Preprocess every raw CSV in ``rawdatapath`` into a ``PREP_*`` CSV.

    Each file ending in ``.csv`` directly inside ``rawdatapath`` is cleaned by
    the nested ``prepData`` helper and written to
    ``<DATASETPATH>/<last component of rawdatapath>/PREP_<file name>``.
    The output directory is created when missing.

    Args:
        rawdatapath: Directory containing the raw CSV files.  Its last path
            component (e.g. ``Controls`` or ``PD``) names the output
            sub-directory; a trailing path separator is tolerated.

    Raises:
        ValueError: If a file does not have the expected layout (see
            ``prepData``).
    """

    def prepData(target_path, target_file, ptype):
        """Clean one raw CSV and save it as ``PREP_<target_file>``.

        Layout the code relies on, after skipping the first two lines:
        a header line whose named cells look like ``<prefix>:<name>``, then a
        row of per-column labels, then a row of units, then the data rows.
        NOTE(review): presumably these are motion-capture exports with
        X/Y/Z coordinates per marker -- the code itself only assumes that
        each named header is followed by a group of three 'mm' columns.
        """
        if DEBUG:
            print("[ * ] prepData:", target_file)

        # 1. Read the file, skipping its first two lines.
        dff = pd.read_csv(target_path, skiprows=2, encoding='utf-8')

        # 2. Strip trailing columns that hold no values at all.
        #    The length guard stops the loop on a frame without data instead
        #    of failing with an opaque IndexError.
        while len(dff.columns) and dff.iloc[:, -1].isna().all():
            dff = dff.iloc[:, :-1]

        if dff.shape[0] < 2 or dff.shape[1] < 2:
            raise ValueError(
                f"{target_path}: expected a label row, a unit row and at "
                f"least two columns, got shape {dff.shape}"
            )

        # 3. Keep the first two columns plus every column whose unit
        #    (second row) is 'mm'.  The comparison is evaluated once for the
        #    whole row rather than once per column.
        is_mm = (dff.iloc[1] == 'mm').tolist()
        mm_positions = [pos for pos, flag in enumerate(is_mm) if flag]
        dff = dff.iloc[:, [0, 1] + mm_positions]

        # 4. Build the new column names: the first two keep their label from
        #    the label row, every other column becomes "<name>_<label>".
        group_names = [
            header.split(':')[1]
            for header in dff.columns
            if not header.startswith('Unnamed')
        ]

        labels = dff.iloc[0]
        coord_labels = labels.iloc[2:]

        # Each header name is shared by three consecutive columns; fail with
        # a clear message when there are not enough names to go around.
        if len(coord_labels) > 3 * len(group_names):
            raise ValueError(
                f"{target_path}: {len(coord_labels)} 'mm' columns but only "
                f"{len(group_names)} named headers"
            )

        # .iloc makes the positional access explicit; plain integer keys on a
        # label-indexed Series are deprecated in recent pandas.
        columns = [labels.iloc[0], labels.iloc[1]]
        columns += [
            group_names[idx // 3] + '_' + label
            for idx, label in enumerate(coord_labels)
        ]

        # Drop the label row and the unit row, then renumber the data rows.
        dff = dff.iloc[2:].reset_index(drop=True)
        dff.columns = columns

        # Use the same joined path for creating and for writing, so the
        # result does not depend on DATASETPATH ending with a separator.
        out_dir = os.path.join(DATASETPATH, ptype)
        os.makedirs(out_dir, exist_ok=True)

        dff.to_csv(os.path.join(out_dir, 'PREP_' + target_file),
                   encoding='utf-8', index=False)

    if DEBUG:
        print("[ * ] runPrepData")

    # normpath removes a trailing separator, so "raw/PD/" still yields "PD".
    ptype = os.path.basename(os.path.normpath(rawdatapath))

    for f in sorted(os.listdir(rawdatapath)):
        if f.endswith('.csv'):
            prepData(os.path.join(rawdatapath, f), f, ptype)

In [5]:
# Preprocess both groups; the output sub-folder is named after the last
# component of each raw path ("Controls" and "PD").
generatePrepData(RAW_CONTROL)
generatePrepData(RAW_PD)

In [6]:
# Check the preprocessed data: count the distinct identifiers per group.
# Preprocessed files are named "PREP_<raw file name>", so the identifier is
# the second "_"-separated token of the file *name*.  Splitting the basename
# (instead of the whole path) keeps the result independent of underscores
# that happen to appear in directory names.

files = glob.glob(os.path.join(DATA_CONTROL, "*.csv"))
fnames = {os.path.basename(f).split('_')[1] for f in files}
print('Controls:', len(fnames))

files = glob.glob(os.path.join(DATA_PD, "*.csv"))
fnames = {os.path.basename(f).split('_')[1] for f in files}
print("      PD:", len(fnames))

Controls: 22
      PD: 83


## 3. 환자별 데이터 테이블 생성

In [7]:
import os
import glob

# Preprocessed-data root and its per-group sub-folders (same values as the
# constants declared for the preprocessing step).
DATASETPATH = './dataset_/'
DATA_CONTROL, DATA_PD = (
    os.path.join(DATASETPATH, category) for category in ('Controls', 'PD')
)

In [10]:
def generatePatientsInfoTable():
    """Write ``patients.csv`` with one row per patient and FW/BW file counts.

    The CSV files in ``DATA_CONTROL`` and ``DATA_PD`` are scanned.  File names
    are split on ``_``: the second token is used as the patient identifier
    and the third token decides whether the file counts as ``FW`` or ``BW``.
    Rows are sorted by patient within each category ("Controls" first, then
    "PD") and saved to ``<DATASETPATH>/patients.csv`` with the columns
    ``Patient, Category, cntFW, cntBW``.
    """
    columns = ["Patient", "Category", "cntFW", "cntBW"]

    def getPatientDataInfo(target_cate):
        """Return ``[patient, category, cntFW, cntBW]`` rows for one category."""
        # Any category other than "PD" falls back to the control directory.
        target_dir = DATA_PD if target_cate == "PD" else DATA_CONTROL

        # patient -> {"FW": count, "BW": count}
        counts = {}
        for path in glob.glob(os.path.join(target_dir, "*.csv")):
            tokens = os.path.basename(path).split('_')
            if len(tokens) < 2:
                # No patient token in the name; nothing to attribute it to.
                continue

            per_patient = counts.setdefault(tokens[1], {"FW": 0, "BW": 0})

            # Match the patient token exactly.  A wildcard pattern such as
            # "*<name>_FW*" would also count files of every patient whose
            # identifier merely ends with <name>.
            direction = tokens[2][:2] if len(tokens) > 2 else ""
            if direction in per_patient:
                per_patient[direction] += 1

        return [
            [name, target_cate, counts[name]["FW"], counts[name]["BW"]]
            for name in sorted(counts)
        ]

    # DataFrame.append no longer exists in pandas 2.x; collect all rows first
    # and build the frame in a single step.
    rows = getPatientDataInfo("Controls") + getPatientDataInfo("PD")
    df = pd.DataFrame(rows, columns=columns)

    df.to_csv(os.path.join(DATASETPATH, 'patients.csv'), encoding='utf-8', index=False)

In [11]:
# Build the per-patient summary and write it to <DATASETPATH>/patients.csv.
generatePatientsInfoTable()