In [6]:
import numpy as np
import pandas as pd
import os 
import sys
import time
import matplotlib.pyplot as plt

# 벤다이어그램용
%matplotlib inline
import venn

In [7]:
class DataPreProcess:
    """Load one region's CSV files and build merged InBody / medical tables.

    Attributes are created by the methods, not all at construction time:
        path, region        -- set in ``__init__``
        newPath             -- set by ``makeDir``
        fileList, dataDict  -- set by ``dataSet``
        inbody_total_df     -- set by ``makeInbody``
        medical_df          -- set by ``makeMedical``
    """

    def __init__(self, region):
        """Store the region name and derive the directory its files are read from.

        Args:
            region: Region name. It is used as the folder name under ``region``
                and as the substring/prefix of the data file names.
        """
        # The last five characters of the working directory are cut off before
        # ``region/<region>`` is appended.
        # NOTE(review): presumably this steps out of a 4-letter notebook folder
        # (separator + name) -- confirm against the project layout.
        # os.sep yields the same string as the former hard-coded backslashes on
        # Windows while also producing a usable path elsewhere.
        self.path = os.sep.join([os.getcwd()[:-5], 'region', f'{region}'])
        self.region = region

    def makeDir(self):
        """Create ``<self.path>/PreprocessData`` unless it already exists.

        The directory path is stored in ``self.newPath`` either way.
        """
        self.newPath = os.path.join(self.path, 'PreprocessData')

        if not os.path.exists(self.newPath):
            os.makedirs(self.newPath)
        else:
            print('이미 존재합니다.')

    def dataSet(self):
        """Read every listed file whose name contains the region name.

        Files are read as UTF-16 CSV with the first column as index.

        Returns:
            dict: file name -> DataFrame (the same object as ``self.dataDict``).
        """
        # The first directory entry is skipped, then names are filtered by region.
        # NOTE(review): an older note in this class says a '0_' prefix was used so
        # that a commonPatientID directory lists first; the [1:] slice presumably
        # skips that entry. os.listdir documents its order as arbitrary, so this
        # can drop a real data file if no such entry exists -- confirm.
        listed = os.listdir(self.path)[1:]
        self.fileList = [name for name in listed if self.region in name]

        self.dataDict = {}
        for name in self.fileList:
            self.dataDict[name] = pd.read_csv(
                os.path.join(self.path, name), encoding='utf-16', index_col=0
            )
        print('All files are added')
        return self.dataDict

    def Tmedication(self):
        """Attach PatientID and ConsultTime to the medication table.

        The medication table is inner-joined with the medical-record table on
        ``MedicalRecordID`` and the result replaces the medication entry in
        ``self.dataDict``. Calling the method again is a no-op once both columns
        are present, so re-running a cell does not create ``_x``/``_y`` columns.
        """
        medication_key = f'{self.region}_tmedication.csv'
        record_key = f'{self.region}_tmedicalrecord.csv'
        medication = self.dataDict[medication_key]

        # Already enriched by an earlier call: merging again would suffix the
        # overlapping columns and break later ['PatientID'] look-ups.
        if {'PatientID', 'ConsultTime'}.issubset(medication.columns):
            return

        # Column selection follows the 2024 version of the data.
        record_keys = self.dataDict[record_key][['MedicalRecordID', 'PatientID', 'ConsultTime']]
        self.dataDict[medication_key] = pd.merge(
            record_keys, medication, how='inner', on='MedicalRecordID'
        )

    def patientChartNo(self):
        """Keep only the personal-info rows whose PatientChartNo is not null."""
        key = f'{self.region}_tpatientpersonal.csv'
        personal = self.dataDict[key]
        self.dataDict[key] = personal.loc[personal['PatientChartNo'].notnull()]

    def countUniquePatientID(self):
        """Print, per loaded table, the row count and then the distinct PatientID count."""
        for name, table in self.dataDict.items():
            patient_ids = table['PatientID']
            # First line: number of rows; second line: number of distinct IDs.
            print(f'{name} sheetPatientIDSet,{len(patient_ids)}')
            print(f'{name} sheetPatientIDSet,{len(set(patient_ids))}')

    def vennDiagram(self):
        """Draw a 5-set Venn diagram of the PatientIDs found in five tables.

        The five ID sets and the computed labels are kept on the instance
        (``inbodySet``, ``privateSet``, ``medicationSet``, ``medicalRecordSet``,
        ``vitalTempSet``, ``labels``).
        """
        def patient_ids(table):
            return set(self.dataDict[f'{self.region}_{table}.csv']['PatientID'])

        self.inbodySet        = patient_ids('tinbodyadditionaldata')
        self.privateSet       = patient_ids('tpatientpersonal')
        self.medicationSet    = patient_ids('tmedication')
        self.medicalRecordSet = patient_ids('tmedicalrecord')
        self.vitalTempSet     = patient_ids('tpatientvitaltemp')

        self.labels = venn.get_labels([self.inbodySet,self.privateSet,self.medicationSet,self.medicalRecordSet,self.vitalTempSet])
        # Apparently `fill` does not have to be passed as an argument here
        # (the pilot version did pass it).
        # NOTE(review): venn5 returns its own fig/ax, so the figure opened on the
        # next line may stay empty -- confirm before removing it.
        plt.figure(figsize=(12,8))
        fig,ax = venn.venn5(self.labels,names = ['inbody','private','medication','medicalRecord','vitalTemp'])
        plt.title(f'{self.region}')
        plt.show()
        plt.close()

    def _load_sorted_inbody(self, name):
        """Return table ``name`` sorted by PatientID and MeasureDate with a fresh index.

        Side effect kept from the original code: the ``MeasureDate`` column of the
        frame stored in ``self.dataDict`` is converted to datetime in place.
        """
        table = self.dataDict[name]
        table['MeasureDate'] = pd.to_datetime(table['MeasureDate'], format='%Y%m%d%H%M%S')
        return table.sort_values(by=['PatientID', 'MeasureDate']).reset_index(drop=True)

    @staticmethod
    def _pivot_row_groups(df, value_columns, prefixes):
        """Turn consecutive groups of ``len(prefixes)`` rows into one wide row each.

        Row ``k`` of every group contributes the columns
        ``prefixes[k] + column`` for each ``column`` in ``value_columns``.
        ``PatientID`` and ``MeasureDate`` are taken from the first row of the
        group. A trailing incomplete group is discarded; with no complete group
        an empty frame carrying the expected columns is returned.
        """
        group_size = len(prefixes)
        n_groups = len(df) // group_size
        usable = df.iloc[:n_groups * group_size]

        first_rows = usable.iloc[::group_size]
        wide = {
            'PatientID': first_rows['PatientID'].to_numpy(),
            'MeasureDate': first_rows['MeasureDate'].to_numpy(),
        }
        for offset, prefix in enumerate(prefixes):
            rows_at_offset = usable.iloc[offset::group_size]
            for column in value_columns:
                wide[prefix + column] = rows_at_offset[column].to_numpy()
        # Built once from column arrays instead of concatenating one-row frames.
        return pd.DataFrame(wide)

    def makeInbody(self):
        """Merge the InBody tables into one wide frame.

        Steps: inner-merge the regular InBody tables on their shared columns,
        append the flattened measurement and impedence tables column-wise, then
        inner-merge the obesity-diagnosis table on MeasureDate and PatientID.

        Returns:
            DataFrame: the merged table (also stored as ``self.inbody_total_df``).

        Raises:
            ValueError: if one of the three specially handled files is not among
                the first 14 entries of ``self.fileList``.
        """
        region = self.region
        impedence_name = f'{region}_tinbodyimpedence.csv'
        measurement_name = f'{region}_tinbodymeasurement.csv'
        obesity_name = f'{region}_tinbodyobesitydiagnosis.csv'

        # Build the file list and take out the files that need special handling:
        # impedence and measurement have to be flattened first, and obesity
        # caused an index error(?) in the generic merge.
        # The slice is a copy, so self.fileList itself is not modified.
        # NOTE(review): assumes the first 14 entries of fileList are exactly the
        # InBody files -- this depends on the directory listing order; confirm.
        inbody_file_names = self.fileList[0:14]
        for special_name in (impedence_name, measurement_name, obesity_name):
            inbody_file_names.remove(special_name)

        for i, name in enumerate(inbody_file_names):
            df = self._load_sorted_inbody(name)
            print()
            print(i, name, len(df))
            if i == 0:
                inbody_total_df = df
            else:
                df = df.drop(columns=['ReadingID'])
                # Join on every column the two frames share, in a stable order.
                common_feature = [col for col in inbody_total_df.columns if col in df.columns]
                print(common_feature)
                inbody_total_df = pd.merge(inbody_total_df, df, on=common_feature)
                print(len(inbody_total_df))
        print(inbody_total_df.shape)
        print(inbody_total_df.head())

        # Measurement part: columns 1..11 are kept; within that slice everything
        # from position 3 on is a value column. Eight consecutive rows form one
        # record, one row per body-part prefix below.
        df = self._load_sorted_inbody(measurement_name)
        df = df.iloc[:, 1:12]
        feature_names = list(df.columns[3:])
        part_prefixes = ['neck_', 'chest_', 'abdomen_', 'hip_', 'Larm_', 'Rarm_', 'Lleg_', 'Rleg_']
        mesurment_df = self._pivot_row_groups(df, feature_names, part_prefixes)
        print(mesurment_df.shape)
        print(mesurment_df.head())

        # Append column-wise, without the two key columns.
        # NOTE(review): this aligns rows purely by position; it is only correct if
        # both frames hold the same records in the same order -- confirm.
        inbody_total_df = pd.concat([inbody_total_df, mesurment_df.iloc[:, 2:]], axis=1)

        # Impedence part: columns 4..8 are the value columns. Six consecutive
        # rows form one record, one row per frequency prefix below.
        df = self._load_sorted_inbody(impedence_name)
        feature_names = list(df.columns[4:9])
        frequency_prefixes = ['1_', '5_', '50_', '250_', '500_', '1000_']
        impedence_df = self._pivot_row_groups(df, feature_names, frequency_prefixes)
        print(impedence_df.shape)
        print(impedence_df.head())
        inbody_total_df = pd.concat([inbody_total_df, impedence_df.iloc[:, 2:]], axis=1)

        # Obesity-diagnosis part: drop the listed columns, then join on the keys.
        df = self._load_sorted_inbody(obesity_name)
        df = df.drop(columns=['ReadingID', 'PSMM', 'PWeight', 'Weight', 'ReadingID_ORG'])

        inbody_total_df = pd.merge(inbody_total_df, df, on=['MeasureDate', 'PatientID'])
        print(inbody_total_df.shape)
        print(inbody_total_df.head())

        self.inbody_total_df = inbody_total_df
        return self.inbody_total_df

    def makeMedical(self):
        """Combine medical records with one aggregated medication row per record.

        For every ``MedicalRecordID`` the medication table is collapsed so that
        ``MedicineName``, ``MedicineCode`` and ``Memo`` become lists and all other
        columns keep their first value per group. The result is outer-joined to
        the medical-record table on ``MedicalRecordID``.

        Side effect: ``ConsultTime`` of the medical-record frame stored in
        ``self.dataDict`` is converted to datetime in place.

        Returns:
            DataFrame: the merged table (also stored as ``self.medical_df``).
        """
        # Load the source tables.
        medrec_df = self.dataDict[f'{self.region}_tmedicalrecord.csv']
        medication_df = self.dataDict[f'{self.region}_tmedication.csv']

        # Normalise the time column and order the records.
        medrec_df['ConsultTime'] = pd.to_datetime(medrec_df['ConsultTime'], format='%Y%m%d%H%M%S')
        medrec_df = medrec_df.sort_values(by=['PatientID', 'ConsultTime']).reset_index(drop=True)

        # Main variables: collected into one list per medical record.
        list_columns = ['MedicineName', 'MedicineCode', 'Memo']
        medication_group = medication_df.groupby('MedicalRecordID')
        collected = pd.concat(
            [medication_group[column].apply(list) for column in list_columns], axis=1
        )

        # Remaining variables: first value per medical record.
        remaining = medication_group.first().drop(columns=list_columns)

        # Both frames are indexed by the same group keys, so an index join lines
        # them up; reset_index turns MedicalRecordID back into a column.
        medication_df = collected.join(remaining).reset_index()
        self.medical_df = pd.merge(medrec_df, medication_df, on='MedicalRecordID', how='outer')

        return self.medical_df

In [8]:
# Build the preprocessor for the 'gangnam' region and load its CSV files.
gangnam = DataPreProcess(region = 'gangnam')
# dataSet() returns the dict of loaded frames; as the last expression of the cell
# the whole dict is echoed below.
# NOTE(review): append ';' or assign the result if that dump is not wanted.
gangnam.dataSet()

  self.dataDict[i] = pd.read_csv(self.path+'\\{}'.format(i), encoding = 'utf-16', index_col = 0)
  self.dataDict[i] = pd.read_csv(self.path+'\\{}'.format(i), encoding = 'utf-16', index_col = 0)
  self.dataDict[i] = pd.read_csv(self.path+'\\{}'.format(i), encoding = 'utf-16', index_col = 0)


All files are added


{'gangnam_tinbodyadditionaldata.csv':       ReadingID  PatientID     MeasureDate  ObesityDegree   BCM   BMR   BMC  \
 0        466416     267981  20221107100731            111  24.8  1195  2.27   
 1        466421     270215  20221107100823            118  33.4  1493  3.11   
 2        466426     225340  20221107101952            110  32.3  1447  2.93   
 3        466431     270211  20221107102248             93  27.8  1297  2.60   
 4        466436     266891  20221107104024            146  51.2  2062  4.57   
 ...         ...        ...             ...            ...   ...   ...   ...   
 9285     515706     312921  20240516182258            105  24.5  1190  2.28   
 9286     515711     312871  20240516184605            134  28.7  1332  2.66   
 9287     515716     312870  20240516184958            122  36.9  1594  3.11   
 9288     515721     309901  20240516185315            107  25.3  1216  2.36   
 9289     515726     312920  20240516190402            124  27.6  1292  2.53   
 
 

In [9]:
# Shortcut to the vital-sign table of the loaded region.
vital_df = gangnam.dataDict['gangnam_tpatientvitaltemp.csv']

In [11]:
# Column dtypes and non-null counts of the vital-sign table.
vital_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 107119 entries, 0 to 107118
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   VitalID      107119 non-null  int64  
 1   PatientID    107119 non-null  int64  
 2   CheckDate    107119 non-null  int64  
 3   Code         97882 non-null   float64
 4   VitalValue   107119 non-null  float64
 5   CrTime       107119 non-null  int64  
 6   VitalValue2  9237 non-null    object 
 7   VitalID_ORG  0 non-null       float64
dtypes: float64(3), int64(4), object(1)
memory usage: 7.4+ MB


In [10]:
# PatientID column of the vital-sign table (one entry per row, so IDs repeat).
li1 = vital_df['PatientID']

In [11]:
# Number of rows versus number of distinct PatientID values.
len(li1), len(set(li1))

(107119, 6021)

In [63]:
# Show the first 12 rows of the vital-sign table.
vital_df.head(12)

Unnamed: 0,VitalID,PatientID,CheckDate,Code,VitalValue,CrTime,VitalValue2,VitalID_ORG
0,4657851,270215,202211070005,3.0,117.0,20221107100623,,
1,4657856,270215,202211070005,4.0,76.0,20221107100623,,
2,4657861,270215,202211070005,6.0,101.0,20221107100623,,
3,4657866,270215,202211070005,1.0,0.0,20221107100623,,
4,4657871,270215,202211070005,2.0,0.0,20221107100623,,
5,4657876,270215,202211070005,11.0,0.0,20221107100623,,
6,4657881,270215,202211070005,15.0,0.0,20221107100623,,
7,4657886,270215,202211070005,5.0,0.0,20221107100623,,
8,4657891,270215,202211070005,9.0,0.0,20221107100623,,
9,4657896,270215,202211070005,8.0,0.0,20221107100623,,


In [13]:
# Display the rows ordered by PatientID; the result is not assigned,
# so vital_df itself keeps its original order.
vital_df.sort_values(by = 'PatientID')

Unnamed: 0,VitalID,PatientID,CheckDate,Code,VitalValue,CrTime,VitalValue2,VitalID_ORG
57081,4938041,0,202309111840,,0.0,20230911185023,1:151.2|2:53.9|5:23.6|7:32.8|9:35.4|11:18.4|12...,
87897,5090471,0,202401231220,,0.0,20240123122620,1:165.3|2:47.8|5:17.5|7:33.3|9:26.0|11:18.8|12...,
69607,5000381,0,202310281150,,0.0,20231028115528,1:163.8|2:60.0|5:22.4|7:40.6|9:28.1|11:23.3|12...,
101226,5156306,0,202404111550,,0.0,20240411155547,1:160.0|2:71.5|5:27.9|7:36.1|9:46.2|11:20.2|12...,
40326,4855866,0,202306241030,,0.0,20230624103255,1:167.6|2:71.2|5:25.3|7:45.5|9:32.1|11:26.5|12...,
...,...,...,...,...,...,...,...,...
107050,5185175,312921,202405160020,11.0,0.0,20240516182538,,
107051,5185180,312921,202405160020,15.0,0.0,20240516182538,,
107052,5185185,312921,202405160020,5.0,0.0,20240516182538,,
107053,5185190,312921,202405160020,9.0,0.0,20240516182538,,


In [15]:
# Number of vital-sign rows per PatientID.
vital_df.groupby('PatientID').size()

PatientID
0         37
8         11
271       11
447       32
451       22
          ..
312870    11
312871    11
312891    11
312920    11
312921    11
Length: 6021, dtype: int64

In [29]:
# Keep only the rows that carry a packed VitalValue2 string.
# The table has no 'VitalValue1' column (see the info() output), and the
# following cells read `vv2_df`, so both the column and the variable name
# are corrected here.
vv2_df = vital_df.dropna(subset = ['VitalValue2'])

In [33]:
# Plain Python list of the packed VitalValue2 strings.
vv2_list = vv2_df['VitalValue2'].tolist()

In [36]:
# Take the first packed VitalValue2 string as a worked example and display it.
vv2_sample = vv2_list[0]
vv2_sample

'1:159.5|2:59.4|5:23.3|7:35.9|9:35.7|11:20.6|12:21.2|14:0.382|15:21.2'

In [58]:
# Split the packed string on '|' into its 'key:value' pieces and display them.
sample_list = vv2_sample.split('|')
sample_list

['1:159.5',
 '2:59.4',
 '5:23.3',
 '7:35.9',
 '9:35.7',
 '11:20.6',
 '12:21.2',
 '14:0.382',
 '15:21.2']

In [61]:
# Map each key (text before ':') to its value parsed as float; every pair is split once.
{fields[0]: float(fields[1]) for fields in (pair.split(':') for pair in sample_list)}

{'1': 159.5,
 '2': 59.4,
 '5': 23.3,
 '7': 35.9,
 '9': 35.7,
 '11': 20.6,
 '12': 21.2,
 '14': 0.382,
 '15': 21.2}