In [4]:
import os
import shutil
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from DataPreprocess import DataPreprocess

class makeBasis:
    def __init__(self):
        self.path = os.getcwd() 
        self.region_list = [ 'bundang', 'gangnam', 'hongdae', 'jamsil','bucheon', 'busan','incheon']
        self.path2 =os.path.abspath(os.path.join(self.path, "..", ".."))
        self.newPath = os.path.join(f'{self.path2}\\MergeRegionData')

    def make_dir(self):
        os.path.abspath(os.path.join(self.path, "..", ".."))
        self.newPath = os.path.join(f'{self.path2}\\MergeRegionData')
        if not os.path.exists(self.newPath):
            os.makedirs(os.path.join(f'{self.path2}\\MergeRegionData'))
        else:
            print('이미 존재합니다.')

    def del_dir(self):
        self.newPath = os.path.join(f'{self.path2}\\MergeRegionData')
        # 'PreprocessData' 폴더가 존재하는지 확인하고 삭제
        if os.path.exists(self.newPath) and os.path.isdir(self.newPath):
            try:
                shutil.rmtree(self.newPath)
                print(f"'{self.newPath}' 폴더가 성공적으로 삭제되었습니다.")
            except Exception as e:
                print(f"폴더를 삭제하는 중 오류가 발생했습니다: {e}")
        else:
            print(f"'{self.newPath}' 폴더가 존재하지 않습니다.")

    def data_set(self):

        self.preprocessPathList = []
        self.dataDict = {}
        for region in self.region_list:
            preprocessPath = os.path.join(f'{self.path2}\\region\\{region}\\PreprocessData\\{region}_merge.csv')
            self.preprocessPathList.append(preprocessPath)
            self.dataDict[region] = pd.read_csv(preprocessPath, encoding='utf_8', index_col=0)
            self.dataDict[region] = self.dataDict[region][~(self.dataDict[region]['ConsultTime'].isna())]

        self.fileList = self.preprocessPathList
        print('All files are added')
        return self.dataDict

    def append_region_col(self):

        for region in self.region_list:
            df = self.dataDict[region]
            df['Region'] = f'{region}'
        self.dataDict[region] = df

    def concat_data(self):
        for i, region in enumerate(self.region_list):
            df = self.dataDict[region]
            if i == 0:
                concat_df = df
            else:
                concat_df = pd.concat([concat_df, df],axis = 0, join = 'outer')
        self.concat_df = concat_df
        return self.concat_df

    def del_zero_NaN_col(self, df):
        
        # 모든 원소가 NaN이거나 0인 열 삭제
        original_columns = df.columns  # 원본 열 이름 기억
        
        df_cleaned = df.dropna(axis=1, how='all')  # NaN 값이 모두인 열 삭제
        deleted_columns = original_columns.difference(df_cleaned.columns)  # 삭제된 열 찾기
        
        df_cleaned = df_cleaned.loc[:, (df_cleaned != 0).any(axis=0)]  # 0 값이 모두인 열 삭제
        deleted_columns = deleted_columns.union(original_columns.difference(df_cleaned.columns))  # 추가 삭제된 열 찾기
               
        print("\n삭제된 열 이름:")
        print(deleted_columns)

        return df_cleaned

    # 데이터 병합 메소드
    def merge_rows(self, group): 
        result = {'PatientID': group['PatientID'].iloc[0]}
        for col in group.columns:
            if col != 'PatientID':
                # 열 이름에 접미사 추가
                for i, value in enumerate(group[col], start=1):
                    col_name = f"{col}_{i}"
                    result[col_name] = value
        return result

    #빅데이터 처리를 위한 최적화 과정 + 데이터 분할 과정 + 그룹화 과정
    def process_and_merge(self, df, batch_size=1000):

        merged_results = []
        unique_ids = df['PatientID'].unique()
        
        for start in range(0, len(unique_ids), batch_size):
            end = start + batch_size
            batch_ids = unique_ids[start:end]
            batch_df = df[df['PatientID'].isin(batch_ids)]
            
            merged_results.extend([self.merge_rows(group) for _, group in batch_df.groupby('PatientID')])
        
        return pd.DataFrame(merged_results)

    def split_and_flatten(self, df):
        
        #그룹 크기로 조
        group_sizes = df.groupby(['PatientID']).size()
        # 그룹 크기가 1,2~5,5~9.10~으로 나
        single_patient_ids = group_sizes[group_sizes == 1].index
        multiple_patient_ids1 = group_sizes[(group_sizes > 1) & (group_sizes <= 5)].index
        multiple_patient_ids2 = group_sizes[(group_sizes > 5) & (group_sizes <= 9)].index
        multiple_patient_ids3 = group_sizes[9 < group_sizes].index
        
        # 필터링된 데이터 프레임
        df_1 = df[df['PatientID'].isin(single_patient_ids)]
        df_5 = df[df['PatientID'].isin(multiple_patient_ids1)]
        df_10 = df[df['PatientID'].isin(multiple_patient_ids2)]
        df_50 = df[df['PatientID'].isin(multiple_patient_ids3)]

        result_df1 = df_1
        cols = result_df1.columns.tolist()
        new_cols = ['PatientID']+ [col + '_1' for col in cols[1:]]
        result_df1.columns = new_cols

        df_list = [df_5, df_10, df_50]
        df_name = ['result_df5', 'result_df10', 'result_df50']
        flatten_dict = {'result_df1': result_df1}
        for key, data in zip(df_name, df_list):
            flatten_dict[key] = self.process_and_merge(data)
            length = max(data.groupby('PatientID').size()) + 1
            columns = flatten_dict[key].columns
            new_columns = ['PatientID']
            if length >9:
                for num in range(1, 10):
                    imsi_list = [column for column in columns if column[-2:] == '_'+str(num)]
                    #print(imsi_list)
                    new_columns = new_columns + imsi_list
                for num in range(10, length):
                    imsi_list = [column for column in columns if column[-2:] == str(num)]
                    new_columns = new_columns + imsi_list
            else:
                for num in range(1, length):
                    imsi_list = [column for column in columns if column[-2:] == '_'+str(num)]
                    new_columns = new_columns + imsi_list

            flatten_dict[key] = flatten_dict[key][new_columns]
            print(f'{key} 변환 완료')

        self.flatten_dict = flatten_dict
            
        byPatientID = \
        pd.concat([result_df1, flatten_dict['result_df5'], flatten_dict['result_df10'], flatten_dict['result_df50']], axis = 0, join ='outer')
        return byPatientID

        #후처리
        #self.byPatientID = byPatientID.drop_duplicates(subset=['PatientID'], keep='first')

    def save_csv(self, df, name):
        df.to_csv(f'{self.newPath}\\{name}.csv',index = False)

    def load_csv(self):
        self.byPatientID = pd.read_csv(f'{self.newPath}\\byPatientID.csv', encoding ='utf-8')


    def make_personal(self):
        drop_list = ['PatientCellphone', 'PatientPhone','PatientCellphone2','PatientPhone2','GuardianNoID','Guardian', 'GuardianCellphone',
             'GuardianPhone', 'PatientEmail', 'PatientWeb','PatientMajAreaCC','PatientJob','PatientZipCode2', 'PatientAddr21', 'PatientAddr22',
             'PatientZipCode3', 'PatientAddr31', 'PatientAddr32','PatientNickName','PatientIntroRel','Discarded', 'UnpaidAmt','LunarBirthday',
             'PhoneMemo','PhoneMemo2', 'FtFlag', 'Nation', 'PatientID_ORG', 'Guest','Encrypted']
        personal_dict ={}
        for region in self.region_list:
            personal_df = pd.read_csv(f'{self.path2}\\personal_data\\{region}.csv', encoding ='utf-8')
            personal_df = personal_df.drop(columns=drop_list)
            personal_df['Region_1'] = region
            
            personal_dict[region] = personal_df
        
        for region in self.region_list:
            df = personal_dict[region]
            if region == self.region_list[0]:
                concat_df = df
            else:
                concat_df = pd.concat([concat_df,df])
        self.personal_df = concat_df
        return self.personal_df

    def merge_personal(self,all_personal_df, byPatientID):
        df = pd.merge(all_personal_df, byPatientID, on = ['PatientID','Region_1'], how = 'inner')
        return df

    def make_age(self, df):
        df['PatientJN'] = df['PatientJN'].astype(str)
        df['PatientFirstDate'] = df['PatientFirstDate'].astype(str)
        def make_birthday(x):
            li = x[:-3]
            if len(li) == 6:
                if int(li[0]) >2:
                    li = '19'+ li
                else:
                    li = '20' + li
            elif len(li) == 5:
                li = '200'+li
            elif len(li) == 4:
                li = '2000'+ li
            elif len(li) == 3:
                li = '20000'+ li
            return li
        df['PatientJN']=df['PatientJN'].apply(make_birthday)
        df['PatientJN'] = pd.to_datetime(df['PatientJN'], format='%Y%m%d', errors='coerce')
        df['PatientFirstDate'] = pd.to_datetime(df['PatientFirstDate'], format='%Y%m%d', errors='coerce')
        from datetime import timedelta
        def convert_to_years(td):
            if pd.isnull(td):
                return pd.NA
            else:
                years = td.days / 365.25
                return int(years)
        df['Age'] = (df['PatientFirstDate'] - df['PatientJN'])
        df['Age'] = df['Age'].apply(convert_to_years)
        return df

    def columns_extraction(self, df):
        info_df = df[['Region_1','PatientID','PatientChartNo','PatientName','PatientAddr11','PatientFirstDate','Age','PatientSex','Description']]
        info_df = info_df.rename(columns = {'Region_1':'Region'})
        #필요한 컬럼들 불러오는 작
        col_need = ['Date', 'ProgressNote', 'MedicineName','Memo','Weight','SMM','BodyFatMass','BMI','MaxVital','MinVital','Pulse','PBF','SoftLeanMass',
            'Height','InterCellWater','ExtraCellWater','TotalBodyWater','ProteinMass','MineralMass','FatFreeMass','Osseus','ECW_TBW','ECF_TBF','VFA','WHR',
            'WeightControl','FatControl','MuscleControl','BMR','FitnessScore']
        df_dict= {}
        for i in range(1,28):
            col_need_i = [col+'_'+str(i) for col in col_need]
            #print(col_need_i)
            df_dict[i] = df[col_need_i]
        #basisAll 원래 컬럼 이름으로 맞추는 작업
        col_need2 = ['Date', 'ProgressNote', 'MedicineName','Memo','체중','골격근량','체지방량','BMI','혈압(고)','혈압(저)','맥박수','체지방률','근육량',
            '키','InterCellWater','ExtraCellWater','TotalBodyWater','ProteinMass','MineralMass','FatFreeMass','Osseus','ECW_TBW','ECF_TBF','VFA','WHR',
            'WeightControl','FatControl','MuscleControl','BMR','FitnessScore']
        for i in range(1,28):
            df_i = df_dict[i]
            df_i = df_i.set_axis(col_need2, axis = 1)
            if i == 1:
                basis = df_i
            else:
                basis = pd.concat([basis, df_i], axis = 1)
        basis = pd.concat([info_df, basis], axis = 1)
        return basis
   
    def filter_rename(self, df):
        info_df = df[['Region','PatientID','PatientChartNo','PatientName','PatientAddr11','PatientFirstDate','Age','PatientSex','Description']]
        info_df = info_df.rename(columns = {'Region_1':'Region'})
        #필요한 컬럼들 불러오는 작
        col_need = ['Date', 'ProgressNote', 'MedicineName','Memo','Weight','SMM','BodyFatMass','BMI','MaxVital','MinVital','Pulse','PBF','SoftLeanMass',
            'Height','InterCellWater','ExtraCellWater','TotalBodyWater','ProteinMass','MineralMass','FatFreeMass','Osseus','ECW_TBW','ECF_TBF','VFA','WHR',
            'WeightControl','FatControl','MuscleControl','BMR','FitnessScore']
        df_dict= {}
        for i in range(1,28):
            col_need_i = [col+'_'+str(i) for col in col_need]
            #print(col_need_i)
            df_dict[i] = df[col_need_i]
        #basisAll 원래 컬럼 이름으로 맞추는 작업
        col_need2 = ['Date', 'ProgressNote', 'MedicineName','Memo','체중','골격근량','체지방량','BMI','혈압(고)','혈압(저)','맥박수','체지방률','근육량',
            '키','InterCellWater','ExtraCellWater','TotalBodyWater','ProteinMass','MineralMass','FatFreeMass','Osseus','ECW_TBW','ECF_TBF','VFA','WHR',
            'WeightControl','FatControl','MuscleControl','BMR','FitnessScore']
        for i in range(1,28):
            df_i = df_dict[i]
            df_i = df_i.set_axis(col_need2, axis = 1)
            if i == 1:
                basis = df_i
            else:
                basis = pd.concat([basis, df_i], axis = 1)
        basis = pd.concat([info_df, basis], axis = 1)
        return basis
        
    def make_basis_for_filter(self, df):
        info_df = df[['Region_1','PatientID','PatientChartNo','PatientName','PatientAddr11','PatientFirstDate','Age','PatientSex','Description']]
        info_df = info_df.rename(columns = {'Region_1':'Region'})
        #필요한 컬럼들 불러오는 작
        col_need = ['Date', 'ProgressNote', 'MedicineName','Memo','Weight','SMM','BodyFatMass','BMI','MaxVital','MinVital','Pulse','PBF','SoftLeanMass',
            'Height','InterCellWater','ExtraCellWater','TotalBodyWater','ProteinMass','MineralMass','FatFreeMass','Osseus','ECW_TBW','ECF_TBF','VFA','WHR',
            'WeightControl','FatControl','MuscleControl','BMR','FitnessScore']
        df_dict= {}
        for i in range(1,28):
            col_need_i = [col+'_'+str(i) for col in col_need]
            #print(col_need_i)
            df_dict[i] = df[col_need_i]
        for i in range(1,28):
            df_i = df_dict[i]
            if i == 1:
                basis = df_i
            else:
                basis = pd.concat([basis, df_i], axis = 1)
        basis = pd.concat([info_df, basis], axis = 1)
        return basis

    def change_MedicineName(self, basis):
        MedicineName_list = [col for col in basis.columns.tolist() if 'MedicineName' in col]
        Memo_list = [col for col in basis.columns.tolist() if 'Memo' in col]
        nan = ''
        for i in range(1,len(MedicineName_list)):
            print(MedicineName_list[i-1])
            #print(i,basis[Memo_list[i-1]])
            for k, med_list in enumerate(basis[MedicineName_list[i-1]]):
                new_med=''
                new_memo=''
                if pd.isna(med_list):
                    continue
                if ''== med_list:
                    continue

                for j, med in enumerate(eval(med_list)): # 감비와 비움만을 남기고 양식에서 대괄호와 ''을 지움
                    if 'Gambi' in med:
                        if new_med == '' :
                            new_med = new_med + med
                            new_memo = new_memo + eval(basis.loc[k,Memo_list[i-1]])[j]
        
                        else:
                            new_med = new_med +', ' + med
                            new_memo = new_memo +', '+ eval(basis.loc[k,Memo_list[i-1]])[j]
        
                    elif '비움' in med:
                        if new_med == '' :
                            new_med = new_med + med
                            new_memo = new_memo + eval(basis.loc[k,Memo_list[i-1]])[j]
        
                        else:
                            new_med = new_med +', ' + med
                            new_memo = new_memo +', '+ eval(basis.loc[k,Memo_list[i-1]])[j]
                        
                basis.loc[k,MedicineName_list[i-1]] = new_med
                basis.loc[k,Memo_list[i-1]] = new_memo
            #print(i,basis[Memo_list[i-1]])
        return basis

        
    

In [5]:
# Create the pipeline helper; its directories are derived from the current working directory.
basis = makeBasis()

In [6]:
# Ensure the output folder exists, load each region's merged CSV,
# then tag every regional frame with a 'Region' column.
basis.make_dir()
basis.data_set()
basis.append_region_col()

이미 존재합니다.


  self.dataDict[region] = pd.read_csv(preprocessPath, encoding='utf_8', index_col=0)
  self.dataDict[region] = pd.read_csv(preprocessPath, encoding='utf_8', index_col=0)
  self.dataDict[region] = pd.read_csv(preprocessPath, encoding='utf_8', index_col=0)
  self.dataDict[region] = pd.read_csv(preprocessPath, encoding='utf_8', index_col=0)
  self.dataDict[region] = pd.read_csv(preprocessPath, encoding='utf_8', index_col=0)
  self.dataDict[region] = pd.read_csv(preprocessPath, encoding='utf_8', index_col=0)


All files are added


  self.dataDict[region] = pd.read_csv(preprocessPath, encoding='utf_8', index_col=0)


In [7]:
# Replace each region's visit-level frame with its one-row-per-patient version.
for region in basis.region_list:

    print(region)
    basis.dataDict[region] = basis.split_and_flatten(basis.dataDict[region])

# Stack all regions, then drop the columns that are entirely NaN or entirely zero.
basis.concat_data()
byPatientID = basis.del_zero_NaN_col(basis.concat_df)

bundang
result_df5 변환 완료
result_df10 변환 완료
result_df50 변환 완료
gangnam
result_df5 변환 완료
result_df10 변환 완료
result_df50 변환 완료
hongdae
result_df5 변환 완료
result_df10 변환 완료
result_df50 변환 완료
jamsil
result_df5 변환 완료
result_df10 변환 완료
result_df50 변환 완료
bucheon
result_df5 변환 완료
result_df10 변환 완료
result_df50 변환 완료
busan
result_df5 변환 완료
result_df10 변환 완료
result_df50 변환 완료
incheon
result_df5 변환 완료
result_df10 변환 완료
result_df50 변환 완료

삭제된 열 이름:
Index(['1000_LA_28', '1000_LA_32', '1000_LA_33', '1000_LA_34', '1000_LA_35',
       '1000_LA_36', '1000_LA_37', '1000_LA_38', '1000_LA_39', '1000_LA_40',
       ...
       'neck_Muscle_39', 'neck_Muscle_40', 'neck_Muscle_41', 'neck_Muscle_42',
       'neck_Muscle_43', 'neck_Muscle_44', 'neck_Muscle_45', 'neck_Muscle_46',
       'neck_Muscle_47', 'neck_Muscle_48'],
      dtype='object', length=4086)


In [5]:
# Load and combine the per-region personal-data CSV files.
per_df = basis.make_personal()

  personal_df = pd.read_csv(f'{self.path2}\\personal_data\\{region}.csv', encoding ='utf-8')
  personal_df = pd.read_csv(f'{self.path2}\\personal_data\\{region}.csv', encoding ='utf-8')
  personal_df = pd.read_csv(f'{self.path2}\\personal_data\\{region}.csv', encoding ='utf-8')
  personal_df = pd.read_csv(f'{self.path2}\\personal_data\\{region}.csv', encoding ='utf-8')
  personal_df = pd.read_csv(f'{self.path2}\\personal_data\\{region}.csv', encoding ='utf-8')


In [6]:
# Inner-join personal data with the flattened visit data on PatientID and Region_1.
df = basis.merge_personal(per_df, byPatientID)

In [7]:
# make_age modifies df in place and returns it, so age_df and df are the same object.
age_df = basis.make_age(df)

In [8]:
# Save the merged frame (with the Age column) into the MergeRegionData folder.
basis.save_csv(df,'22.11.07~24.05.16_raw_basis')

In [55]:
# Keep patient info plus the suffixed measurement columns of visits 1-27.
filtter_df= basis.make_basis_for_filter(age_df)

In [23]:
# Work on an independent copy because change_MedicineName rewrites its argument in place.
import copy
imsi = copy.deepcopy(df)

In [24]:
# Reduce the medicine/memo columns to the 'Gambi' / '비움' entries; df1 is the same object as imsi.
df1 = basis.change_MedicineName(imsi)

MedicineName_1
1 0                ['2-1', '1', '1']
1                              NaN
2                              NaN
3                     ['2-2', '1']
4                ['1-1', '1', '1']
                    ...           
130075           ['1-1', '1', '1']
130076    ['2-1', '2-2', '1', '1']
130077                         NaN
130078                         NaN
130079                         NaN
Name: Memo_1, Length: 130080, dtype: object
MedicineName_2
2 0                     ['2-2', '1']
1                ['1', '2-1', '1']
2         ['2-1,2', '1', '1', '1']
3                              NaN
4                              NaN
                    ...           
130075                         NaN
130076                         NaN
130077                         NaN
130078                         NaN
130079                         NaN
Name: Memo_2, Length: 130080, dtype: object
MedicineName_3
3 0         ['2-1,2', '1', '1']
1                ['2-2', '1']
2                       ['1']
3

In [56]:
# NOTE(review): this saves `df`, not the `filtter_df` produced by
# make_basis_for_filter in the preceding cell — confirm which frame
# 'basis_for_filter' is meant to contain.
basis.save_csv(df,'basis_for_filter')

In [43]:
# Select info + 27 visits of measurements with display names; note this rebinds `df`.
df = basis.filter_rename(imsi)

In [47]:
# Save the renamed, filtered frame into the MergeRegionData folder.
basis.save_csv(df,'22.11.07~24.05.16_basisAll')

In [12]:
# Personal-data columns to discard; the same names as drop_list inside makeBasis.make_personal.
drop_list = ['PatientCellphone', 'PatientPhone','PatientCellphone2','PatientPhone2','GuardianNoID','Guardian',
 'GuardianCellphone',
 'GuardianPhone',
 'PatientEmail',
 'PatientWeb','PatientMajAreaCC','PatientJob','PatientZipCode2',
 'PatientAddr21',
 'PatientAddr22',
 'PatientZipCode3',
 'PatientAddr31',
 'PatientAddr32','PatientNickName','PatientIntroRel','Discarded',
 'UnpaidAmt','LunarBirthday','PhoneMemo',
 'PhoneMemo2',
 'FtFlag',
 'Nation',
 'PatientID_ORG',
 'Guest','Encrypted']

In [10]:
# Cell-level version of the loading loop in makeBasis.make_personal:
# read each region's personal CSV, drop the unwanted columns, tag the region.
personal_dict ={}
for region in basis.region_list:
    personal_df = pd.read_csv(f'{basis.path2}\\personal_data\\{region}.csv', encoding ='utf-8')
    personal_df = personal_df.drop(columns=drop_list)
    personal_df['Region_1'] = region

    personal_dict[region] = personal_df

  personal_df = pd.read_csv(f'{basis.path2}\\personal_data\\{region}.csv', encoding ='utf-8')
  personal_df = pd.read_csv(f'{basis.path2}\\personal_data\\{region}.csv', encoding ='utf-8')
  personal_df = pd.read_csv(f'{basis.path2}\\personal_data\\{region}.csv', encoding ='utf-8')
  personal_df = pd.read_csv(f'{basis.path2}\\personal_data\\{region}.csv', encoding ='utf-8')
  personal_df = pd.read_csv(f'{basis.path2}\\personal_data\\{region}.csv', encoding ='utf-8')


In [13]:
# Stack the regional personal frames row-wise, in region_list order.
for region in basis.region_list:
    df = personal_dict[region]
    if region == basis.region_list[0]:
        concat_df = df
    else:
        concat_df = pd.concat([concat_df,df])
all_personal_df = concat_df

In [15]:
# Same inner join as makeBasis.merge_personal, done directly in the cell.
df = pd.merge(all_personal_df, byPatientID, on = ['PatientID','Region_1'], how = 'inner')

In [18]:
# Unlike makeBasis.save_csv, this call does not pass index=False, so the index is written as well.
df.to_csv(f'{basis.path2}\\MergeRegionData\\basis_all.csv', encoding = 'utf-8')