In [1]:
import numpy as np
import pandas as pd
import os 
import sys
import time
import matplotlib.pyplot as plt
from DataPreprocess import DataPreprocess

class makeBasis:
    """Merge per-region patient CSV files and flatten them to one row per patient.

    The instance keeps three locations:

    * ``path``    -- the current working directory at construction time.
    * ``path2``   -- the directory two levels above ``path``.
    * ``newPath`` -- ``<path2>/MergeRegionData``, where results are saved/loaded.

    All paths are built with ``os.path.join`` so the class works on any OS
    (on Windows the resulting paths are the same as with a literal ``\\``).
    """

    # Name of the output folder that lives directly under ``path2``.
    _MERGE_DIR_NAME = 'MergeRegionData'

    # Columns removed from every per-region personal-data file in make_personal().
    _PERSONAL_DROP_COLUMNS = [
        'PatientCellphone', 'PatientPhone', 'PatientCellphone2', 'PatientPhone2',
        'GuardianNoID', 'Guardian', 'GuardianCellphone', 'GuardianPhone',
        'PatientEmail', 'PatientWeb', 'PatientMajAreaCC', 'PatientJob',
        'PatientZipCode2', 'PatientAddr21', 'PatientAddr22',
        'PatientZipCode3', 'PatientAddr31', 'PatientAddr32',
        'PatientNickName', 'PatientIntroRel', 'Discarded', 'UnpaidAmt',
        'LunarBirthday', 'PhoneMemo', 'PhoneMemo2', 'FtFlag', 'Nation',
        'PatientID_ORG', 'Guest', 'Encrypted',
    ]

    def __init__(self):
        self.path = os.getcwd()
        self.region_list = ['bundang', 'gangnam', 'hongdae', 'jamsil', 'bucheon', 'busan', 'incheon']
        self.path2 = os.path.abspath(os.path.join(self.path, "..", ".."))
        self.newPath = os.path.join(self.path2, self._MERGE_DIR_NAME)

    def make_dir(self):
        """Create the ``newPath`` folder, or report that it already exists."""
        if os.path.exists(self.newPath):
            print('이미 존재합니다.')
        else:
            os.makedirs(self.newPath)

    def del_dir(self):
        """Delete the ``MergeRegionData`` folder (and its contents) if present.

        Failures while deleting are reported on stdout, not raised.
        """
        # Local import: the file's import block does not bring in shutil, which
        # previously made every delete attempt fail with a swallowed NameError.
        import shutil

        # Recomputed from path2 (as before) so a changed path2 is honoured.
        self.newPath = os.path.join(self.path2, self._MERGE_DIR_NAME)
        if os.path.isdir(self.newPath):
            try:
                shutil.rmtree(self.newPath)
            except OSError as e:
                print(f"폴더를 삭제하는 중 오류가 발생했습니다: {e}")
            else:
                print(f"'{self.newPath}' 폴더가 성공적으로 삭제되었습니다.")
        else:
            print(f"'{self.newPath}' 폴더가 존재하지 않습니다.")

    def data_set(self):
        """Load ``<path2>/region/<region>/PreprocessData/<region>_merge.csv`` per region.

        Rows whose ``ConsultTime`` is missing are dropped.  The file paths are
        stored in ``preprocessPathList`` / ``fileList`` and the frames in
        ``dataDict`` (keyed by region name).

        Returns:
            dict mapping region name to its filtered DataFrame.
        """
        self.preprocessPathList = []
        self.dataDict = {}
        for region in self.region_list:
            preprocess_path = os.path.join(
                self.path2, 'region', region, 'PreprocessData', f'{region}_merge.csv'
            )
            self.preprocessPathList.append(preprocess_path)
            region_df = pd.read_csv(preprocess_path, encoding='utf_8', index_col=0)
            self.dataDict[region] = region_df[region_df['ConsultTime'].notna()]

        self.fileList = self.preprocessPathList
        print('All files are added')
        return self.dataDict

    def append_region_col(self):
        """Add a ``Region`` column holding the region name to every frame in ``dataDict``."""
        for region in self.region_list:
            # assign() builds a new frame, so the filtered slice produced by
            # data_set() is never written to in place; the result is stored
            # back for every region (the write-back used to sit outside the loop).
            self.dataDict[region] = self.dataDict[region].assign(Region=f'{region}')

    def concat_data(self):
        """Stack all regional frames row-wise, keeping the union of their columns.

        Returns:
            The concatenated DataFrame (also stored in ``concat_df``).

        Raises:
            ValueError: from pandas, if ``region_list`` is empty.
        """
        frames = [self.dataDict[region] for region in self.region_list]
        self.concat_df = pd.concat(frames, axis=0, join='outer')
        return self.concat_df

    def del_zero_NaN_col(self, df):
        """Drop columns that are entirely NaN, then columns that are entirely 0.

        The names of the removed columns are printed.  A column mixing 0 and
        NaN is kept, because ``NaN != 0`` evaluates to True.

        Args:
            df: DataFrame to clean (not modified).

        Returns:
            A DataFrame without the all-NaN and all-zero columns.
        """
        original_columns = df.columns

        df_cleaned = df.dropna(axis=1, how='all')
        df_cleaned = df_cleaned.loc[:, (df_cleaned != 0).any(axis=0)]

        # Everything missing from the final frame was removed by one of the two steps.
        deleted_columns = original_columns.difference(df_cleaned.columns)

        print("\n삭제된 열 이름:")
        print(deleted_columns)

        return df_cleaned

    def merge_rows(self, group):
        """Flatten all rows of one patient into a single dict.

        Every non-``PatientID`` column ``col`` contributes the keys ``col_1``,
        ``col_2``, ... in row order.

        Args:
            group: DataFrame holding the rows of a single patient.

        Returns:
            dict with ``PatientID`` plus the suffixed values.
        """
        result = {'PatientID': group['PatientID'].iloc[0]}
        for col in group.columns:
            if col == 'PatientID':
                continue
            for i, value in enumerate(group[col], start=1):
                result[f"{col}_{i}"] = value
        return result

    def process_and_merge(self, df, batch_size=1000):
        """Flatten ``df`` to one row per patient, working through the IDs in batches.

        Args:
            df: DataFrame with a ``PatientID`` column.
            batch_size: number of distinct patient IDs grouped per pass.

        Returns:
            DataFrame with one row per patient (see ``merge_rows`` for the layout).

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError('batch_size must be a positive integer')

        merged_results = []
        unique_ids = df['PatientID'].unique()

        for start in range(0, len(unique_ids), batch_size):
            batch_ids = unique_ids[start:start + batch_size]
            batch_df = df[df['PatientID'].isin(batch_ids)]
            merged_results.extend(
                self.merge_rows(group) for _, group in batch_df.groupby('PatientID')
            )

        return pd.DataFrame(merged_results)

    def split_and_flatten(self, df):
        """Flatten ``df`` to one row per patient, bucketed by rows per patient.

        Patients are split by how many rows they have (1, 2-5, 6-9, 10 or more).
        Each bucket is flattened separately and its columns are ordered visit by
        visit (``*_1`` first, then ``*_2``, ...).  The per-bucket results are
        kept in ``flatten_dict`` under ``result_df1``, ``result_df5``,
        ``result_df10`` and ``result_df50``.  No de-duplication is performed.

        Args:
            df: DataFrame with a ``PatientID`` column.

        Returns:
            The row-wise concatenation of the non-empty bucket results.
        """
        group_sizes = df.groupby('PatientID').size()

        single_ids = group_sizes[group_sizes == 1].index
        bucket_ids = {
            'result_df5': group_sizes[(group_sizes > 1) & (group_sizes <= 5)].index,
            'result_df10': group_sizes[(group_sizes > 5) & (group_sizes <= 9)].index,
            'result_df50': group_sizes[group_sizes > 9].index,
        }

        # Single-visit patients only need the "_1" suffix.  Renaming by name
        # (not by position) keeps this correct wherever PatientID sits.
        single_rows = df[df['PatientID'].isin(single_ids)]
        result_df1 = single_rows.rename(
            columns={col: f'{col}_1' for col in single_rows.columns if col != 'PatientID'}
        )

        flatten_dict = {'result_df1': result_df1}
        for key, ids in bucket_ids.items():
            bucket = df[df['PatientID'].isin(ids)]
            if bucket.empty:
                # Nothing to flatten; max() over an empty bucket used to raise here.
                flatten_dict[key] = pd.DataFrame(columns=['PatientID'])
            else:
                flattened = self.process_and_merge(bucket)
                max_visits = int(group_sizes.loc[ids].max())
                ordered_columns = ['PatientID']
                for num in range(1, max_visits + 1):
                    # Match the full "_<num>" suffix so any number of digits works.
                    suffix = f'_{num}'
                    ordered_columns.extend(
                        column for column in flattened.columns if column.endswith(suffix)
                    )
                flatten_dict[key] = flattened[ordered_columns]
            print(f'{key} 변환 완료')

        self.flatten_dict = flatten_dict

        non_empty = [frame for frame in flatten_dict.values() if not frame.empty]
        if not non_empty:
            return result_df1
        return pd.concat(non_empty, axis=0, join='outer')

    def save_csv(self, df, name):
        """Write ``df`` to ``<newPath>/<name>.csv`` without the index."""
        df.to_csv(os.path.join(self.newPath, f'{name}.csv'), index=False)

    def load_byPatientID(self):
        """Read ``<newPath>/byPatientID.csv`` into ``self.byPatientID``."""
        self.byPatientID = pd.read_csv(
            os.path.join(self.newPath, 'byPatientID.csv'), encoding='utf-8'
        )

    def load_basis_all(self):
        """Read ``<newPath>/basis_all.csv`` into ``self.basis_all``."""
        self.basis_all = pd.read_csv(
            os.path.join(self.newPath, 'basis_all.csv'), encoding='utf-8'
        )

    def make_personal(self):
        """Load, trim and stack the per-region personal-data CSV files.

        For each region, ``<path>/personal_data/<region>.csv`` is read, the
        columns in ``_PERSONAL_DROP_COLUMNS`` are dropped and a ``Region_1``
        column with the region name is added.

        Returns:
            The row-wise concatenation of all regions (also in ``personal_df``).

        Raises:
            KeyError: from pandas, if a file lacks one of the dropped columns.
            ValueError: from pandas, if ``region_list`` is empty.
        """
        # NOTE(review): the original body used bare `path` / `region_list` /
        # `personal_dict`, none of which exist at this scope.  `self.path` is
        # assumed for the base folder -- confirm it should not be `self.path2`.
        frames = []
        for region in self.region_list:
            csv_path = os.path.join(self.path, 'personal_data', f'{region}.csv')
            personal_df = pd.read_csv(csv_path, encoding='utf-8')
            personal_df = personal_df.drop(columns=self._PERSONAL_DROP_COLUMNS)
            personal_df['Region_1'] = region
            frames.append(personal_df)

        self.personal_df = pd.concat(frames)
        return self.personal_df


In [2]:
# Instantiate the helper class and read basis_all.csv from its MergeRegionData
# folder; the loaded table is then available as basis.basis_all.
basis = makeBasis()
basis.load_basis_all()

  self.basis_all = pd.read_csv(f'{self.newPath}\\basis_all.csv', encoding ='utf-8')


In [4]:
import copy

In [5]:
# Work on an independent deep copy so the loaded basis.basis_all stays untouched.
df = basis.basis_all.copy(deep=True)

In [9]:
# Cast PatientJN to str so make_birthday can slice it character by character.
df['PatientJN'] = df['PatientJN'].astype(str)

In [10]:
# Display the column to inspect the stringified values (e.g. '7406052.0').
df['PatientJN']

0         7406052.0
1         9409182.0
2         9305292.0
3         7809022.0
4         5710032.0
            ...    
130075    8102162.0
130076    7505012.0
130077    9102192.0
130078    8501061.0
130079    9501291.0
Name: PatientJN, Length: 130080, dtype: object

In [13]:
def make_birthday(x):
    """Convert a stringified PatientJN value into a ``YYYYMMDD``-style string.

    The last three characters of ``x`` are discarded (for the values shown in
    this notebook, e.g. ``'7406052.0'``, that leaves ``'740605'``).  What
    remains is expanded as follows:

    * 6 characters: prefixed with ``'19'`` when the first digit is greater
      than 2, otherwise with ``'20'``.
    * 3 to 5 characters: left-padded with zeros to 6 characters and prefixed
      with ``'20'``.
    * any other length: returned unchanged.

    Args:
        x: the PatientJN value as a string.

    Returns:
        The expanded date string.
    """
    digits = x[:-3]
    n_chars = len(digits)

    if n_chars == 6:
        century = '19' if int(digits[0]) > 2 else '20'
        return century + digits

    if 3 <= n_chars <= 5:
        # Shorter values are zero-filled on the left and placed in the 2000s.
        return '20' + digits.rjust(6, '0')

    return digits

In [14]:
# Replace every PatientJN string with the date string built by make_birthday.
df['PatientJN']=df['PatientJN'].apply(make_birthday)

In [15]:
# Display the column to inspect the converted date strings (e.g. '19740605').
df['PatientJN']

0         19740605
1         19940918
2         19930529
3         19780902
4         19571003
            ...   
130075    19810216
130076    19750501
130077    19910219
130078    19850106
130079    19950129
Name: PatientJN, Length: 130080, dtype: object

In [16]:
# Parse the YYYYMMDD strings into datetimes; errors='coerce' turns values that
# do not match the format into NaT instead of raising.
df['PatientJN'] = pd.to_datetime(df['PatientJN'], format='%Y%m%d', errors='coerce')

In [17]:
# Parse PatientFirstDate the same way (YYYYMMDD; unparseable values become NaT).
df['PatientFirstDate'] = pd.to_datetime(df['PatientFirstDate'], format='%Y%m%d', errors='coerce')

In [18]:
# Difference between PatientFirstDate and the parsed PatientJN date; this is
# converted to whole years by convert_to_years in a later cell.
df['Age'] = (df['PatientFirstDate'] - df['PatientJN'])

In [19]:
from datetime import timedelta
def convert_to_years(td):
    """Convert a time difference to a whole number of years.

    The day count is divided by 365.25 and truncated with ``int``.

    Args:
        td: a timedelta-like value exposing ``.days``, or a missing value.

    Returns:
        The truncated number of years as ``int``, or ``pd.NA`` when ``td`` is
        null according to ``pd.isnull``.
    """
    if not pd.isnull(td):
        return int(td.days / 365.25)
    return pd.NA

In [20]:
# Turn each Age timedelta into whole years (pd.NA where the difference is missing).
df['Age'] = df['Age'].apply(convert_to_years)

In [21]:
# Keep only the per-patient identification columns plus the computed Age.
info_df = df[['Region_1','PatientID','PatientChartNo','PatientAddr11','PatientFirstDate','Age','PatientSex','Description']]

In [22]:
# Drop the "_1" suffix from the region column name.
info_df = info_df.rename(columns = {'Region_1':'Region'})

In [23]:
# Base names of the per-visit columns to keep; the visit index is appended as
# "_<i>" when the columns are selected in the next cell.
col_need = ['Date', 'ProgressNote', 'MedicineName','Memo','Weight','SMM','BodyFatMass','BMI','MaxVital','MinVital','Pulse','PBF','SoftLeanMass',
            'Height','InterCellWater','ExtraCellWater','TotalBodyWater','ProteinMass','MineralMass','FatFreeMass','Osseus','ECW_TBW','ECF_TBF','VFA','WHR',
            'WeightControl','FatControl','MuscleControl','BMR','FitnessScore']

In [24]:
# Slice the wide table into one sub-frame per visit index (1..27), each holding
# the col_need columns that carry the matching "_<visit>" suffix.
df_dict = {}
for visit in range(1, 28):
    df_dict[visit] = df[[f'{name}_{visit}' for name in col_need]]

In [25]:
# Replacement labels applied position by position to each per-visit sub-frame;
# same length and order as col_need, with some names given in Korean.
col_need2 = ['Date', 'ProgressNote', 'MedicineName','Memo','체중','골격근량','체지방량','BMI','혈압(고)','혈압(저)','맥박수','체지방률','근육량',
            '키','InterCellWater','ExtraCellWater','TotalBodyWater','ProteinMass','MineralMass','FatFreeMass','Osseus','ECW_TBW','ECF_TBF','VFA','WHR',
            'WeightControl','FatControl','MuscleControl','BMR','FitnessScore']

In [30]:
# Relabel every per-visit sub-frame with col_need2 and glue them side by side,
# printing the running shape after each visit.
# NOTE: this rebinds `basis`, replacing the makeBasis instance created earlier;
# the resulting frame repeats the same 30 column labels once per visit.
basis = None
for visit in range(1, 28):
    relabelled = df_dict[visit].set_axis(col_need2, axis=1)
    basis = relabelled if basis is None else pd.concat([basis, relabelled], axis=1)
    print(visit, basis.shape)
basis

1 (130080, 30)
2 (130080, 60)
3 (130080, 90)
4 (130080, 120)
5 (130080, 150)
6 (130080, 180)
7 (130080, 210)
8 (130080, 240)
9 (130080, 270)
10 (130080, 300)
11 (130080, 330)
12 (130080, 360)
13 (130080, 390)
14 (130080, 420)
15 (130080, 450)
16 (130080, 480)
17 (130080, 510)
18 (130080, 540)
19 (130080, 570)
20 (130080, 600)
21 (130080, 630)
22 (130080, 660)
23 (130080, 690)
24 (130080, 720)
25 (130080, 750)
26 (130080, 780)
27 (130080, 810)


Unnamed: 0,Date,ProgressNote,MedicineName,Memo,체중,골격근량,체지방량,BMI,혈압(고),혈압(저),...,Osseus,ECW_TBW,ECF_TBF,VFA,WHR,WeightControl,FatControl,MuscleControl,BMR,FitnessScore
0,2022-11-25,"#비만\n\n신청 약 : 감비정 - D2X 405, 팻아웃 3, 부스터정 1\n차수...","['GambiTab-D2X_1통*3#405', '팻아웃 정1통*3 ', '부스터 정...","['2-1', '1', '1']",,,,,,,...,,,,,,,,,,
1,2022-11-24,,,,,,,,,,...,,,,,,,,,,
2,2022-11-23,,,,,,,,,,...,,,,,,,,,,
3,2022-12-05,"#비만\n\n신청 약 : 감비정M 405, 팻아웃플러스 1\n차수 : 2-2\n\n...","['GambiTab-M_1통*3#405', '팻아웃플러스1통*1']","['2-2', '1']",,,,,,,...,,,,,,,,,,
4,2023-06-28,# 비만\n\n현재체중 : 68\n마지막 약 복용일자 : 1년이상 \n포만감&식사조...,"['GambiTab-D2X_1통*2#270', '팻아웃 정1통*1 ', '비움정...","['1-1', '1', '1']",,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130075,2024-05-14,#비만(전화상담접수)\n\n★마지막처방일 5년경과 의료진 확인사항 \n현재체중 : ...,"['GambiTab-DLv8_1통*2#270', '팻아웃 정1통*1', '하루비움정...","['1-1', '1', '1']",,,,,,,...,,,,,,,,,,
130076,2024-05-14,#비만(비대면초진)\n\n여자\n\nㆍ 키/체중/BMI:156 / 72 / 경도비만...,"['GambiTab-DLv7_1통*3#405', 'GambiTab-DLv8_1통*3...","['2-1', '2-2', '1', '1']",,,,,,,...,,,,,,,,,,
130077,2024-05-14,#비만(내원초진)\n\n여자\n\nㆍ 키/체중/BMI:162 / 73 / 경도비만(...,,,78.0,25.5,31.1,29.7,133.0,85.0,...,,,,,,,,,,
130078,2024-05-14,#비만(내원초진)\n\n남자\n\nㆍ 키/체중/BMI:173 / 99.9 / 중도 ...,,,98.9,37.0,33.7,32.6,148.0,97.0,...,,,,,,,,,,


In [31]:
# Put the patient info columns in front of the per-visit columns (column-wise concat).
basis = pd.concat([info_df, basis], axis = 1)

In [32]:
# `basis` now names the DataFrame, so a fresh makeBasis instance is created for saving.
B =makeBasis()

In [33]:
# Write the combined table to <newPath>/basis_all_22.11.07~24.05.16.csv.
B.save_csv(basis, 'basis_all_22.11.07~24.05.16')

In [None]:
# Write the same table again under the name basis_for_filter.csv.
B.save_csv(basis, 'basis_for_filter')