In [262]:
import ast
import copy
import os
import shutil
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

class BasisFilter:
    """Filter the merged per-patient table down to first/return visit pairs.

    The table is expected to be "wide": a handful of patient-info columns
    followed by one block of columns per visit, each block starting with
    ``Date_<n>`` (``Date_1``, ``ProgressNote_1``, ..., ``Date_2``, ...).
    Visit dates are handled as ``'YYYY-MM-DD'`` strings throughout.

    Attributes set by the constructor:
        path: directory two levels above the current working directory.
        newPath: ``<path>/filtered_data``, where results are written.
        region_list: names of the clinic regions.
        index_level_2: display labels for the 30 per-visit fields.
    """

    # Visit blocks are numbered 1..27 in the merged table.
    MAX_VISITS = 27
    # make_medicine_dict probes visit numbers 1..31; numbers without a
    # matching Date_<n> column are skipped.
    MEDICINE_SCAN_VISITS = 31
    # Patient-level columns copied into every row of the first/return frame.
    INFO_COLUMNS = ['Region', 'PatientChartNo', 'PatientAddr11', 'PatientFirstDate',
                    'Age', 'PatientSex', 'Description']
    DATE_FORMAT = '%Y-%m-%d'

    def __init__(self):
        # Data lives two directory levels above the working directory.
        self.path = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
        self.newPath = os.path.join(self.path, 'filtered_data')
        self.region_list = ['bundang', 'gangnam', 'hongdae', 'jamsil', 'bucheon', 'busan', 'incheon']
        self.index_level_2 = [
            'Date', 'ProgressNote', 'MedicineName', 'Memo', '체중', '골격근량', '체지방량', 'BMI',
            '혈압(고)', '혈압(저)', '맥박수', '체지방률', 'Height', '근육량',
            'InterCellWater', 'ExtraCellWater', 'TotalBodyWater', 'ProteinMass', 'MineralMass',
            'FatFreeMass', 'Osseus', 'ECW_TBW', 'ECF_TBF', 'VFA', 'WHR',
            'WeightControl', 'FatControl', 'MuscleControl', 'BMR', 'FitnessScore',
        ]

    def make_dir(self):
        """Create the output folder, or report that it already exists."""
        if os.path.exists(self.newPath):
            print('이미 존재합니다.')
            return
        os.makedirs(self.newPath)

    def del_dir(self):
        """Delete the output folder and everything in it, reporting the outcome."""
        if not os.path.isdir(self.newPath):
            print(f"'{self.newPath}' 폴더가 존재하지 않습니다.")
            return
        try:
            shutil.rmtree(self.newPath)
        except OSError as e:
            print(f"폴더를 삭제하는 중 오류가 발생했습니다: {e}")
        else:
            print(f"'{self.newPath}' 폴더가 성공적으로 삭제되었습니다.")

    def load_basis(self):
        """Read ``MergeRegionData/basis_for_filter.csv`` into ``self.basis`` and return it.

        Date columns are deliberately left as strings; the other methods
        parse them with ``DATE_FORMAT`` when needed.
        """
        csv_path = os.path.join(self.path, 'MergeRegionData', 'basis_for_filter.csv')
        self.basis = pd.read_csv(csv_path, encoding='utf-8')
        return self.basis

    def first_visit_filter(self, df, min_first_date='2022-11-06'):
        """Return rows whose early progress notes record a first consultation.

        A row is kept when any of ``ProgressNote_1`` .. ``ProgressNote_4``
        matches the "[1] 기본 상담" pattern and ``PatientFirstDate`` is
        strictly later than ``min_first_date`` (string comparison, so both
        sides must be ``'YYYY-MM-DD'`` text).

        Args:
            df: merged patient table.
            min_first_date: exclusive lower bound for ``PatientFirstDate``.

        Returns:
            The filtered DataFrame (original index preserved).
        """
        pattern = r'\[1\]\s*기본\s*상담*'
        matched = np.zeros(len(df), dtype=bool)
        for i in range(1, 5):
            # Cast to str and use na=False so a missing note counts as
            # "no match" instead of propagating NaN into the mask.
            notes = df[f'ProgressNote_{i}'].astype(str)
            matched |= notes.str.contains(pattern, regex=True, na=False).to_numpy(dtype=bool)
        filtered_df = df[matched]
        return filtered_df[filtered_df['PatientFirstDate'] > min_first_date]

    def save_csv(self, df, name):
        """Write ``df`` (including its index) to ``<newPath>/<name>.csv``."""
        df.to_csv(os.path.join(self.newPath, f'{name}.csv'))

    def save_excel(self, df, name):
        """Write ``df`` (including its index) to ``<newPath>/<name>.xlsx``."""
        df.to_excel(os.path.join(self.newPath, f'{name}.xlsx'))

    def load_csv(self, name):
        """Read ``<newPath>/<name>.csv`` and return it as a DataFrame."""
        return pd.read_csv(os.path.join(self.newPath, f'{name}.csv'), encoding='utf-8')

    def _visit_numbers(self, df, prefix):
        """Return the visit numbers (1..MAX_VISITS) for which ``<prefix>_<n>`` exists in ``df``."""
        return [i for i in range(1, self.MAX_VISITS + 1) if f'{prefix}_{i}' in df.columns]

    def make_inbody2up_dict(self, df):
        """Collect, per row, the dates of visits that have a recorded weight.

        Only rows with at least two such visits are kept.

        Args:
            df: table with ``Weight_<n>`` / ``Date_<n>`` column pairs.

        Returns:
            ``{row index: [date, date, ...]}`` in visit order. Also stored on
            ``self.inbody2up_dict``; the keys are stored on ``self.inbody2up_idx``.
        """
        visits = self._visit_numbers(df, 'Weight')
        has_weight = df[[f'Weight_{i}' for i in visits]].notna().to_numpy()
        dates = df[[f'Date_{i}' for i in visits]].to_numpy(dtype=object)

        inbody2up_dict = {}
        for idx, measured, row_dates in zip(df.index.tolist(), has_weight, dates):
            inbody_dates = list(row_dates[measured])
            if len(inbody_dates) > 1:
                inbody2up_dict[idx] = inbody_dates
        self.inbody2up_idx = list(inbody2up_dict.keys())
        self.inbody2up_dict = inbody2up_dict
        return self.inbody2up_dict

    def make_return_dict(self, df, dict, day_range):
        """Pick the return-visit date for each row.

        For every row the first listed date is the baseline. A later date
        qualifies when the number of days since the baseline is strictly
        between ``day_range[0]`` and ``day_range[1]``; if several qualify the
        last one wins.

        Args:
            df: unused; kept for interface compatibility.
            dict: ``{row index: [date, ...]}`` as built by
                ``make_inbody2up_dict`` (the parameter name shadows the
                builtin and is kept only for backward compatibility).
            day_range: two-element sequence ``[low, high]`` (both exclusive).

        Returns:
            ``{row index: return date}``. Also stored on ``self.return_dict``;
            the keys are stored on ``self.return_idx``.
        """
        low, high = day_range[0], day_range[1]
        return_dict = {}
        for idx, dates in dict.items():
            # Rows without a usable baseline date cannot be evaluated.
            if not dates or pd.isna(dates[0]):
                continue
            baseline = datetime.strptime(dates[0], self.DATE_FORMAT)
            for date in dates[1:]:
                if pd.isna(date):
                    continue
                elapsed = (datetime.strptime(date, self.DATE_FORMAT) - baseline).days
                if low < elapsed < high:
                    return_dict[idx] = date
        self.return_dict = return_dict
        self.return_idx = list(return_dict.keys())
        return self.return_dict

    @staticmethod
    def _parse_list_cell(value):
        """Turn a CSV cell holding a Python list literal into a list.

        Missing cells yield ``[]``. ``ast.literal_eval`` is used instead of
        ``eval`` so that cell text can never execute code.
        """
        if isinstance(value, str):
            return list(ast.literal_eval(value))
        if isinstance(value, (list, tuple)):
            return list(value)
        return []

    def make_medicine_dict(self, df, inbody2up_dict, return_dict):
        """Record GambiTab prescriptions issued between the first and return visit.

        For each row in ``return_dict`` the visits are scanned in order. A
        medicine is recorded when its name contains both ``'Gambi'`` and
        ``'Tab'`` and the memo at the same position is ``'2-1'`` or ``'2-2'``.
        Rows with at least two recorded prescriptions are stored.

        Results:
            self.record_dict: ``{row index: [[visit datetime, medicine, memo], ...]}``
                (Gambi prescriptions only).
            self.allmed_dict: ``{row index: [full medicine list of that visit, ...]}``
                (everything prescribed alongside each recorded Gambi item).
        """
        self.record_dict = {}
        self.allmed_dict = {}
        one_day = timedelta(days=1)

        for idx, return_date in return_dict.items():
            return_day = datetime.strptime(return_date, self.DATE_FORMAT)
            first_day = datetime.strptime(inbody2up_dict[idx][0], self.DATE_FORMAT)
            record_list = []
            med_list = []

            for i in range(1, self.MEDICINE_SCAN_VISITS + 1):
                date_col = f'Date_{i}'
                if date_col not in df.columns:
                    continue
                visit_date = df.loc[idx, date_col]
                if pd.isna(visit_date):
                    continue
                day = datetime.strptime(visit_date, self.DATE_FORMAT)
                # Stop once the scan reaches the day before the return visit.
                if day >= return_day - one_day:
                    break
                # NOTE(review): a visit more than one day before the first
                # measured visit ends the scan (rather than being skipped);
                # kept as in the original - confirm this is intended.
                if day < first_day - one_day:
                    break

                med_col = f'MedicineName_{i}'
                medicines = self._parse_list_cell(df.loc[idx, med_col]) if med_col in df.columns else []
                memos = None  # parsed lazily, only when a Gambi tablet is found
                for j, medicine in enumerate(medicines):
                    if 'Gambi' not in medicine or 'Tab' not in medicine:
                        continue
                    if memos is None:
                        memo_col = f'Memo_{i}'
                        memos = self._parse_list_cell(df.loc[idx, memo_col]) if memo_col in df.columns else []
                    memo = memos[j] if j < len(memos) else None
                    if memo in ('2-1', '2-2'):
                        record_list.append([day, medicine, memo])
                        med_list.append(list(medicines))

            if len(record_list) > 1:
                self.record_dict[idx] = record_list
                self.allmed_dict[idx] = med_list

    @staticmethod
    def _visit_number_of(row, date_cols, target):
        """Return the number of the first ``Date_<n>`` column in ``row`` equal to ``target``."""
        for col in date_cols:
            if row[col] == target:
                return int(col.split('_')[-1])
        raise IndexError(f"date {target!r} not found in the Date_<n> columns of row {row.name!r}")

    @staticmethod
    def _visit_block(df, row, visit_no, tag):
        """Return one visit's columns from ``row`` as a one-row frame, re-suffixed with ``tag``."""
        start = df.columns.get_loc(f'Date_{visit_no}')
        next_col = f'Date_{visit_no + 1}'
        # The last visit block has no following Date column; it runs to the end.
        end = df.columns.get_loc(next_col) if next_col in df.columns else len(df.columns)
        block = row.iloc[start:end].to_frame().T
        block.columns = ['_'.join(col.split('_')[:-1] + [tag]) for col in block.columns]
        return block

    def make_FR_df(self, df, medicine_idx, return_dict, inbody2up_dict):
        """Build a frame with patient info plus first-visit and return-visit columns.

        For every index in ``medicine_idx`` the visit whose date equals the
        first measured date and the visit whose date equals the return date
        are located; their column blocks are renamed with ``_F`` / ``_R``
        suffixes and placed next to the patient-info columns.

        Args:
            df: merged patient table.
            medicine_idx: row indices to include.
            return_dict: ``{row index: return date}``.
            inbody2up_dict: ``{row index: [first date, ...]}``.

        Returns:
            The combined DataFrame, one row per index (empty if
            ``medicine_idx`` is empty). Also stored on ``self.FR_df``.
        """
        date_cols = [f'Date_{i}' for i in self._visit_numbers(df, 'Date')]
        frames = []
        for idx in medicine_idx:
            row = df.loc[idx]
            first_no = self._visit_number_of(row, date_cols, inbody2up_dict[idx][0])
            return_no = self._visit_number_of(row, date_cols, return_dict[idx])
            info = row[self.INFO_COLUMNS].to_frame().T
            first_block = self._visit_block(df, row, first_no, 'F')
            return_block = self._visit_block(df, row, return_no, 'R')
            frames.append(pd.concat([info, first_block, return_block], axis=1))
        # Concatenate once at the end instead of growing a frame inside the loop.
        self.FR_df = pd.concat(frames, axis=0) if frames else pd.DataFrame()
        return self.FR_df
            
        


In [263]:
# Create the filter helper; the constructor derives its paths from the current working directory.
BF = BasisFilter()

In [193]:
# Load MergeRegionData/basis_for_filter.csv; the same frame is also kept on BF.basis.
basis = BF.load_basis()

  self.basis = pd.read_csv(f'{self.path}\\MergeRegionData\\basis_for_filter.csv', encoding='utf-8')


In [194]:
# Display the loaded table.
BF.basis

Unnamed: 0,Region,PatientID,PatientChartNo,PatientAddr11,PatientFirstDate,Age,PatientSex,Description,Date_1,ProgressNote_1,MedicineName_1,Memo_1,Weight_1,SMM_1,BodyFatMass_1,BMI_1,MaxVital_1,MinVital_1,Pulse_1,PBF_1,SoftLeanMass_1,Height_1,InterCellWater_1,ExtraCellWater_1,TotalBodyWater_1,...,SMM_27,BodyFatMass_27,BMI_27,MaxVital_27,MinVital_27,Pulse_27,PBF_27,SoftLeanMass_27,Height_27,InterCellWater_27,ExtraCellWater_27,TotalBodyWater_27,ProteinMass_27,MineralMass_27,FatFreeMass_27,Osseus_27,ECW_TBW_27,ECF_TBF_27,VFA_27,WHR_27,WeightControl_27,FatControl_27,MuscleControl_27,BMR_27,FitnessScore_27
0,bundang,8,14634,"경기도 성남시 수정구 산성대로437번길 7 (단대동, 푸르지오)",2014-06-30,40.0,2,,2022-11-25,"#비만\n\n신청 약 : 감비정 - D2X 405, 팻아웃 3, 부스터정 1\n차수...","['GambiTab-D2X_1통*3#405', '팻아웃 정1통*3 ', '부스터 정...","['2-1', '1', '1']",,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,bundang,15,25008,전라북도 군산시 하나운2길 15(나운동),2016-01-22,21.0,2,추천인 길갑인(7744)님.,2022-11-24,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130078,incheon,21840,3962.0,"인천광역시 남동구 석산로222번길 53 (구월동, 행복꿈터)",2024-05-14,39.0,1,,2024-05-14,#비만(내원초진)\n\n남자\n\nㆍ 키/체중/BMI:173 / 99.9 / 중도 ...,,,98.9,37.0,33.7,32.6,148.0,97.0,84.0,34.1,61.7,174.3,29.9,18.0,47.9,...,,,,,,,,,,,,,,,,,,,,,,,,,
130079,incheon,21841,3963.0,"인천광역시 연수구 선학로 37 (선학동, 대진아파트)",2024-05-14,29.0,1,비대면 ★빨간스티커,2024-05-14,# 비만(비대면초진)\n\n· 키/체중/BMI : 177 / 88 / 28.09\n...,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [275]:
# Keep rows with a first-consultation progress note (visits 1-4) and a
# PatientFirstDate after 2022-11-06, then show the resulting shape.
first_df = BF.first_visit_filter(basis)
first_df.shape

(44829, 818)

In [196]:
# Map row index -> dates of visits with a recorded weight (only rows with at least two).
inbody2up_dict = BF.make_inbody2up_dict(first_df)

In [197]:
# Restrict the first-visit frame to the rows that appear in inbody2up_dict.
inbody2up_df = first_df[first_df.index.isin(list(inbody2up_dict.keys()))]

In [276]:
# Number of rows with two or more weighed visits.
len(inbody2up_dict)

9739

In [199]:
# Per row, pick a re-measurement date strictly between 70 and 98 days after the first weighed visit.
return_dict = BF.make_return_dict(inbody2up_df, inbody2up_dict, [70,98])

In [279]:
# Number of rows with a qualifying return visit.
len(return_dict)

2103

In [266]:
# Populate BF.record_dict / BF.allmed_dict with the Gambi tablet prescriptions
# found between the first and the return visit (the call itself returns None).
BF.make_medicine_dict(first_df, inbody2up_dict, return_dict)

In [267]:
# Show at most 5 rows when displaying a DataFrame
pd.options.display.max_rows = 5
# Show at most 50 columns when displaying a DataFrame
pd.options.display.max_columns = 50

In [268]:
# Row indices that ended up with at least two recorded Gambi prescriptions.
record_dict=BF.record_dict
med_idx = list(record_dict.keys())

In [269]:
# Select those rows from the first-visit frame; .iloc is given a boolean mask
# built from index labels, so the selection is positional over that mask.
filtered_df = first_df.iloc[first_df.index.isin(list(record_dict.keys()))]

In [270]:
# Build the first-visit (_F) / return-visit (_R) comparison frame; it is also stored on BF.FR_df.
BF.make_FR_df(first_df,med_idx,return_dict,inbody2up_dict)

Unnamed: 0,Region,PatientChartNo,PatientAddr11,PatientFirstDate,Age,PatientSex,Description,Date_F,ProgressNote_F,MedicineName_F,Memo_F,Weight_F,SMM_F,BodyFatMass_F,BMI_F,MaxVital_F,MinVital_F,Pulse_F,PBF_F,SoftLeanMass_F,Height_F,InterCellWater_F,ExtraCellWater_F,TotalBodyWater_F,ProteinMass_F,...,SMM_R,BodyFatMass_R,BMI_R,MaxVital_R,MinVital_R,Pulse_R,PBF_R,SoftLeanMass_R,Height_R,InterCellWater_R,ExtraCellWater_R,TotalBodyWater_R,ProteinMass_R,MineralMass_R,FatFreeMass_R,Osseus_R,ECW_TBW_R,ECF_TBF_R,VFA_R,WHR_R,WeightControl_R,FatControl_R,MuscleControl_R,BMR_R,FitnessScore_R
2720,bundang,118562,"경기도 성남시 분당구 판교로 393(삼평동, 봇들마을2단지이지더원아파트)",2023-03-25,60.0,2,,2023-03-25,#비만\n\n여자\n\nㆍ 키/체중/BMI:158 / 75 / 중도 비만(30.04...,"['GambiTab-D4_1통*3#405', '팻아웃 정1통*1 ']","['2-1', '1']",75.4,24.3,31.0,30.7,130.0,79.0,68.0,41.2,41.9,156.7,20.2,12.4,32.6,8.8,...,23.7,27.5,29.0,124.0,77.0,99.0,38.6,41.2,156.7,19.7,12.5,32.2,8.5,3.02,43.7,2.45,0.389,0.342,141.4,0.93,-14.4,-14.4,0.0,1314.0,70.0
8361,bundang,122116,"경기도 수원시 영통구 동탄원천로881번길 35 (매탄동, 주공그린빌)",2023-09-02,31.0,2,내원경로 이주희(강45909/ 핸7385),2023-09-02,#비만(내원초진)\n\n여자\n\nㆍ 키/체중/BMI:157 / 60 / 과체중(2...,"['GambiTab-D4_1통*3#405', 'GambiTab-D5_1통*3#405...","['2-1', '2-2', '1']",61.6,20.9,22.7,25.5,103.0,58.0,80.0,36.9,36.6,155.3,17.5,11.0,28.5,7.5,...,20.8,19.9,24.2,113.0,77.0,90.0,34.1,36.2,155.3,17.5,10.7,28.2,7.5,2.77,38.5,2.28,0.378,0.332,92.1,0.87,-7.7,-8.2,0.5,1202.0,71.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128976,incheon,2821.0,"충청남도 천안시 동남구 청당4로 60 (청당동, 청당동 한양수자인 블루시티)",2024-01-25,56.0,2,,2024-01-25,#비만(내원초진)\n\n여자\n\nㆍ 키/체중/BMI:150 / 64 / 경도비만(...,,,63.0,18.6,28.2,29.0,153.0,96.0,84.0,44.8,32.9,147.3,15.8,9.8,25.6,6.9,...,18.5,25.2,27.5,136.0,98.0,107.0,42.2,32.5,147.3,15.7,9.7,25.4,6.8,2.32,34.5,1.95,0.38,0.333,139.2,0.92,-14.1,-14.7,0.6,1116.0,65.0
129096,incheon,2947.0,"인천광역시 남동구 호구포로899번길 8 (간석동, 두진아파트)",2024-02-13,59.0,1,,2024-02-13,#비만(내원초진)\n\n남자\n\nㆍ 키/체중/BMI:170 / 78 / 경도비만(...,"['GambiTab-D5_1통*3#405', '팻아웃 정1통*2', '하루비움정1통...","['2-1', '1', '1']",79.1,29.5,26.0,27.7,156.0,103.0,89.0,32.9,50.3,168.9,24.1,15.1,39.2,10.4,...,29.4,20.6,25.7,142.0,90.0,87.0,28.1,49.8,168.9,24.0,14.8,38.8,10.4,3.53,52.7,2.87,0.38,0.333,90.4,0.93,-10.5,-11.2,0.7,1508.0,68.0


In [272]:
# Write the comparison frame to RF_df.xlsx in the filtered_data folder.
BF.save_excel(BF.FR_df,'RF_df')

In [226]:
# Split the first 8 columns from the remaining columns.
# presumably the first 8 are the patient-info columns and the rest the per-visit blocks — confirm
info_df = filtered_df.iloc[:,:8]
df = filtered_df.iloc[:,8:]

In [230]:
# Names of the remaining columns (those after the first 8).
cols = df.columns.tolist()

In [245]:
# Count of those columns.
len(cols)

810

In [250]:
# Split each column name at its last underscore into [field name, visit number]
split_columns = [col.rsplit('_', 1) for col in cols]
# Level 1: the visit number of every column; level 2: the 30 display labels repeated for 27 visits
index_level_1 = [int(x[1]) for x in split_columns]
index_level_2= ['Date', 'ProgressNote', 'MedicineName','Memo','체중','골격근량','체지방량','BMI','혈압(고)','혈압(저)','맥박수','체지방률','Height','근육량',
            'InterCellWater','ExtraCellWater','TotalBodyWater','ProteinMass','MineralMass','FatFreeMass','Osseus','ECW_TBW','ECF_TBF','VFA','WHR',
            'WeightControl','FatControl','MuscleControl','BMR','FitnessScore']*27

In [253]:
# Two-level column index: (visit number, field label).
multi_index = pd.MultiIndex.from_arrays([index_level_1, index_level_2], names=['Number', 'Description'])

In [254]:
# Replace the flat per-visit column names with the two-level index.
df.columns = multi_index

In [255]:
# Put the info columns under level-1 key 0 so they can sit next to the visit columns.
# NOTE(review): the recorded output for this cell is a NotImplementedError
# ("isna is not defined for MultiIndex"); presumably info_df.columns was already
# a MultiIndex from an earlier run of this cell — confirm before re-running.
info_df.columns = pd.MultiIndex.from_arrays([[0 for i in range(len(info_df.columns))], info_df.columns], names=['Number', 'Description'])

NotImplementedError: isna is not defined for MultiIndex

In [None]:
# Display the info columns.
info_df

In [258]:
# Join the info columns and the re-labelled visit columns side by side.
filtered_recol = pd.concat([info_df,df],axis=1)

In [273]:
# Export the re-labelled sample to Excel.
# NOTE(review): the recorded output is a NameError for filtered_recol, so the
# cell that assigns filtered_recol must be run in the same session first.
BF.save_excel(filtered_recol,'22.11.07~24.05.16_대면초진_3개월경과_재측정_표본')

NameError: name 'filtered_recol' is not defined

In [310]:
# Split the comparison frame into: first 7 columns, the middle columns, and the last 2 columns.
# NOTE(review): a later cell assigns 60 column labels to `df`; verify that the
# 7:-2 slice really leaves 60 columns for the FR_df in use.
FR_df=BF.FR_df
info_df = FR_df.iloc[:,:7]
df = FR_df.iloc[:,7:-2]
med_df = FR_df.iloc[:,-2:]

In [316]:
# Level 1: 30 'First visit' labels followed by 30 'After visit' labels;
# level 2: the 30 field display labels, repeated once per visit.
index_level_1 = ['First visit']*30+['After visit']*30
index_level_2= ['Date', 'ProgressNote', 'MedicineName','Memo','체중','골격근량','체지방량','BMI','혈압(고)','혈압(저)','맥박수','체지방률','Height','근육량',
            'InterCellWater','ExtraCellWater','TotalBodyWater','ProteinMass','MineralMass','FatFreeMass','Osseus','ECW_TBW','ECF_TBF','VFA','WHR',
            'WeightControl','FatControl','MuscleControl','BMR','FitnessScore']*2

In [317]:
# Two-level column index: (visit label, field label).
multi_index = pd.MultiIndex.from_arrays([index_level_1, index_level_2], names=['Number', 'Description'])

In [318]:
# Replace the _F/_R column names with the two-level index (lengths must match).
df.columns = multi_index

In [312]:
# Put the 7 info columns under level-1 key 0.
# NOTE(review): the recorded output for this cell is a TypeError
# ("object of type 'int' has no len()"); presumably the columns had already been
# converted on an earlier run — confirm before re-running.
info_df.columns = pd.MultiIndex.from_arrays([[0 for i in range(7)], info_df.columns.tolist()], names=['Number', 'Description'])

TypeError: object of type 'int' has no len()

In [322]:
# Re-assemble the comparison frame from the info and visit columns (med_df is not included).
FR_df = pd.concat([info_df,df],axis=1)

In [324]:
# Write the re-labelled comparison frame to RF_df.xlsx (same file name as the earlier export).
BF.save_excel(FR_df,'RF_df')