## Description
 - 환자(person)가 방문(visit_occurrence)을 하고, 해당 방문에서 어떤 진단(condition_occurrence)과 처방(drug_exposure)을 받았는지, 그리고 사망을 했다면 언제 사망(death)했는지에 대한 정보
 - 각 테이블간을 연결하기 위해 중요하게 사용되어야할 공통 Key는 person_id(환자 id)와 visit_occurrence_id(방문 id)입니다.
 - CDM은 concept라는 개념을 사용하고 있습니다. 대부분의 카테고리(범주)형 자료들은 concept_id로 표현이 되어있습니다. 이에 대한 정보(이름 등)를 concept 테이블이나 그와 매칭되는 source_value 컬럼을 확인해야합니다.

## Load_Local_DataFrame
 - pickle 파일명대로 DataFrame 변수를 만들어 할당합니다.

In [2]:
import numpy as np
import pandas as pd
import os, pickle

In [3]:
def search_path_walk(path):
    return [ os.path.join(dirs, file) for dirs, _,files in os.walk(path) for file in files]

In [4]:
data_path_list = search_path_walk('./data/')
data_path_list

['./data/clinical_note.p',
 './data/concept.p',
 './data/condition_occurrence.p',
 './data/death.p',
 './data/drug_exposure.p',
 './data/drug_pair.p',
 './data/person.p',
 './data/visit_occurrence.p']

In [5]:
filename_list = [data_path.split('/')[-1].split('.')[0] for data_path in data_path_list]
print(f"* pickle filename list : \n{filename_list}")

* pickle filename list : 
['clinical_note', 'concept', 'condition_occurrence', 'death', 'drug_exposure', 'drug_pair', 'person', 'visit_occurrence']


In [6]:
clinical_note = pickle.load(open(data_path_list[0], 'rb'))
concept = pickle.load(open(data_path_list[1], 'rb'))
condition_occurrence = pickle.load(open(data_path_list[2], 'rb'))
death = pickle.load(open(data_path_list[3], 'rb'))
drug_exposure = pickle.load(open(data_path_list[4], 'rb'))
drug_pair = pickle.load(open(data_path_list[5], 'rb'))
person = pickle.load(open(data_path_list[6], 'rb'))
visit_occurrence = pickle.load(open(data_path_list[7], 'rb'))

## View Each Tables
 - for문 내부에서 등 DataFrame을 예쁘게 출력하기 위해 `show(df)`라는 함수를 정의해서 사용합니다.
     - print(df), 셀 마지막에 df.head() 작성하는 것 등을 대신합니다.

In [7]:
from IPython.core.display import HTML

def show(df):
    display(HTML(df.to_html()))

### person

In [8]:
print(person.shape)
person.head()

(1000, 18)


Unnamed: 0,person_id,gender_concept_id,year_of_birth,month_of_birth,day_of_birth,birth_datetime,race_concept_id,ethnicity_concept_id,location_id,provider_id,care_site_id,person_source_value,gender_source_value,gender_source_concept_id,race_source_value,race_source_concept_id,ethnicity_source_value,ethnicity_source_concept_id
0,402435,8532,1997,4,18,1997-04-18,8527,0,,,,a434e3bf-7720-4612-8d18-e274e199f4fd,F,0,white,0,hispanic,0
1,1022983,8507,1950,2,26,1950-02-26,8527,0,,,,e0b46681-1ccf-488e-9766-bbdb1fe53af2,M,0,white,0,hispanic,0
2,1657794,8532,1937,12,10,1937-12-10,8527,0,,,,f6d7bf9f-0881-4142-9dfa-67fa213dd322,F,0,white,0,hispanic,0
3,2845932,8507,1987,5,26,1987-05-26,8515,0,,,,1387a5c0-159c-4dd2-9ef8-4e7f47a6d288,M,0,asian,0,hispanic,0
4,2693038,8532,1959,2,7,1959-02-07,8527,0,,,,8dea9575-d5cc-4a50-ab5e-a8c0de735fcf,F,0,white,0,nonhispanic,0


In [9]:
person.columns

Index(['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth',
       'day_of_birth', 'birth_datetime', 'race_concept_id',
       'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id',
       'person_source_value', 'gender_source_value',
       'gender_source_concept_id', 'race_source_value',
       'race_source_concept_id', 'ethnicity_source_value',
       'ethnicity_source_concept_id'],
      dtype='object')

In [10]:
person['person_id'].nunique()

1000

### concept

In [11]:
print(concept.shape)
concept.head()

(7403692, 10)


Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,37068761,Gamma glutamyl transferase/Aspartate aminotran...,Observation,LOINC,LOINC Component,,LP307433-5,1970-01-01,2099-12-31,
1,37068762,Satratoxin IgM | Serum | Microbiology,Measurement,LOINC,LOINC Hierarchy,C,LP379709-1,1970-01-01,2099-12-31,
2,37068763,Signet ring cells,Observation,LOINC,LOINC Component,,LP247841-2,1970-01-01,2099-12-31,
3,37068764,Tryptase.mature | Serum or Plasma | Chemistry ...,Measurement,LOINC,LOINC Hierarchy,C,LP383273-2,1970-01-01,2099-12-31,
4,37068765,Pathology biopsy report | Pancreas | Pathology,Observation,LOINC,LOINC Hierarchy,C,LP401954-5,1970-01-01,2099-12-31,


In [12]:
concept.columns

Index(['concept_id', 'concept_name', 'domain_id', 'vocabulary_id',
       'concept_class_id', 'standard_concept', 'concept_code',
       'valid_start_date', 'valid_end_date', 'invalid_reason'],
      dtype='object')

### others

In [13]:
for df in [condition_occurrence, death, drug_exposure, drug_pair, visit_occurrence]:
    print(df.shape)
    print(df.columns)
    show(df.head())


(12167, 16)
Index(['condition_occurrence_id', 'person_id', 'condition_concept_id',
       'condition_start_date', 'condition_start_datetime',
       'condition_end_date', 'condition_end_datetime',
       'condition_type_concept_id', 'condition_status_concept_id',
       'stop_reason', 'provider_id', 'visit_occurrence_id', 'visit_detail_id',
       'condition_source_value', 'condition_source_concept_id',
       'condition_status_source_value'],
      dtype='object')


Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_value,condition_source_concept_id,condition_status_source_value
0,1466183,116496,0,2002-11-29,2002-11-29,,NaT,32020,0,,,36112954,0,162864005,4060985,
1,1466184,116496,0,2020-03-04,2020-03-04,2020-03-04,2020-03-04,32020,0,,,36112952,0,840544004,37311060,
2,1466185,116496,81151,2015-03-24,2015-03-24,2015-04-14,2015-04-14,32020,0,,,7021052,0,44465007,81151,
3,1466186,116496,260139,2012-10-23,2012-10-23,2012-10-30,2012-10-30,32020,0,,,36112948,0,10509002,260139,
4,1466187,116496,312437,2020-03-04,2020-03-04,2020-04-04,2020-04-04,32020,0,,,36112952,0,267036007,312437,


(152, 7)
Index(['person_id', 'death_date', 'death_datetime', 'death_type_concept_id',
       'cause_concept_id', 'cause_source_value', 'cause_source_concept_id'],
      dtype='object')


Unnamed: 0,person_id,death_date,death_datetime,death_type_concept_id,cause_concept_id,cause_source_value,cause_source_concept_id
0,1691806,2015-06-02,,32815,0,233604007,0
1,99181,2018-11-04,,32815,0,87433001,0
2,2738610,1998-03-28,,32815,0,262574004,0
3,31196,2020-03-16,,32815,0,840539006,0
4,994339,2006-03-08,,32815,0,22298006,0


(46579, 23)
Index(['drug_exposure_id', 'person_id', 'drug_concept_id',
       'drug_exposure_start_date', 'drug_exposure_start_datetime',
       'drug_exposure_end_date', 'drug_exposure_end_datetime',
       'verbatim_end_date', 'drug_type_concept_id', 'stop_reason', 'refills',
       'quantity', 'days_supply', 'sig', 'route_concept_id', 'lot_number',
       'provider_id', 'visit_occurrence_id', 'visit_detail_id',
       'drug_source_value', 'drug_source_concept_id', 'route_source_value',
       'dose_unit_source_value'],
      dtype='object')


Unnamed: 0,drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,refills,quantity,days_supply,sig,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value
0,40900862,26922,19073183,2017-05-04,2017-05-04 01:41:54,2017-05-18,2017-05-18 01:41:54,2017-05-18,38000177,,0,0.0,14,,0,0,0,99499216,0,308182,19073183,,
1,40757313,2955,40231925,2016-07-24,2016-07-24 13:28:53,2016-09-23,2016-09-23 13:28:53,2016-09-23,38000177,,0,0.0,61,,0,0,0,9251642,0,1049221,40231925,,
2,52808614,2955,40229134,2015-04-02,2015-04-02 13:28:53,2015-04-16,2015-04-16 13:28:53,2015-04-16,38000177,,0,0.0,14,,0,0,0,57618650,0,1043400,40229134,,
3,52808615,2955,1115171,2016-07-24,2016-07-24 13:28:53,2016-08-23,2016-08-23 13:28:53,2016-08-23,38000177,,0,0.0,30,,0,0,0,9251642,0,849574,1115171,,
4,111107864,2955,40213154,2011-04-04,2011-04-04 13:28:53,2011-04-04,2011-04-04 13:28:53,2011-04-04,581452,,0,0.0,0,,0,0,0,57618654,0,140,40213154,,


(15, 2)
Index(['drug_concept_id1', 'drug_concept_id2'], dtype='object')


Unnamed: 0,drug_concept_id1,drug_concept_id2
0,40213154,19078106
1,19078106,40213154
2,19009384,19030765
3,40224172,40213154
4,19127663,19009384


(41810, 17)
Index(['visit_occurrence_id', 'person_id', 'visit_concept_id',
       'visit_start_date', 'visit_start_datetime', 'visit_end_date',
       'visit_end_datetime', 'visit_type_concept_id', 'provider_id',
       'care_site_id', 'visit_source_value', 'visit_source_concept_id',
       'admitted_from_concept_id', 'admitted_from_source_value',
       'discharge_to_source_value', 'discharge_to_concept_id',
       'preceding_visit_occurrence_id'],
      dtype='object')


Unnamed: 0,visit_occurrence_id,person_id,visit_concept_id,visit_start_date,visit_start_datetime,visit_end_date,visit_end_datetime,visit_type_concept_id,provider_id,care_site_id,visit_source_value,visit_source_concept_id,admitted_from_concept_id,admitted_from_source_value,discharge_to_source_value,discharge_to_concept_id,preceding_visit_occurrence_id
0,36112943,116496,9202,1962-04-13,1962-04-13 01:53:01,1962-04-13,1962-04-13 02:08:01,44818517,,,ac25f574-971c-4685-8034-359a00b09857,0,0,,0,,
1,36112944,116496,9202,1962-04-24,1962-04-24 01:53:01,1962-04-24,1962-04-24 02:08:01,44818517,,,e7e02d9d-d16a-44cc-b3ce-42a5adb60095,0,0,,0,,36112943.0
2,36112954,116496,9202,2002-11-29,2002-11-29 01:53:01,2002-11-29,2002-11-29 02:23:01,44818517,,,2e49b738-ed61-4b93-bc31-4b305985709e,0,0,,0,,36112944.0
3,36112945,116496,9202,2008-11-06,2008-11-06 01:53:01,2008-11-06,2008-11-06 02:08:01,44818517,,,2eccfac7-4443-4daa-901a-b4be10692426,0,0,,0,,36112954.0
4,36112946,116496,9202,2010-11-19,2010-11-19 01:53:01,2010-11-19,2010-11-19 02:43:01,44818517,,,71a06bd9-3d06-4635-88f4-4fe29305c83f,0,0,,0,,36112945.0


## Question

### visit_occurrence
 - visit_occurrence 테이블은 병원에 방문한 환자들의 방문식별번호(id), 병원 방문 시작일자, 종료일자, 방문 타입(내원, 외래 등) 등 병원 방문과 관련된 정보를 포함하고 있습니다. 
 - 내원일수는 환자가 요양기관을 방문하여 진료를 받은 일수이며, `내원일수 = 방문종료일자 - 방문시작일자 + 1` 으로 계산합니다. 
 - **모든 환자에 대해 총 내원일수**를 구하고 총 내원일수의 최대값과 총 내원일수 최대값을 가지는 환자수를 찾는 쿼리를 작성합니다.
     - 방문시작일자는 visit_start_date, 방문종료일자는 visit_end_date 를 사용합니다.

In [14]:
visit_occurrence.head()

Unnamed: 0,visit_occurrence_id,person_id,visit_concept_id,visit_start_date,visit_start_datetime,visit_end_date,visit_end_datetime,visit_type_concept_id,provider_id,care_site_id,visit_source_value,visit_source_concept_id,admitted_from_concept_id,admitted_from_source_value,discharge_to_source_value,discharge_to_concept_id,preceding_visit_occurrence_id
0,36112943,116496,9202,1962-04-13,1962-04-13 01:53:01,1962-04-13,1962-04-13 02:08:01,44818517,,,ac25f574-971c-4685-8034-359a00b09857,0,0,,0,,
1,36112944,116496,9202,1962-04-24,1962-04-24 01:53:01,1962-04-24,1962-04-24 02:08:01,44818517,,,e7e02d9d-d16a-44cc-b3ce-42a5adb60095,0,0,,0,,36112943.0
2,36112954,116496,9202,2002-11-29,2002-11-29 01:53:01,2002-11-29,2002-11-29 02:23:01,44818517,,,2e49b738-ed61-4b93-bc31-4b305985709e,0,0,,0,,36112944.0
3,36112945,116496,9202,2008-11-06,2008-11-06 01:53:01,2008-11-06,2008-11-06 02:08:01,44818517,,,2eccfac7-4443-4daa-901a-b4be10692426,0,0,,0,,36112954.0
4,36112946,116496,9202,2010-11-19,2010-11-19 01:53:01,2010-11-19,2010-11-19 02:43:01,44818517,,,71a06bd9-3d06-4635-88f4-4fe29305c83f,0,0,,0,,36112945.0


In [15]:
# 내원 정보가 있는 환자 수를 확인. person 테이블과 마찬가지의 1000명
visit_occurrence['person_id'].nunique()

1000

In [16]:
# date칼럼이 datetime형식이 datetime타입이 아니므로 날짜 계산을 위해 변환 필요
visit_occurrence['visit_start_date'].dtypes

dtype('O')

In [17]:
# 내원시작/종료일자의 dtypes변환
visit_occurrence[['visit_start_date', 'visit_end_date']] = visit_occurrence[['visit_start_date', 'visit_end_date']].apply( pd.to_datetime )

In [18]:
# 데이터마다 내원종료일 - 내원시작일 + 1을 계산하여
# 총 내원일수를 계산해놓는다.
visit_occurrence['visit_total_days'] = \
visit_occurrence['visit_end_date'] - visit_occurrence['visit_start_date'] \
+ pd.to_timedelta(1, unit='d')

In [19]:
visit_occurrence[['visit_start_date', 'visit_end_date', 'visit_total_days']].head()

Unnamed: 0,visit_start_date,visit_end_date,visit_total_days
0,1962-04-13,1962-04-13,1 days
1,1962-04-24,1962-04-24,1 days
2,2002-11-29,2002-11-29,1 days
3,2008-11-06,2008-11-06,1 days
4,2010-11-19,2010-11-19,1 days


In [20]:
# 총 내원일수는 1일부터 몇백일까지 다양하다.
visit_occurrence['visit_total_days'].value_counts().head()

1 days     39715
2 days      1765
3 days        42
11 days       31
10 days       23
Name: visit_total_days, dtype: int64

#### 환자별 총 내원일자


In [22]:
# 환자별 총 내원일수 합을 구하기 위해
# 환자별로 groupby한 데이터의 샘플을 뽑아서 확인한다.
grouped = dict(list(visit_occurrence.groupby(['person_id'])))
# grouped.keys()

In [None]:
# 환자 1명에 대해 내원시작일순으로 정렬한다.
one_person = grouped[2955].sort_values(by=['visit_start_date'])
one_person.head()

In [None]:
# 한명의 총 내원일자를 확인해본다. 
one_person['visit_total_days'].sum()

In [None]:
# 이제 그룹별로 환자의 총 내원일수를 계산하고,
person_to_visit_total_count = \
visit_occurrence.groupby(['person_id']).apply(lambda agg_df:agg_df['visit_total_days'].sum())

In [None]:
# 총 내원일수 최대인, 최소인 환자의 index(person_id)를 확인한다.
max_visit_ids = person_to_visit_total_count[person_to_visit_total_count == person_to_visit_total_count.max()].index.values
min_visit_ids = person_to_visit_total_count[person_to_visit_total_count == person_to_visit_total_count.min()].index.values
max_visit_ids, min_visit_ids

In [None]:
# 환자별로 총 내원일자 표를 확인한다.(상위 10명만)
print("환자별 총 내원일자")
show(person_to_visit_total_count.to_frame('총 내원일자')[:10])

In [86]:
str(person_to_visit_total_count.max())

'18873 days 00:00:00'

In [88]:
# 최대값 최소값을 가지는 환자의 person_id와 그 값을 출력한다.
print("총 내원일수 최대값과 최소값을 가지는 환자 수")
print(f"""
최대값 : {person_to_visit_total_count.max()}, 환자 id:{max_visit_ids}, 환자 수:{len(max_visit_ids)}
최소값 : {person_to_visit_total_count.min()}, 환자 id:{min_visit_ids}, 환자 수:{len(min_visit_ids)}
""")


총 내원일수의 최대값과 최소값을 가지는 환자 수

최대값 : 18873 days 00:00:00, 환자 id:[1059760], 환자 수:1
최소값 : 2 days 00:00:00, 환자 id:[ 215966  709963 1638422 1737987], 환자 수:4



### condition_occurrence

 - 환자들이 진단 받은 상병 내역 중 첫글자는 (a,b,c,d,e) 문자로 시작하고 “heart” 단어가 포함된 상병 이름을 찾으려고 합니다. 
 - condition_occurrence 테이블은 환자가 병원 방문시 진단 받은 질환이 담겨있습니다. 상병코드는 condition_concept_id이고, concept 테이블의 concept_id와 조인하여 상병 이름을 찾을 수 있습니다. (concept_name 컬럼 사용)
    - 문자 검색시 대소문자를 구분하지 않습니다.
    - 상병 이름을 중복없이 나열합니다.

In [16]:
condition_occurrence.head()

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_value,condition_source_concept_id,condition_status_source_value
0,1466183,116496,0,2002-11-29,2002-11-29,,NaT,32020,0,,,36112954,0,162864005,4060985,
1,1466184,116496,0,2020-03-04,2020-03-04,2020-03-04,2020-03-04,32020,0,,,36112952,0,840544004,37311060,
2,1466185,116496,81151,2015-03-24,2015-03-24,2015-04-14,2015-04-14,32020,0,,,7021052,0,44465007,81151,
3,1466186,116496,260139,2012-10-23,2012-10-23,2012-10-30,2012-10-30,32020,0,,,36112948,0,10509002,260139,
4,1466187,116496,312437,2020-03-04,2020-03-04,2020-04-04,2020-04-04,32020,0,,,36112952,0,267036007,312437,


In [17]:
concept.head()

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,37068761,Gamma glutamyl transferase/Aspartate aminotran...,Observation,LOINC,LOINC Component,,LP307433-5,1970-01-01,2099-12-31,
1,37068762,Satratoxin IgM | Serum | Microbiology,Measurement,LOINC,LOINC Hierarchy,C,LP379709-1,1970-01-01,2099-12-31,
2,37068763,Signet ring cells,Observation,LOINC,LOINC Component,,LP247841-2,1970-01-01,2099-12-31,
3,37068764,Tryptase.mature | Serum or Plasma | Chemistry ...,Measurement,LOINC,LOINC Hierarchy,C,LP383273-2,1970-01-01,2099-12-31,
4,37068765,Pathology biopsy report | Pancreas | Pathology,Observation,LOINC,LOINC Hierarchy,C,LP401954-5,1970-01-01,2099-12-31,


In [18]:
# 진단 테이블에 concept테이블을 join하여 진단명을 얻는다.
# unique한 진단수는 151개로 확인된다.
condition_with_concept = \
pd.merge(condition_occurrence, concept,
         left_on="condition_concept_id", right_on='concept_id', how='left')


condition_with_concept['concept_name'].nunique()

151

In [19]:
condition_with_concept.head()

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,...,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,1466183,116496,0,2002-11-29,2002-11-29,,NaT,32020,0,,...,0,No matching concept,Metadata,,Undefined,,No matching concept,1970-01-01,2099-12-31,
1,1466184,116496,0,2020-03-04,2020-03-04,2020-03-04,2020-03-04,32020,0,,...,0,No matching concept,Metadata,,Undefined,,No matching concept,1970-01-01,2099-12-31,
2,1466185,116496,81151,2015-03-24,2015-03-24,2015-04-14,2015-04-14,32020,0,,...,81151,Sprain of ankle,Condition,SNOMED,Clinical Finding,S,44465007,1970-01-01,2099-12-31,
3,1466186,116496,260139,2012-10-23,2012-10-23,2012-10-30,2012-10-30,32020,0,,...,260139,Acute bronchitis,Condition,SNOMED,Clinical Finding,S,10509002,1970-01-01,2099-12-31,
4,1466187,116496,312437,2020-03-04,2020-03-04,2020-04-04,2020-04-04,32020,0,,...,312437,Dyspnea,Condition,SNOMED,Clinical Finding,S,267036007,1970-01-01,2099-12-31,


In [20]:
# 검색을 위해 진단명을 소문자로 매핑한 칼럼을 생성한다.
condition_with_concept['concept_name(lower)'] = condition_with_concept['concept_name'].map(str.lower)

In [21]:
# 진단명 중 abcde 시작하며, heart라는 단어를 포함하는 데이터를 필터링 하기 위해 mask를 만든다.
# 첫글자는 abcde로 시작하고 & heart를 포함시킨 데이터는 48개로 확인도니다.
condition_mask = \
(condition_with_concept['concept_name(lower)'].str[0].isin(list('abcde')))\
& \
(condition_with_concept['concept_name(lower)'].str.contains('heart'))

condition_mask.sum()

48

In [22]:
# 필터링 후 중복제거해서 확인한다.
wanted_condition = condition_with_concept['concept_name(lower)'].loc[condition_mask].unique()
print(f"abcde로 시작하고 heart를 포함하는 진단명(들) : {list(wanted_condition)}")

abcde로 시작하고 heart를 포함하는 진단명(들) : ['chronic congestive heart failure']


### drug_exposure

 - drug_exposure 테이블은 환자가 병원에서 처방받은 약의 종류와 처방시작일과 종료일에 대한 정보를 포함하고 있습니다. 
 - 환자번호 ‘`1891866`’ 환자의 약 처방 데이터에서 `처방된 약의 종류별`로 `처음 시작일, 마지막 종료일, 복용일(마지막종료일과 처음시작일의 차이)`을 구하고 `복용일이 긴 순으로 정렬`하여 테이블을생성합니다.
    - 환자번호 : person_id, 
    - 약의 종류 : drug_concept_id, 
    - 처방시작일 :drug_exposure_start_date, 
    - 처방종료일 : drug_exposure_end_date

In [49]:
drug_exposure.head()

Unnamed: 0,drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,...,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value,drug_exposure_total_days
0,40900862,26922,19073183,2017-05-04,2017-05-04 01:41:54,2017-05-18,2017-05-18 01:41:54,2017-05-18,38000177,,...,0,0,0,99499216,0,308182,19073183,,,15 days
1,40757313,2955,40231925,2016-07-24,2016-07-24 13:28:53,2016-09-23,2016-09-23 13:28:53,2016-09-23,38000177,,...,0,0,0,9251642,0,1049221,40231925,,,62 days
2,52808614,2955,40229134,2015-04-02,2015-04-02 13:28:53,2015-04-16,2015-04-16 13:28:53,2015-04-16,38000177,,...,0,0,0,57618650,0,1043400,40229134,,,15 days
3,52808615,2955,1115171,2016-07-24,2016-07-24 13:28:53,2016-08-23,2016-08-23 13:28:53,2016-08-23,38000177,,...,0,0,0,9251642,0,849574,1115171,,,31 days
4,111107864,2955,40213154,2011-04-04,2011-04-04 13:28:53,2011-04-04,2011-04-04 13:28:53,2011-04-04,581452,,...,0,0,0,57618654,0,140,40213154,,,1 days


In [50]:
# 날짜칼럼들의 dtypes가 object로 되어있어 계산용이를 위해 datetime형태로 변환해야한다.
drug_exposure['drug_exposure_start_date'].dtypes

dtype('<M8[ns]')

In [51]:
drug_exposure[['drug_exposure_start_date','drug_exposure_end_date']] = \
drug_exposure[['drug_exposure_start_date','drug_exposure_end_date']].apply( pd.to_datetime)

In [52]:
# 각 데이터마다 복용종료일 - 복용시작일 + 1일로 총 복용기간을 계산한다.
drug_exposure['drug_exposure_total_days'] = \
drug_exposure['drug_exposure_end_date'] - drug_exposure['drug_exposure_start_date'] \
+ pd.to_timedelta(1, unit='d')

In [54]:
# 문제에서 제시한 특정 환자 데이터만 추출한다.
patient_1891866 = drug_exposure.loc[drug_exposure['person_id'] ==1891866 ]

print(patient_1891866.shape)
patient_1891866.head()

(1469, 24)


Unnamed: 0,drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,...,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value,drug_exposure_total_days
29761,62232837,1891866,19009384,1959-12-01,1959-12-01 18:51:00,1959-12-01,1959-12-01 18:51:00,1959-12-01,38000177,,...,0,0,0,27063979,0,106892,19009384,,,1 days
29762,62232856,1891866,19009384,1965-02-02,1965-02-02 18:51:00,1965-04-20,1965-04-20 18:51:00,1965-04-20,38000177,,...,0,0,0,27063992,0,106892,19009384,,,78 days
29763,62232857,1891866,19009384,1965-04-20,1965-04-20 18:51:00,1965-04-27,1965-04-27 18:51:00,1965-04-27,38000177,,...,0,0,0,27063993,0,106892,19009384,,,8 days
29764,62232858,1891866,19009384,1965-04-27,1965-04-27 18:51:00,1965-11-16,1965-11-16 18:51:00,1965-11-16,38000177,,...,0,0,0,27064141,0,106892,19009384,,,204 days
29765,62232859,1891866,19009384,1965-11-16,1965-11-16 18:51:00,1966-02-15,1966-02-15 18:51:00,1966-02-15,38000177,,...,0,0,0,27063994,0,106892,19009384,,,92 days


In [55]:
# 해당환자의 약 종류(drug_cept_id)별 데이터를 처리하기 위해
# 해당환자의 특정약에 대한 데이터를 뽑아서 확인해본다.
grouped = dict(list(patient_1891866.groupby(['drug_concept_id'])))
grouped.keys()

dict_keys([1539463, 19009384, 19030765, 40213154, 40213227])

In [56]:
sample_drug = grouped[1539463]
sample_drug.head()

Unnamed: 0,drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,...,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value,drug_exposure_total_days
29872,62233149,1891866,1539463,1990-03-13,1990-03-13 18:51:00,1998-03-11,1998-03-11 18:51:00,1998-03-11,38000177,,...,0,0,0,27062929,0,314231,1539463,,,2921 days
30039,62233422,1891866,1539463,1992-03-12,1992-03-12 18:51:00,1993-03-12,1993-03-12 18:51:00,1993-03-12,38000177,,...,0,0,0,27063174,0,314231,1539463,,,366 days
30251,62233289,1891866,1539463,1991-03-13,1991-03-13 18:51:00,1992-03-12,1992-03-12 18:51:00,1992-03-12,38000177,,...,0,0,0,27063052,0,314231,1539463,,,366 days
30335,62233556,1891866,1539463,1993-03-12,1993-03-12 18:51:00,1994-03-12,1994-03-12 18:51:00,1994-03-12,38000177,,...,0,0,0,27063297,0,314231,1539463,,,366 days
30408,62233690,1891866,1539463,1994-03-12,1994-03-12 18:51:00,1995-03-12,1995-03-12 18:51:00,1995-03-12,38000177,,...,0,0,0,27063419,0,314231,1539463,,,366 days


In [57]:
# 처방약별 복용시작일 순으로 정렬한다.
sample_drug = sample_drug.sort_values(by=['drug_exposure_start_date'])
sample_drug.head()

Unnamed: 0,drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,...,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value,drug_exposure_total_days
29872,62233149,1891866,1539463,1990-03-13,1990-03-13 18:51:00,1998-03-11,1998-03-11 18:51:00,1998-03-11,38000177,,...,0,0,0,27062929,0,314231,1539463,,,2921 days
30251,62233289,1891866,1539463,1991-03-13,1991-03-13 18:51:00,1992-03-12,1992-03-12 18:51:00,1992-03-12,38000177,,...,0,0,0,27063052,0,314231,1539463,,,366 days
30039,62233422,1891866,1539463,1992-03-12,1992-03-12 18:51:00,1993-03-12,1993-03-12 18:51:00,1993-03-12,38000177,,...,0,0,0,27063174,0,314231,1539463,,,366 days
30335,62233556,1891866,1539463,1993-03-12,1993-03-12 18:51:00,1994-03-12,1994-03-12 18:51:00,1994-03-12,38000177,,...,0,0,0,27063297,0,314231,1539463,,,366 days
30408,62233690,1891866,1539463,1994-03-12,1994-03-12 18:51:00,1995-03-12,1995-03-12 18:51:00,1995-03-12,38000177,,...,0,0,0,27063419,0,314231,1539463,,,366 days


In [58]:
# 최초 복용 시작일
sample_drug['drug_exposure_start_date'].min()

Timestamp('1990-03-13 00:00:00')

In [59]:
# 마지막 복용일
sample_drug['drug_exposure_end_date'].max()

Timestamp('1998-03-11 00:00:00')

In [60]:
# 약별 총 복용일
sample_drug['drug_exposure_total_days'].sum()

Timedelta('5484 days 00:00:00')

In [61]:
# 각 처방약별 집계시
# 위에서 확인한 3개의 데이터가 반환되도록 함수를 작성한다.
def get_start_end_total_exposure(agg_df):
    temp_df = agg_df.sort_values(by=['drug_exposure_start_date'])
    start_ = temp_df['drug_exposure_start_date'].min()
    end_ = temp_df['drug_exposure_end_date'].max()
    total_ = temp_df['drug_exposure_total_days'].sum()
    return pd.DataFrame({
        '처음_시작일':start_,
        '마지막_종료일':end_,
        '총_복용일':total_
    }, index=temp_df.index).iloc[0]

In [62]:
# 특정환자의 처방약별로 3개의 데이터를 추출한 뒤,
# 총 복용일이 긴 순으로 내림차순 정렬한다.
patient_1891866 = patient_1891866.groupby(['drug_concept_id']).apply(lambda agg_df : get_start_end_total_exposure(agg_df))
# patient_1891866 = patient_1891866.sort_values(by=['총_복용일'], ascending=False)
show(patient_1891866)

Unnamed: 0_level_0,처음_시작일,마지막_종료일,총_복용일
drug_concept_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1539463,1990-03-13,1998-03-11,5484 days
19009384,1959-12-01,1998-10-06,14424 days
19030765,1988-10-18,1998-10-05,1214 days
40213154,1989-09-12,1998-07-07,10 days
40213227,1993-01-05,1993-01-05,1 days


### drug_exposure 2

 - `drug_exposure` 테이블은 환자가 병원에서 처방받은 약의 종류와 처방시작일과 종료일에 대한 정보를 포함하고 있습니다. 
     - drug_exposure 테이블로부터 선택된 15가지의 약 번호와 약품명이 저장된 첫번째 `drugs` 테이블이 있으며, 
     - 15가지 약 별로 drug_exposure에 저장된 처방건수가 저장된 두번째 `prescription_count` 테이블이 있습니다. 
     - 마지막으로 drugs 테이블에 해당되는 15가지 약별로 가장 많이 처방되는 약을 짝지어 놓은 `drug_pair` 테이블이 있습니다. 
     
     
 - 3개의 테이블을 사용하여 **짝지어진 두번째 약의 처방 건수가 첫번째 약의 처방 건수보다 더 많은 첫번째 약의 약품명을 처방건수 순으로 출력**합니다.
    - drugs : drug_concept_id(첫번째약 번호), concept_name(약품명)
    - prescription_count : drug_concept_id(첫번째약 번호), cnt(처방건수)
    - drug_pair : drug_concept_id1(첫번째약 번호), drug_concept_id2(두번째약 번호)
    - 아래 쿼리를 활용하세요.
    
    ```sql
with drug_list as (
select distinct drug_concept_id, concept_name, count(*) as cnt from
synthea_cdm_1000.drug_exposure de
join synthea_cdm_1000.concept
on drug_concept_id = concept_id
where concept_id in (
40213154,19078106,19009384,40224172,19127663,1511248,40169216,1539463,
19126352,1539411,1332419,40163924,19030765,19106768,19075601)
group by drug_concept_id,concept_name
order by count(*) desc
)
, drugs as (select drug_concept_id, concept_name from drug_list)
, prescription_count as (select drug_concept_id, cnt from drug_list)
```

In [63]:
# hint에서 주어진 쿼리를 활용하여 drug_list 테이블 생성
# 이후 drug_list테이블 -> drugs, prescription_count 테이블이 생성된다.

In [67]:
%load_ext sql

In [23]:
%env DATABASE_URL=postgresql://username:password@port/database

env: DATABASE_URL=postgresql://username:password@port/database


In [69]:
%%sql drug_list <<

select distinct drug_concept_id, concept_name, count(*) as cnt from
drug_exposure de
join concept
on drug_concept_id = concept_id
where concept_id in (
40213154,19078106,19009384,40224172,19127663,1511248,40169216,1539463,
19126352,1539411,1332419,40163924,19030765,19106768,19075601)
group by drug_concept_id,concept_name
order by count(*) desc

15 rows affected.
Returning data to local variable drug_list


In [70]:
drug_list = drug_list.DataFrame()
drug_list

Unnamed: 0,drug_concept_id,concept_name,cnt
0,40213154,"Influenza, seasonal, injectable, preservative ...",8015
1,19078106,hydrochlorothiazide 25 MG Oral Tablet,3669
2,19009384,"insulin isophane, human 70 UNT/ML / insulin, r...",2825
3,40224172,amlodipine 5 MG / hydrochlorothiazide 12.5 MG ...,2814
4,19127663,atenolol 50 MG / chlorthalidone 25 MG Oral Tab...,2729
5,1511248,NDA020503 200 ACTUAT albuterol 0.09 MG/ACTUAT ...,1802
6,40169216,120 ACTUAT fluticasone propionate 0.044 MG/ACT...,1716
7,1539463,simvastatin 10 MG Oral Tablet,1681
8,19126352,nitroglycerin 0.4 MG/ACTUAT Mucosal Spray,1486
9,1539411,simvastatin 20 MG Oral Tablet,1318


In [73]:
# 총 164개의 drug_concept_id중에 15개만 추출됨.
print("전체 약의 갯수 : ",drug_list['drug_concept_id'].nunique())
print("분석 약의 갯수 : ",drug_exposure['drug_concept_id'].nunique())

전체 약의 갯수 :  15
분석 약의 갯수 :  164


In [75]:
# drug_list에서 필요한 칼럼만 각각 추출시
# drugs테이블과 prescription_count테이블이 생성된다.
drugs = drug_list[['drug_concept_id', 'concept_name']]
drugs

Unnamed: 0,drug_concept_id,concept_name
0,40213154,"Influenza, seasonal, injectable, preservative ..."
1,19078106,hydrochlorothiazide 25 MG Oral Tablet
2,19009384,"insulin isophane, human 70 UNT/ML / insulin, r..."
3,40224172,amlodipine 5 MG / hydrochlorothiazide 12.5 MG ...
4,19127663,atenolol 50 MG / chlorthalidone 25 MG Oral Tab...
5,1511248,NDA020503 200 ACTUAT albuterol 0.09 MG/ACTUAT ...
6,40169216,120 ACTUAT fluticasone propionate 0.044 MG/ACT...
7,1539463,simvastatin 10 MG Oral Tablet
8,19126352,nitroglycerin 0.4 MG/ACTUAT Mucosal Spray
9,1539411,simvastatin 20 MG Oral Tablet


In [76]:
prescription_count = drug_list[['drug_concept_id', 'cnt']]
prescription_count

Unnamed: 0,drug_concept_id,cnt
0,40213154,8015
1,19078106,3669
2,19009384,2825
3,40224172,2814
4,19127663,2729
5,1511248,1802
6,40169216,1716
7,1539463,1681
8,19126352,1486
9,1539411,1318


In [77]:
# 분석에 활용될 또다른 테이블 drug_pair는 원래 주어진 테이블이다.
drug_pair.head()

Unnamed: 0,drug_concept_id1,drug_concept_id2
0,40213154,19078106
1,19078106,40213154
2,19009384,19030765
3,40224172,40213154
4,19127663,19009384


In [78]:
# drug_pari테이블에 prescription_count의 정보를 left join으로 붙인다.
# drug_concept_id1와 drug_concept_id2의 count가 row별로 각각 표기되어야하므로
# 2번의 join(merge)를 거쳐야 한다.
drug_pair_count_df = \
pd.merge(drug_pair, prescription_count,
         left_on='drug_concept_id1',right_on='drug_concept_id',
         how='left').drop(['drug_concept_id'],axis=1)\
        .rename(columns={'cnt':'id1_count'})\
    .merge(prescription_count,
          left_on='drug_concept_id2', right_on='drug_concept_id')\
        .rename(columns={'cnt':'id2_count'}).drop(['drug_concept_id'],axis=1)
            
drug_pair_count_df

Unnamed: 0,drug_concept_id1,drug_concept_id2,id1_count,id2_count
0,40213154,19078106,8015,3669
1,40163924,19078106,1235,3669
2,19078106,40213154,3669,8015
3,40224172,40213154,2814,8015
4,19106768,40213154,1199,8015
5,19009384,19030765,2825,1214
6,1539463,19030765,1681,1214
7,19127663,19009384,2729,2825
8,19030765,19009384,1214,2825
9,1511248,40169216,1802,1716


In [79]:
# 2번째 약의 처방건수가 많은 경우만 필터링한다.
id2_win_df = \
drug_pair_count_df.loc[drug_pair_count_df['id2_count'] > drug_pair_count_df['id1_count']]

In [82]:
# 여기서 필요한 것은 첫번째약의 이름과 처방건수다. 해당 칼럼만 필터링 이후 정렬한다.
id2_win_id1_count_df = \
id2_win_df.merge(drugs,
                left_on='drug_concept_id1', right_on='drug_concept_id')\
                .drop(['drug_concept_id2', 'id2_count', 'drug_concept_id'],axis=1)

id2_win_id1_count_df.columns=['첫번째약_번호', '처방건수', '약품명']
id2_win_id1_count_df = id2_win_id1_count_df[['약품명', '처방건수']].sort_values(by=['처방건수'], ascending=False)
show(id2_win_id1_count_df)

Unnamed: 0,약품명,처방건수
1,hydrochlorothiazide 25 MG Oral Tablet,3669
2,amlodipine 5 MG / hydrochlorothiazide 12.5 MG / olmesartan medoxomil 20 MG Oral Tablet,2814
4,atenolol 50 MG / chlorthalidone 25 MG Oral Tablet [Tenoretic],2729
6,120 ACTUAT fluticasone propionate 0.044 MG/ACTUAT Metered Dose Inhaler,1716
7,simvastatin 20 MG Oral Tablet,1318
8,amlodipine 5 MG Oral Tablet,1247
0,24 HR metformin hydrochloride 500 MG Extended Release Oral Tablet,1235
5,1 ML epoetin alfa 4000 UNT/ML Injection [Epogen],1214
3,hydrochlorothiazide 12.5 MG Oral Tablet,1199
9,clopidogrel 75 MG Oral Tablet,1164


### 제 2형 당뇨병 진단 후 Metformin 90일 이상 복용 환자 수 추출
 -  제 2형 당뇨병을 진단받은 환자 중에
    - 당뇨환자의 condition_concept_id 는 다음을 사용합니다.
        - 3191208,36684827,3194332,3193274,43531010,4130162,45766052,45757474,4099651,4129519,4063043,4230254,4193704,4304377,201826,3194082,3192767
 - 18세 이상의 환자 중에
 - 진단을 받은 이후 Metformin을 90일 이상 복용한 환자수
     - i. drug_concept_id 는 40163924 를 사용합니다.

In [86]:
# 전체 환자 1000명 중 진단 테이블에 얼마나 존재하는지 확인
# -> 7명은 진단이 없었음.
# 전체 환자 1000명 중 7명은 진단이 없었다.
print("전체 환자 수 :",condition_occurrence['person_id'].nunique())
# 진단이 없는 환자 목록
print("진단명이 없는 환자 수 :",list(set(person['person_id']) - set(condition_occurrence['person_id'])))

전체 환자 수 : 993
진단명이 없는 환자 수 : [470594, 1737987, 218372, 709963, 2813715, 1638422, 215966]


In [87]:
# 전체 진단은 151종류이며, 제 2형당뇨병 진단은 주어진데로 17종류의 진다명을 사용한다.
print("전체 진단명 수 :",condition_occurrence['condition_concept_id'].nunique())
# 이 가운데 제2형 당뇨병 진단명은 17가지
diabetes_concept_ids = [3191208,36684827,3194332,3193274,43531010,4130162,45766052,45757474,4099651,4129519,4063043,4230254,4193704,4304377,201826,3194082,3192767]
print("제 2형 당뇨병 진단명 :",len(diabetes_concept_ids))

전체 진단명 수 : 151
제 2형 당뇨병 진단명 : 17


In [88]:
# 진단 데이터 중 제 2형 당뇨병 진단받은 데이터는 56가지이다.
diabetes_occurrence = condition_occurrence.loc[condition_occurrence['condition_concept_id'].isin(diabetes_concept_ids)]
print("제 2형 당뇨병 진단 데이터 수:",diabetes_occurrence.shape[0])

제 2형 당뇨병 진단 데이터 수: 56


In [89]:
diabetes_occurrence.head()

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_value,condition_source_concept_id,condition_status_source_value
135,18852812,1496597,201826,2003-06-25,2003-06-25,,NaT,32020,0,,,118098027,0,44054006,201826,
183,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,,25713132,0,44054006,201826,
253,28897593,2293584,201826,2018-10-07,2018-10-07,,NaT,32020,0,,,87633778,0,44054006,201826,
782,29253135,2321793,201826,2016-01-19,2016-01-19,,NaT,32020,0,,,40870122,0,44054006,201826,
991,17065310,1354811,201826,1970-04-24,1970-04-24,,NaT,32020,0,,,102058365,0,44054006,201826,


In [90]:
# 제 2형 당뇨병 진단받은 환자수도 56명이다.(중복 진단 데이터는 없음)
print("제 2형 당뇨병 진단 환자 수:",diabetes_occurrence['person_id'].nunique())

제 2형 당뇨병 진단 환자 수: 56


In [91]:
# 진단 정보에 person_id를 이용해 person테이블의 나이정보를 붙인다.
diabetes_with_person_data = \
pd.merge(diabetes_occurrence, person,
         on='person_id')

print(diabetes_with_person_data.shape)
diabetes_with_person_data.head()

(56, 33)


Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,...,location_id,provider_id_y,care_site_id,person_source_value,gender_source_value,gender_source_concept_id,race_source_value,race_source_concept_id,ethnicity_source_value,ethnicity_source_concept_id
0,18852812,1496597,201826,2003-06-25,2003-06-25,,NaT,32020,0,,...,,,,db904705-77c3-46d3-8aa6-61bd8937e46d,M,0,white,0,nonhispanic,0
1,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,,,,1262e880-c0f5-4035-9997-6ca6feaa788e,M,0,white,0,nonhispanic,0
2,28897593,2293584,201826,2018-10-07,2018-10-07,,NaT,32020,0,,...,,,,99851e74-1cfd-40cb-92f3-1955a5b997b0,F,0,asian,0,nonhispanic,0
3,29253135,2321793,201826,2016-01-19,2016-01-19,,NaT,32020,0,,...,,,,336de484-6a50-4467-bc84-bdb1ad54cff4,M,0,white,0,nonhispanic,0
4,17065310,1354811,201826,1970-04-24,1970-04-24,,NaT,32020,0,,...,,,,b8da5c1a-08eb-454e-a134-f838d83e4883,M,0,white,0,nonhispanic,0


In [92]:
# 출생년원일을 이용해서 만 나이를 계산(만18세이상)하기 위해, datetime의 타입을 바꾼다.
diabetes_with_person_data['birth_datetime'] = diabetes_with_person_data['birth_datetime'].apply(pd.to_datetime)

In [93]:
# 만나이 = 올해년도 - 출생년도  (-1 if 월,일 안지났으면)
def calculate_age(born):
    today = pd.Timestamp('today')
    age = today.year - born.year - ((today.month, today.day) < (born.month, born.day))
    return age

In [94]:
# 만 나이의 정보가 담긴 age칼럼을 생성한다.
diabetes_with_person_data['age'] = diabetes_with_person_data['birth_datetime'].apply( calculate_age ) 
# 18세 이상의 환자만 필터링한다.
# 제외되는 환자없이 56명 그대로 유지.
diabetes_with_person_higher_18 = diabetes_with_person_data.loc [ diabetes_with_person_data['age']>=18]
diabetes_with_person_higher_18.head()

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,...,provider_id_y,care_site_id,person_source_value,gender_source_value,gender_source_concept_id,race_source_value,race_source_concept_id,ethnicity_source_value,ethnicity_source_concept_id,age
0,18852812,1496597,201826,2003-06-25,2003-06-25,,NaT,32020,0,,...,,,db904705-77c3-46d3-8aa6-61bd8937e46d,M,0,white,0,nonhispanic,0,55
1,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,,,1262e880-c0f5-4035-9997-6ca6feaa788e,M,0,white,0,nonhispanic,0,76
2,28897593,2293584,201826,2018-10-07,2018-10-07,,NaT,32020,0,,...,,,99851e74-1cfd-40cb-92f3-1955a5b997b0,F,0,asian,0,nonhispanic,0,50
3,29253135,2321793,201826,2016-01-19,2016-01-19,,NaT,32020,0,,...,,,336de484-6a50-4467-bc84-bdb1ad54cff4,M,0,white,0,nonhispanic,0,36
4,17065310,1354811,201826,1970-04-24,1970-04-24,,NaT,32020,0,,...,,,b8da5c1a-08eb-454e-a134-f838d83e4883,M,0,white,0,nonhispanic,0,89


In [97]:
# 진단받은 이후의 drug_concept_id=40163924(Metformin)복용기록이 90일 이상 찾아야한다.
# 진단일 정보에 drug_exposure의 복용정보를 join한다.
diabetes_and_drugs = \
diabetes_with_person_higher_18.merge( drug_exposure,
    on=['person_id'], how='left'
)
# 당뇨병 진단환자들마다 복용기록이 붙어 데이터가 늘어난다. 12748 rows
print("데이터 수:", diabetes_and_drugs.shape[0])

(12748, 57)
56


In [114]:
# 먼저, Metformin 복용기록이 있는 데이터만 필터링한다.
metformin_mask = diabetes_and_drugs['drug_concept_id'].isin([40163924])
diabetes_and_metformin = diabetes_and_drugs.loc[metformin_mask]
diabetes_and_metformin.shape

(1235, 57)

In [115]:
diabetes_and_metformin.head()

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason_x,...,route_concept_id,lot_number,provider_id,visit_occurrence_id_y,visit_detail_id_y,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value,drug_exposure_total_days
38,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713156,0,860975,40163924,,,372 days
40,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713157,0,860975,40163924,,,372 days
42,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713158,0,860975,40163924,,,372 days
44,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713159,0,860975,40163924,,,372 days
46,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713160,0,860975,40163924,,,113 days


In [156]:
# 진단일이후로 복약시작하여 총 복약일수가 90일이 넘어야한다.
# 진단일 <= 복용시작일로 진단이후 복용한 날짜를 필터링해야한다.
# 이 때, 복용시작 ~ 복용종료일 사이에 진단이 내려질 수 있으니 그것도 고려해야한다.
#    1) 진단일 < 복용시작일
#    2) 복용시작일 <= 진단일 <= 복용종료일 : 진단일~복용종료일로 복용일수를 세야한다.
# 날짜칼럼들을 계산을 위해 type을 변경하고,  2가지 경우에 대해 mask를 만들고 필터링한다.

diabetes_and_metformin[['condition_start_date', 'drug_exposure_start_date', 'drug_exposure_end_date']] = \
diabetes_and_metformin[['condition_start_date', 'drug_exposure_start_date', 'drug_exposure_end_date']].apply(pd.to_datetime)

metformin_after_condition_mask = \
(diabetes_and_metformin['condition_start_date'] < diabetes_and_metformin['drug_exposure_start_date']) \

metformin_in_condition_mask = \
((diabetes_and_metformin['condition_start_date'] >= diabetes_and_metformin['drug_exposure_start_date'])
&    
(diabetes_and_metformin['condition_start_date'] <= diabetes_and_metformin['drug_exposure_end_date']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [157]:
metformin_after_diabetes = \
diabetes_and_metformin.loc[metformin_after_condition_mask|metformin_in_condition_mask]
print(metformin_after_diabetes.shape)
metformin_after_diabetes.head()

(1228, 57)


Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason_x,...,route_concept_id,lot_number,provider_id,visit_occurrence_id_y,visit_detail_id_y,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value,drug_exposure_total_days
38,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713156,0,860975,40163924,,,372 days
40,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713157,0,860975,40163924,,,372 days
42,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713158,0,860975,40163924,,,372 days
44,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713159,0,860975,40163924,,,372 days
46,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713160,0,860975,40163924,,,113 days


In [158]:
# 총 복용일 계산에 필요한 칼럼들만 추출한다.
metformin_after_diabetes = metformin_after_diabetes[['person_id','condition_start_date', 'drug_exposure_start_date','drug_exposure_end_date']] 
metformin_after_diabetes

Unnamed: 0,person_id,condition_start_date,drug_exposure_start_date,drug_exposure_end_date
38,31196,1983-02-03,2008-06-25,2009-07-01
40,31196,1983-02-03,2009-07-01,2010-07-07
42,31196,1983-02-03,2010-07-07,2011-07-13
44,31196,1983-02-03,2011-07-13,2012-07-18
46,31196,1983-02-03,2012-07-18,2012-11-07
...,...,...,...,...
12652,2170146,1970-02-18,2016-07-13,2017-07-19
12654,2170146,1970-02-18,2017-07-19,2018-07-25
12656,2170146,1970-02-18,2018-07-25,2019-07-31
12658,2170146,1970-02-18,2019-07-31,2020-04-22


In [161]:
# 2가지 case마다 총 복용일 계산법이 다르므로 mask마다 필터링해서 나눈 뒤,
# 각각 계산한다.
case1_df = metformin_after_diabetes.loc[metformin_after_condition_mask]
case1_df.shape

(1213, 4)

In [162]:
case2_df = metformin_after_diabetes.loc[metformin_in_condition_mask]
case2_df.shape

(15, 4)

In [149]:
case1_df.head()

Unnamed: 0,person_id,condition_start_date,drug_exposure_start_date,drug_exposure_end_date
38,31196,1983-02-03,2008-06-25,2009-07-01
40,31196,1983-02-03,2009-07-01,2010-07-07
42,31196,1983-02-03,2010-07-07,2011-07-13
44,31196,1983-02-03,2011-07-13,2012-07-18
46,31196,1983-02-03,2012-07-18,2012-11-07


In [163]:
# case1) 진단일 < 복용시작일인 경우, 복용시작일 ~ 복용종료일을 다 계산한다
case1_df['metformin_복용일']=\
case1_df['drug_exposure_end_date'] - case1_df['drug_exposure_start_date']  + pd.to_timedelta(1, unit='d')

case1_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case1_df['metformin_복용일']=\


Unnamed: 0,person_id,condition_start_date,drug_exposure_start_date,drug_exposure_end_date,metformin_복용일
38,31196,1983-02-03,2008-06-25,2009-07-01,372 days
40,31196,1983-02-03,2009-07-01,2010-07-07,372 days
42,31196,1983-02-03,2010-07-07,2011-07-13,372 days
44,31196,1983-02-03,2011-07-13,2012-07-18,372 days
46,31196,1983-02-03,2012-07-18,2012-11-07,113 days


In [164]:
case2_df.head()

Unnamed: 0,person_id,condition_start_date,drug_exposure_start_date,drug_exposure_end_date
1360,609120,1988-03-27,1988-03-26,1989-04-01
1770,2143829,1994-07-20,1994-07-19,1995-07-25
1940,487607,2016-11-29,2016-11-28,2017-12-04
1963,1826955,2018-07-25,2018-07-25,2018-07-25
1964,1826955,2018-07-25,2018-07-25,2019-07-31


In [165]:
# case2)  복용시작 <= 진단일 <= 복용종료인 경우, 진단일~복용종료일까지 계산한다.
case2_df['metformin_복용일']=\
case2_df['drug_exposure_end_date'] - case2_df['condition_start_date']  + pd.to_timedelta(1, unit='d')

case2_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case2_df['metformin_복용일']=\


Unnamed: 0,person_id,condition_start_date,drug_exposure_start_date,drug_exposure_end_date,metformin_복용일
1360,609120,1988-03-27,1988-03-26,1989-04-01,371 days
1770,2143829,1994-07-20,1994-07-19,1995-07-25,371 days
1940,487607,2016-11-29,2016-11-28,2017-12-04,371 days
1963,1826955,2018-07-25,2018-07-25,2018-07-25,1 days
1964,1826955,2018-07-25,2018-07-25,2019-07-31,372 days


In [167]:
# 2가지 배반 case를 합친다.
total_metformin_after_diabetes = pd.concat([case1_df, case2_df])

In [172]:
# 환자별로 metformin_복용일을 총 복용일을 합산한 series를 생성한다.
total_metformin_after_diabetes_count_series = \
total_metformin_after_diabetes.groupby(['person_id'])['metformin_복용일'].sum()
# 90일이상 복용한 환자만 필터링 한다.
total_metformin_after_diabetes_count_than_90_series = \
total_metformin_after_diabetes_count_series.loc[total_metformin_after_diabetes_count_series.dt.days >=90]

# 90일이상 복용한 환자들의 id값들을 list로 변환하여 추출한다.
print("Metformin을 90일 이상 복용한 환자 수 :",len(total_metformin_after_diabetes_count_than_90_series.index.unique()))
# 90일이상 복용한 환자들의 id값들을 list로 변환하여 추출한다.
total_metformin_after_diabetes_count_than_90_list = total_metformin_after_diabetes_count_than_90_series.index.tolist()
print("Metformin을 90일 이상 복용한 환자의 person_id 리스트 : \n",total_metformin_after_diabetes_count_than_90_list)

Metformin을 90일 이상 복용한 환자 수 : 30
Metformin을 90일 이상 복용한 환자의 person_id 리스트 : 
 [31196, 50663, 67212, 176640, 478532, 487607, 495973, 510173, 531690, 537462, 609120, 843873, 892185, 909084, 1102377, 1134605, 1317600, 1444791, 1578321, 1743075, 1819367, 1821726, 1826955, 2074366, 2143829, 2170146, 2400845, 2452672, 2537704, 2694671]


### 제 2형 당뇨병 진단 환자의 의약품 처방 변경 패턴
 - 전 문제의 첫번째 항목(제 2형 당뇨병을 진단받은 환자)에서 추출한 환자군의 의약품 처방이 변경된 패턴을 추출하고, 그 빈도의 내림차순으로 나열합니다.
    - drug_concept_id는 다음을 사용합니다.
        - digoxin: 19018935
        - simvastatin: 1539411,1539463
        - clopidogrel: 19075601
        - naproxen: 1115171
 - 처방 패턴의 예시는 a->b->c 과 같이 나타낼 수 있습니다.
    - 예를 들어, a->a->b->c->c 는 위 패턴에 해당합니다. 그러나 a->b->a->b->c 는 위 패턴에 해당하지 않습니다.
    - 처방 패턴의 빈도는 a->b->c 와 같은 패턴의 처방을 받은 환자가 몇명인지로 정의합니다.

 - 같은 날 처방된 약은 한 그룹으로 묶습니다.
     - 괄호가 같은 날의 처방을 나타낸다면, 처방 패턴은 (a, b)->c 와 같이나타낼 수 있습니다.

 - 데이터에 나타나는 모든 패턴의 빈도를 집계하고, 결과 예시는 다음과같습니다.
     ![image-20210731230011126](https://raw.githubusercontent.com/is3js/screenshots/main/image-20210731230011126.png)

In [173]:
print(diabetes_occurrence.shape)
diabetes_occurrence.head()

(56, 16)


Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_value,condition_source_concept_id,condition_status_source_value
135,18852812,1496597,201826,2003-06-25,2003-06-25,,NaT,32020,0,,,118098027,0,44054006,201826,
183,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,,25713132,0,44054006,201826,
253,28897593,2293584,201826,2018-10-07,2018-10-07,,NaT,32020,0,,,87633778,0,44054006,201826,
782,29253135,2321793,201826,2016-01-19,2016-01-19,,NaT,32020,0,,,40870122,0,44054006,201826,
991,17065310,1354811,201826,1970-04-24,1970-04-24,,NaT,32020,0,,,102058365,0,44054006,201826,


In [174]:
# 제 2형 당뇨병 환자 데이터에 복용 정보를 join하고, 해당 의약품 4가지 정보만 필터링 한다.
# digoxin: 19018935
# simvastatin: 1539411,1539463
# clopidogrel: 19075601
# naproxen: 1115171
diabetes_with_four_drugs = \
diabetes_occurrence.merge( drug_exposure,
    on=['person_id'], how='left'
)
four_drugs_mask = diabetes_with_four_drugs['drug_concept_id'].isin([19018935, 1539411, 1539463, 19075601, 1115171])
diabetes_with_four_drugs = diabetes_with_four_drugs.loc[four_drugs_mask]

print(diabetes_with_four_drugs.shape)
diabetes_with_four_drugs.head()

(1577, 39)


Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason_x,...,route_concept_id,lot_number,provider_id_y,visit_occurrence_id_y,visit_detail_id_y,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value,drug_exposure_total_days
21,18852812,1496597,201826,2003-06-25,2003-06-25,,NaT,32020,0,,...,0,0,0,118098012,0,849574,1115171,,,1 days
39,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713092,0,314231,1539463,,,366 days
41,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713093,0,314231,1539463,,,366 days
43,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713094,0,314231,1539463,,,366 days
45,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713096,0,314231,1539463,,,366 days


In [183]:
# 약품을 concept_id -> a,b,c,d로 매핑한다.
# digoxin(19018935) -> a  
# simvastatin(1539411,1539463) -> b
# clopidogrel(19075601) -> c
# naproxen(1115171) -> d
four_drug_mapping = {
    19018935 : 'a',
    1539411 : 'b',
    1539463 : 'b',
    19075601 : 'c',
    1115171 : 'd',
}
diabetes_with_four_drugs['drug_concept_id'] = diabetes_with_four_drugs['drug_concept_id'].map( four_drug_mapping )
diabetes_with_four_drugs['drug_concept_id'].value_counts()

b    726
c    528
a    303
d     20
Name: drug_concept_id, dtype: int64

In [184]:
# 복용시작일 순으로 패턴을 파악해야한다. 
# datetime 형태면 index관리가 어려우므로 복용시작일 칼럼의 type을 string으로 바꿔준다.
diabetes_with_four_drugs['drug_exposure_start_date'] = diabetes_with_four_drugs['drug_exposure_start_date'].astype(str)

In [24]:
# 환자별 복용시작일별 약물의 패턴들을 먼저 발견해야한다.
# 환자별, 복용시작일별 예시 데이터를 하나 뽑아본다.
grouped = dict(list(diabetes_with_four_drugs.groupby(['person_id', 'drug_exposure_start_date'])))

# 아래에러는 제출전 DB관련 정보 삭제후 주석정리를 위해 실행시켜 난 에러임.

NameError: name 'diabetes_with_four_drugs' is not defined

In [186]:
# 환자별, 날짜마다 처방약을 찾는다. 처방약이 1개면 그대로 반환하면 된지만, 2개 이상인 경우, 중복제거하고 모아두어야한다.
one_drug_person = grouped[(31196, '1997-05-14')]
one_drug_person

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason_x,...,route_concept_id,lot_number,provider_id_y,visit_occurrence_id_y,visit_detail_id_y,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value,drug_exposure_total_days
114,391684,31196,201826,1983-02-03,1983-02-03,,NaT,32020,0,,...,0,0,0,25713079,0,314231,1539463,,,8031 days


In [187]:
one_drug_person = grouped[(31196, '1997-05-14')][['drug_concept_id', 'drug_exposure_start_date', 'drug_exposure_end_date']]
one_drug_person

Unnamed: 0,drug_concept_id,drug_exposure_start_date,drug_exposure_end_date
114,b,1997-05-14,2019-05-09


In [188]:
# 처방약이 2개 이상이면, 중복검사 + ( , ) 튜플형태로 만들어야한다.
two_more_drug_person = grouped[(2833968, '2019-09-09')]
two_more_drug_person

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason_x,...,route_concept_id,lot_number,provider_id_y,visit_occurrence_id_y,visit_detail_id_y,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value,drug_exposure_total_days
9390,35704969,2833968,201826,1947-01-06,1947-01-06,,NaT,32020,0,,...,0,0,0,48525400,0,309362,19075601,,,36 days
9393,35704969,2833968,201826,1947-01-06,1947-01-06,,NaT,32020,0,,...,0,0,0,48525400,0,197604,19018935,,,36 days
10147,35704969,2833968,201826,1947-01-06,1947-01-06,,NaT,32020,0,,...,0,0,0,48525400,0,312961,1539411,,,36 days


In [189]:
# 처방약이 2개 이상이면, 중복검사 + ( , ) 튜플형태로 만들어야한다.
two_more_drug_person = grouped[(2833968, '2019-09-09')][['drug_concept_id', 'drug_exposure_start_date', 'drug_exposure_end_date']]
two_more_drug_person

Unnamed: 0,drug_concept_id,drug_exposure_start_date,drug_exposure_end_date
9390,c,2019-09-09,2019-10-14
9393,a,2019-09-09,2019-10-14
10147,b,2019-09-09,2019-10-14


In [190]:
# 환자별, 복용시작일별로 정리된 상태에서 약의 종류가 2가지 이상인 경우, 
# 약물들을 뽑아서 중복을 제거하고 모아두어야한다.
set(two_more_drug_person['drug_concept_id'].values)

{'a', 'b', 'c'}

In [191]:
# 환자별-복용시작일별 같은날에 복용한 의약품은 순서대로 모아두고, 콤마(,)로 묶어서 string으로 반환되도록 한다.
def set_drugs_today(agg_df):
    return ', '.join(sorted(list(set(agg_df['drug_concept_id'].values))))

In [196]:
patterns =diabetes_with_four_drugs.groupby(['person_id', 'drug_exposure_start_date']).apply( set_drugs_today )
patterns

person_id  drug_exposure_start_date
31196      1997-05-14                        b
           1998-05-14                        b
           1999-05-14                        b
           2000-05-13                        b
           2001-05-13                        b
                                        ...   
2833968    2019-09-09                  a, b, c
           2019-10-14                  a, b, c
           2019-11-18                  a, b, c
           2019-12-09                  a, b, c
           2020-01-06                  a, b, c
Length: 869, dtype: object

In [197]:
# index를 제거하면서 당일 처방받은 의약품들을 시간순으로 연결할 준비를 한다.
patterns = patterns.reset_index().rename(columns={0:'pattern'})
patterns

Unnamed: 0,person_id,drug_exposure_start_date,pattern
0,31196,1997-05-14,b
1,31196,1998-05-14,b
2,31196,1999-05-14,b
3,31196,2000-05-13,b
4,31196,2001-05-13,b
...,...,...,...
864,2833968,2019-09-09,"a, b, c"
865,2833968,2019-10-14,"a, b, c"
866,2833968,2019-11-18,"a, b, c"
867,2833968,2019-12-09,"a, b, c"


In [440]:
# 패턴에 null이 없는지 검사한다.
patterns[0].fillna('None').value_counts()

b, c       346
b          200
a, b, c    168
a          129
d           20
c            5
a, b         1
Name: 0, dtype: int64

In [198]:
# 환자별로 묶은 다음, pattern예시를 확인한다.
grouped = dict(list(patterns.groupby(['person_id'])))
# grouped.keys()

dict_keys([31196, 50663, 67212, 170280, 347825, 478532, 487607, 537462, 609120, 631283, 843873, 892185, 939449, 1116539, 1444791, 1496597, 1537987, 1737916, 1743075, 1891866, 1979909, 2074366, 2170146, 2369278, 2452672, 2537704, 2577809, 2694671, 2833968])

In [199]:
grouped[347825]

Unnamed: 0,person_id,drug_exposure_start_date,pattern
28,347825,1999-11-15,d
29,347825,2008-10-16,"b, c"
30,347825,2008-11-20,"b, c"
31,347825,2008-12-11,b
32,347825,2009-01-29,"b, c"
...,...,...,...
118,347825,2020-01-23,"b, c"
119,347825,2020-02-27,"b, c"
120,347825,2020-04-02,"b, c"
121,347825,2020-04-23,"b, c"


In [200]:
grouped[347825]['pattern'].values

array(['d', 'b, c', 'b, c', 'b', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c',
       'b', 'b, c', 'b, c', 'b, c', 'b', 'b, c', 'b, c', 'b, c', 'b, c',
       'b, c', 'b', 'b, c', 'b, c', 'b, c', 'b, c', 'b', 'b, c', 'b, c',
       'b, c', 'b, c', 'b, c', 'b', 'b, c', 'b, c', 'b, c', 'b', 'b, c',
       'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c',
       'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c',
       'b, c', 'b, c', 'b, c', 'b', 'b, c', 'b, c', 'b, c', 'b, c',
       'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b', 'b, c',
       'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c',
       'b', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c',
       'b, c', 'b, c', 'b, c', 'b, c', 'b', 'b, c', 'b, c', 'b, c',
       'b, c', 'b, c', 'b, c'], dtype=object)

In [467]:
# np.array(list('bbcabbaa'))

array(['b', 'b', 'c', 'a', 'b', 'b', 'a', 'a'], dtype='<U1')

In [494]:
# # 직전과 중복을 허용하지 않도록 데이터를 모은다.
# test_list = np.array(['d', 'b, c', 'b, c', 'b', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c',
#        'b', 'b, c', 'b, c', 'b, c', 'b', 'b, c', 'b, c', 'b, c', 'b, c',
#        'b, c', 'b', 'b, c', 'b, c', 'b, c', 'b, c', 'b', 'b, c', 'b, c',
#        'b, c', 'b, c', 'b, c', 'b', 'b, c', 'b, c', 'b, c', 'b', 'b, c',
#        'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c',
#        'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c',
#        'b, c', 'b, c', 'b, c', 'b', 'b, c', 'b, c', 'b, c', 'b, c',
#        'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b', 'b, c',
#        'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c',
#        'b', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c', 'b, c',
#        'b, c', 'b, c', 'b, c', 'b, c', 'b', 'b, c', 'b, c', 'b, c',
#        'b, c', 'b, c', 'b, c'])

# list_ = []
# for i in range(len(test_list)):
#     if i == 0 or (i>1 and test_list[i] != test_list[i-1]):
#         if len(test_list[i]) > 1:
#             list_.append( '(' + test_list[i] + ')' )
#         else:
#             list_.append(test_list[i])
        
# list_

['d',
 'b',
 '(b, c)',
 'b',
 '(b, c)',
 'b',
 '(b, c)',
 'b',
 '(b, c)',
 'b',
 '(b, c)',
 'b',
 '(b, c)',
 'b',
 '(b, c)',
 'b',
 '(b, c)',
 'b',
 '(b, c)',
 'b',
 '(b, c)',
 'b',
 '(b, c)']

In [201]:
# 순서대로 의약품들을 모을 때, 아래와 같은 함수를 작성한다.
# 1) 앞에 것과 동일한 의약품은 건너띔
# 2) 2개이상의 의약품일 경우 괄호로 묶음
# 3) 1),2)를 만족하면서 화살표(->)로 묶는다.
def sum_no_duplicated(s):
    list_ = s.values
    new_list_ = []
    for i in range(len(list_)):
        if i == 0 or (i>1 and list_[i] != list_[i-1]):
            if len(list_[i]) > 1:
                new_list_.append( '(' + list_[i] + ')' )
            else:
                new_list_.append(list_[i])
    return '->'.join(new_list_)

In [202]:
# 환자_id(index)별로 pattern들이 나타나도록 DataFrame을 만든다.
patterns = patterns.groupby(['person_id'])['pattern'].apply(sum_no_duplicated).to_frame('pattern')
patterns

Unnamed: 0_level_0,pattern
person_id,Unnamed: 1_level_1
31196,b->d->b
50663,d
67212,d
170280,d
347825,"d->b->(b, c)->b->(b, c)->b->(b, c)->b->(b, c)-..."
478532,"b->(b, c)"
487607,b->d->b
537462,d->b
609120,b
631283,b


In [204]:
# 패턴별로 환자수를 value_count()를 활용해서 센다.
pattern_count = patterns['pattern'].value_counts().sort_values(ascending=False).to_frame('person_count')
pattern_count.index.name='pattern'
show(pattern_count)

Unnamed: 0_level_0,person_count
pattern,Unnamed: 1_level_1
d,10
b->d->b,4
b,4
"(b, c)",2
b->c,1
d->b,1
"b->(b, c)",1
b->d,1
a->d->a,1
"b->c->(b, c)->a->(a, b, c)->c->(a, b, c)->c->(a, b, c)",1


### note에서 데이터 추출
 - 제공되는 clinical note 테이블은 한 환자의 의료 기록 샘플입니다. 
 - note로부터 정보를 추출하여 아래의 4개의 테이블에 입력합니다. 추출/입력에 사용한 코드와 결과테이블을 제출해야 합니다.
     - 제공된 note에서 "CONTINUING" 부분 제외
     ![image-20210801003033395](https://raw.githubusercontent.com/is3js/screenshots/main/image-20210801003033395.png)
 
 - 아래 제공되는 스키마에 테이블 생성
     - 테이블 생성 위치는 지원자의 스키마 아래
 - note 에서 추출한 정보를 테이블에 입력
     - sql, python 등 지원자가 익숙한 언어 사용하여 추출
     - 테이블에 입력 시 아래 규칙 적용
        - 규칙1: 내원일자, 처방일자 >= 환자의 생년월일
        - 규칙2: 환자 id는 랜덤하게 중복없이 부여
        - 규칙3: 각 테이블의 id는 랜덤하게 중복없이 부여
 - note에서 정보가 정확히 추출되었는지 검토
     - 예: note에서 발견된 질병 키워드의 개수와 condition_occurrence 에 저장된 행 수의 비교

In [510]:
sample_note = clinical_note['note'][0]

In [511]:
import pprint

pprint.pprint(sample_note)

('Andrea7 Wolf938\n'
 'Race:                White\n'
 'Ethnicity:           Non-Hispanic\n'
 'Gender:              M\n'
 'Age:                 55\n'
 'Birth Date:          1965-04-22\n'
 'Marital Status:      M\n'
 '--------------------------------------------------------------------------------\n'
 'ALLERGIES:\n'
 'No Known Allergies\n'
 '--------------------------------------------------------------------------------\n'
 'ENCOUNTER\n'
 '2011-06-20 : Encounter at Cape Cod Vet Center : Encounter for Acute '
 'bronchitis (disorder)\n'
 'Type: ambulatory\n'
 '   \n'
 '   MEDICATIONS:\n'
 '  2011-06-20 : Acetaminophen 325 MG Oral Tablet for Acute bronchitis '
 '(disorder)\n'
 '   \n'
 '   CONDITIONS:\n'
 '  2011-06-20 : Acute bronchitis (disorder)\n'
 '   \n'
 '   CARE PLANS:\n'
 '  2011-06-20 : Respiratory therapy\n'
 '                         Reason: Acute bronchitis (disorder)\n'
 '                         Activity: Recommendation to avoid exercise\n'
 '                         Activit

In [515]:
sample_note_splits = \
sample_note.split('--------------------------------------------------------------------------------\n')

len(sample_note_splits)

5

In [516]:
# 뒤에서 2번째 것만 제외하면 됨.
sample_note_splits[-2]

'CONTINUING\n   \n   CONDITIONS:\n  1977-04-10 : Chronic sinusitis (disorder)\n  1984-06-07 : Major depression disorder\n  1996-07-04 : Body mass index 30+ - obesity (finding)\n   \n   MEDICATIONS:\n  1984-06-07 + FLUoxetine 20 MG Oral Capsule for Major depression, single episode\n   \n   CAREPLANS:\n  1984-06-07 : Mental health care plan\n                         Reason: Major depression, single episode\n                         Activity: Initial psychiatric interview with mental status and evaluation\n  1984-06-07 : Psychiatry care plan\n                         Reason: Major depression disorder\n                         Activity: Psychiatric Follow-up\n                         Activity: Coping Support Management\n                         Activity: Suicide Risk Assessment\n                         Activity: Cognitive behavioral therapy by multidisciplinary team\n                         Activity: Psychodynamic Interpersonal Therapy\n   \n'

In [522]:
# pandas로 split후 제외할 부분 제외시키기
split_str = '--------------------------------------------------------------------------------\n'

note_df = clinical_note['note'].str.split(split_str, expand=True)
note_df.head()

Unnamed: 0,0,1,2,3,4
0,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2011-06-20 : Encounter at Cape Cod ...,CONTINUING\n \n CONDITIONS:\n 1977-04-10 ...,
1,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2015-04-23 : Encounter at Cape Cod ...,CONTINUING\n \n CONDITIONS:\n 1977-04-10 ...,
2,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2015-04-23 : Encounter at Cape Cod ...,CONTINUING\n \n CONDITIONS:\n 1977-04-10 ...,
3,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2013-05-16 : Encounter at Cape Cod ...,CONTINUING\n \n CONDITIONS:\n 1977-04-10 ...,
4,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2011-05-12 : Encounter at Cape Cod ...,CONTINUING\n \n CONDITIONS:\n 1977-04-10 ...,


In [523]:
note_df = note_df.drop([3,4],axis=1)
note_df.head()

Unnamed: 0,0,1,2
0,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2011-06-20 : Encounter at Cape Cod ...
1,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2015-04-23 : Encounter at Cape Cod ...
2,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2015-04-23 : Encounter at Cape Cod ...
3,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2013-05-16 : Encounter at Cape Cod ...
4,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2011-05-12 : Encounter at Cape Cod ...


In [524]:
note_df.columns=['person_info', 'allergies_info', 'encounter_info']
note_df.head()

Unnamed: 0,person_info,allergies_info,encounter_info
0,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2011-06-20 : Encounter at Cape Cod ...
1,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2015-04-23 : Encounter at Cape Cod ...
2,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2015-04-23 : Encounter at Cape Cod ...
3,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2013-05-16 : Encounter at Cape Cod ...
4,Andrea7 Wolf938\n===============\nRace: ...,ALLERGIES:\nNo Known Allergies\n,ENCOUNTER\n2011-05-12 : Encounter at Cape Cod ...


In [527]:
sample_person_info = note_df.iloc[0].values[0]
sample_person_info



In [582]:
# 정규표현식 버전
sample_person_str = [ x for x in sample_person_info.split('\n') if len(x) > 1]

import re
 
    
for line in sample_person_str:
    match = re.match( "([A-Z]{1}.*):\s*(\S*)", line )
    if match != None:
        print(match.group(1),' - ', match.group(2))

    

Race  -  White
Ethnicity  -  Non-Hispanic
Gender  -  M
Age  -  55
Birth Date  -  1965-04-22
Marital Status  -  M


In [546]:
# list 버전
[ x.lower().split(':') for x in sample_person_info.replace(' ','').split('\n') if ':' in x  ]

[['race', 'white'],
 ['ethnicity', 'non-hispanic'],
 ['gender', 'm'],
 ['age', '55'],
 ['birthdate', '1965-04-22'],
 ['maritalstatus', 'm']]

In [588]:
sample_allergies_info = note_df.iloc[0].values[1]
sample_allergies_str= [ x for x in sample_allergies_info.split('\n') if len(x) > 1]

[ x.replace(':','') for x in sample_allergies_str ]

['ALLERGIES', 'No Known Allergies']

In [593]:
sample_encounter_info = note_df.iloc[0].values[2]
# 정규표현식 버전
sample_encounter_str = [ x.strip() for x in sample_encounter_info.split('\n') if len(x.strip()) > 1]
sample_encounter_str
# import re
    
# for line in sample_person_str:
#     match = re.match( "([A-Z]{1}.*):\s*(\S*)", line )
#     if match != None:
#         print(match.group(1),' - ', match.group(2))

['ENCOUNTER',
 '2011-06-20 : Encounter at Cape Cod Vet Center : Encounter for Acute bronchitis (disorder)',
 'Type: ambulatory',
 'MEDICATIONS:',
 '2011-06-20 : Acetaminophen 325 MG Oral Tablet for Acute bronchitis (disorder)',
 'CONDITIONS:',
 '2011-06-20 : Acute bronchitis (disorder)',
 'CARE PLANS:',
 '2011-06-20 : Respiratory therapy',
 'Reason: Acute bronchitis (disorder)',
 'Activity: Recommendation to avoid exercise',
 'Activity: Deep breathing and coughing exercises',
 'REPORTS:',
 'OBSERVATIONS:',
 'PROCEDURES:',
 '2011-06-20 : Sputum examination (procedure) for Acute bronchitis (disorder)',
 'IMMUNIZATIONS:',
 'IMAGING STUDIES:']

In [567]:
sample_encounter_info = note_df.iloc[0].values[2]

[ x.strip() for x in sample_encounter_info.split('\n') if len(x.strip())>1]

# [ x.lower().replace(':','') for x in sample_allergies_info.replace(' ','').split('\n')[:-1] ]

['ENCOUNTER',
 '2011-06-20 : Encounter at Cape Cod Vet Center : Encounter for Acute bronchitis (disorder)',
 'Type: ambulatory',
 'MEDICATIONS:',
 '2011-06-20 : Acetaminophen 325 MG Oral Tablet for Acute bronchitis (disorder)',
 'CONDITIONS:',
 '2011-06-20 : Acute bronchitis (disorder)',
 'CARE PLANS:',
 '2011-06-20 : Respiratory therapy',
 'Reason: Acute bronchitis (disorder)',
 'Activity: Recommendation to avoid exercise',
 'Activity: Deep breathing and coughing exercises',
 'REPORTS:',
 'OBSERVATIONS:',
 'PROCEDURES:',
 '2011-06-20 : Sputum examination (procedure) for Acute bronchitis (disorder)',
 'IMMUNIZATIONS:',
 'IMAGING STUDIES:']