# 병원 외래 방문 데이터 분석

> 원본 데이터

In [9]:
import pandas as pd
import numpy as np

# Patient master table: one row per patient; the third patient's age is
# missing (NaN).
patients = pd.DataFrame(
    dict(
        patient_id=list(range(1, 7)),
        sex=["F", "M"] * 3,
        age=[34, 58, np.nan, 45, 72, 29],
        city=["Seoul", "Busan", "Seoul", "Incheon", "Busan", "Seoul"],
    )
)

# Visit log: one row per visit. patient_id 7 has no row in `patients`,
# and patient 6 has no row here.
visits = pd.DataFrame(
    dict(
        visit_id=list(range(1001, 1009)),
        patient_id=[1, 1, 2, 3, 4, 5, 5, 7],
        dept=["IM", "ENT", "IM", "DERM", "IM", "ENT", "IM", "IM"],
        # All visits fall in January 2026; only the day of month varies.
        visit_date=pd.to_datetime(
            [f"2026-01-{day:02d}" for day in (2, 10, 3, 4, 11, 5, 12, 8)]
        ),
        cost=[12000, 18000, 25000, 15000, 22000, 16000, 30000, 21000],
    )
)

# Render both source tables.
for frame in (patients, visits):
    display(frame)

Unnamed: 0,patient_id,sex,age,city
0,1,F,34.0,Seoul
1,2,M,58.0,Busan
2,3,F,,Seoul
3,4,M,45.0,Incheon
4,5,F,72.0,Busan
5,6,M,29.0,Seoul


Unnamed: 0,visit_id,patient_id,dept,visit_date,cost
0,1001,1,IM,2026-01-02,12000
1,1002,1,ENT,2026-01-10,18000
2,1003,2,IM,2026-01-03,25000
3,1004,3,DERM,2026-01-04,15000
4,1005,4,IM,2026-01-11,22000
5,1006,5,ENT,2026-01-05,16000
6,1007,5,IM,2026-01-12,30000
7,1008,7,IM,2026-01-08,21000


---

### 1. 환자 데이터 체크

- 각 행별 결측치 개수를 row_na_cnt 칼럼으로 추가하고
- 각 열별 결측치 개수를 col_na_cnt 로 계산하기

In [11]:
# (1) Missing values per row.
# `assign` returns a new frame, so `patients` itself is left unchanged; the
# lambda receives the frame before the new column is attached.
patients_q1 = patients.assign(
    row_na_cnt=lambda frame: frame.isna().sum(axis="columns")
)

# (2) Missing values per column.
# NOTE: this runs after row_na_cnt has been added, so the result also
# carries an entry for row_na_cnt.
col_na_cnt = patients_q1.isna().sum(axis="index")

print("각 행별 결측치 개수 열 추가:")
display(patients_q1)

print("\n각 열별 결측치 개수 계산:")
col_na_cnt

각 행별 결측치 개수 열 추가:


Unnamed: 0,patient_id,sex,age,city,row_na_cnt
0,1,F,34.0,Seoul,0
1,2,M,58.0,Busan,0
2,3,F,,Seoul,1
3,4,M,45.0,Incheon,0
4,5,F,72.0,Busan,0
5,6,M,29.0,Seoul,0



각 열별 결측치 개수 계산:


patient_id    0
sex           0
age           1
city          0
row_na_cnt    0
dtype: int64

---

### 2. 환자별 진료비 요약 및 상위 환자 찾기

- 환자 데이터를 기준으로 환자 데이터와 방문 데이터를 한눈에 볼 수 있게 병합하기
- 환자별 총 진료비(sum)와 방문 횟수(count) 구하기

In [12]:
# Left merge keyed on patient_id with `patients` as the left table: every
# patient row is kept, and visit columns are filled with missing values
# when a patient has no visit.
pd.merge(patients, visits, how="left", on="patient_id")

Unnamed: 0,patient_id,sex,age,city,visit_id,dept,visit_date,cost
0,1,F,34.0,Seoul,1001.0,IM,2026-01-02,12000.0
1,1,F,34.0,Seoul,1002.0,ENT,2026-01-10,18000.0
2,2,M,58.0,Busan,1003.0,IM,2026-01-03,25000.0
3,3,F,,Seoul,1004.0,DERM,2026-01-04,15000.0
4,4,M,45.0,Incheon,1005.0,IM,2026-01-11,22000.0
5,5,F,72.0,Busan,1006.0,ENT,2026-01-05,16000.0
6,5,F,72.0,Busan,1007.0,IM,2026-01-12,30000.0
7,6,M,29.0,Seoul,,,NaT,


In [15]:
# Repeat the patients-based left merge from the previous cell, then
# aggregate per patient:
#   total_cost - sum of `cost` (a patient with no visits ends up with 0.0)
#   visit_cnt  - number of non-null `visit_id` values
patient_summary = pd.merge(
    patients, visits, how="left", on="patient_id"
).groupby("patient_id").agg(
    total_cost=pd.NamedAgg(column="cost", aggfunc="sum"),
    visit_cnt=pd.NamedAgg(column="visit_id", aggfunc="count"),
)

patient_summary

Unnamed: 0_level_0,total_cost,visit_cnt
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,30000.0,2
2,25000.0,1
3,15000.0,1
4,22000.0,1
5,46000.0,2
6,0.0,0
