### study.csv & primary.csv 합치기

In [1]:
import pandas as pd

In [2]:
# Locations of the two cohort CSV files, relative to the working directory.
study_PATH = r'./data/s41598-020-73558-3_sepsis_survival_study_cohort.csv'
primary_PATH = r'./data/s41598-020-73558-3_sepsis_survival_primary_cohort.csv'

# Read each cohort into its own DataFrame (study first, then primary).
studyDF, primyDF = (
    pd.read_csv(csv_path) for csv_path in (study_PATH, primary_PATH)
)

# Sanity check: first five rows of each frame, separated by one blank line.
print(studyDF.head(), primyDF.head(), sep='\n\n')

   age_years  sex_0male_1female  episode_number  hospital_outcome_1alive_0dead
0          7                  1               1                              1
1         17                  0               2                              1
2         70                  0               1                              1
3         76                  0               1                              1
4          8                  0               1                              1

   age_years  sex_0male_1female  episode_number  hospital_outcome_1alive_0dead
0         21                  1               1                              1
1         20                  1               1                              1
2         21                  1               1                              1
3         77                  0               1                              1
4         72                  0               1                              1


In [3]:
# Print the study cohort summary: index range, columns, non-null counts, dtypes, memory usage.
studyDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19051 entries, 0 to 19050
Data columns (total 4 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   age_years                      19051 non-null  int64
 1   sex_0male_1female              19051 non-null  int64
 2   episode_number                 19051 non-null  int64
 3   hospital_outcome_1alive_0dead  19051 non-null  int64
dtypes: int64(4)
memory usage: 595.5 KB


In [4]:
# Print the primary cohort summary: index range, columns, non-null counts, dtypes, memory usage.
primyDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110204 entries, 0 to 110203
Data columns (total 4 columns):
 #   Column                         Non-Null Count   Dtype
---  ------                         --------------   -----
 0   age_years                      110204 non-null  int64
 1   sex_0male_1female              110204 non-null  int64
 2   episode_number                 110204 non-null  int64
 3   hospital_outcome_1alive_0dead  110204 non-null  int64
dtypes: int64(4)
memory usage: 3.4 MB


#### DF 합치기
- 두 DF의 열 구성이 동일하므로, 공통 열 기준으로 행 방향(위아래)으로 이어 붙임 (`pd.concat`)

In [5]:
# Stack the primary cohort rows on top of the study cohort rows and
# renumber the index from 0 so the combined frame has no duplicate labels.
totalDF = pd.concat(objs=(primyDF, studyDF), axis=0, ignore_index=True)

# Show the last ten rows to confirm the study cohort ended up at the bottom.
last_rows = totalDF.tail(10)
print(last_rows)

        age_years  sex_0male_1female  episode_number  \
129245         69                  1               2   
129246         29                  0               1   
129247         79                  0               1   
129248          1                  0               1   
129249         77                  0               1   
129250         33                  1               1   
129251         58                  0               1   
129252         44                  0               2   
129253         61                  0               3   
129254         78                  0               1   

        hospital_outcome_1alive_0dead  
129245                              1  
129246                              1  
129247                              1  
129248                              1  
129249                              0  
129250                              1  
129251                              1  
129252                              1  
129253                 

In [6]:
# Save the combined study + primary DataFrame (currently disabled; uncomment to write the CSV)
# totalDF.to_csv("./data/total_sepsis.csv", index=False)

In [7]:
# Print the combined frame summary: index range, columns, non-null counts, dtypes, memory usage.
totalDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129255 entries, 0 to 129254
Data columns (total 4 columns):
 #   Column                         Non-Null Count   Dtype
---  ------                         --------------   -----
 0   age_years                      129255 non-null  int64
 1   sex_0male_1female              129255 non-null  int64
 2   episode_number                 129255 non-null  int64
 3   hospital_outcome_1alive_0dead  129255 non-null  int64
dtypes: int64(4)
memory usage: 3.9 MB


##### 생존/사망 비율

In [None]:
# Count how many rows fall under each value of the last column.
totalDF.iloc[:, -1].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Count each outcome once. value_counts() orders rows by frequency, so the
# order of the codes is taken from the result rather than assumed.
outcome_counts = totalDF['hospital_outcome_1alive_0dead'].value_counts()
lifedeath_counts = outcome_counts.tolist()
label_num = outcome_counts.index.tolist()

# Translate each outcome code into a readable label (column name encodes
# 1 = alive, 0 = dead). Deriving the labels from the codes keeps every wedge
# paired with the right name whichever class happens to be more frequent.
outcome_names = {1: 'Alive', 0: 'Dead'}
label_str = [outcome_names.get(code, str(code)) for code in label_num]

plt.pie(
    lifedeath_counts,
    labels=label_str,  # readable names instead of the raw 1/0 codes
    autopct='%.1f%%',
    startangle=200,
    counterclock=False,
    explode=[0.1] * len(lifedeath_counts),  # one offset per wedge actually present
    colors=['lightsteelblue', 'darksalmon'],
    shadow=True,
    textprops={"fontsize": 18},
)
plt.title('Ratio', fontsize=22)
plt.show()

#### 오버샘플링 예시

In [None]:
# 필요한 라이브러리 불러오기
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter
import matplotlib.pyplot as plt

# Build an imbalanced toy dataset (weights request 90% class 0 and 10% class 1).
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=2,
    n_redundant=10,
    n_clusters_per_class=1,
    weights=[0.9, 0.1],
    flip_y=0,
    random_state=42,
)

# Class distribution before resampling.
print(f"Original dataset shape: {Counter(y)}")

# Apply SMOTE to the full dataset.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Class distribution after resampling.
print(f"Resampled dataset shape: {Counter(y_resampled)}")

# Visualization: project the resampled features down to 2D with PCA.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_res_pca = pca.fit_transform(X_resampled)

# Split the projected points by class label once, then draw each group.
majority_pts = X_res_pca[y_resampled == 0]
minority_pts = X_res_pca[y_resampled == 1]

plt.figure(figsize=(8, 6))
plt.scatter(majority_pts[:, 0], majority_pts[:, 1],
            label='Class 0 (Majority)', alpha=0.6)
plt.scatter(minority_pts[:, 0], minority_pts[:, 1],
            label='Class 1 (Minority)', alpha=0.6, marker='x')
plt.title('SMOTE Resampled Data')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.show()


In [None]:
import numpy as np
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Hypothetical ground-truth labels and the matching model scores.
y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0, 1, 0])
y_scores = np.array([0.1, 0.4, 0.35, 0.8, 0.65, 0.2, 0.9, 0.1, 0.7, 0.3])

# Compute the points of the precision-recall curve.
precision, recall, thresholds = precision_recall_curve(y_true, y_scores)

# Draw the PR curve (recall on x, precision on y).
plt.plot(recall, precision, marker='.', label='PR curve')
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.grid(True)
plt.show()

# Report precision and recall per threshold. Only the entries that line up
# with a threshold are printed; the final precision/recall pair is skipped,
# exactly as the [:-1] slices did.
for idx, t in enumerate(thresholds):
    p, r = precision[idx], recall[idx]
    print(f"Threshold: {t:.2f}, Precision: {p:.2f}, Recall: {r:.2f}")
