## 필요 라이브러리 및 데이터셋 로드

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import minmax_scale
from scipy.cluster.hierarchy import dendrogram
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.decomposition import PCA
import gc
from tqdm.notebook import tqdm
import pickle

In [2]:
# Load only the columns needed for clustering from the merged CSV.
path = "/Users/younyung.gene/__DSL__/EDA/DATA/merged.csv"
cols = [
    "가입자 일련번호", "성별코드", "연령대코드", "시도코드", "주상병코드",
    "요양일수", "입내원일수", "심결가산율", "심결요양급여비용총액", "심결본인부담금",
    "심결보험자부담금", "총처방일수",
]

# Read straight into the working frame. The previous version read into
# `df_raw`, deep-copied it into `temp` and then deleted `df_raw`; the copy held
# two full frames in memory at once without changing the result.
temp = pd.read_csv(path, usecols=cols, encoding="utf-8")

# Reduce the primary diagnosis code to its top-level category
# (the first character of the code).
temp["주상병코드"] = temp["주상병코드"].str.slice(0, 1)

# Recode the region code into two groups: cities -> "si", provinces -> "do".
# Codes that are not listed here are left unchanged by replace().
temp["시도코드"] = temp["시도코드"].replace(
    {
        11: "si",
        26: "si",
        27: "si",
        28: "si",
        29: "si",
        30: "si",
        31: "si",
        36: "si",
        41: "do",
        42: "do",
        43: "do",
        44: "do",
        45: "do",
        46: "do",
        47: "do",
        48: "do",
        49: "do",
    }
)

# One-hot encode the diagnosis category and the region group.
temp = pd.get_dummies(temp, columns=["주상병코드", "시도코드"])

# Columns that must be rescaled before PCA / clustering.
norm_cols = ["성별코드", "연령대코드", "요양일수", "입내원일수", "심결가산율", "심결요양급여비용총액", "심결본인부담금", "심결보험자부담금", "총처방일수"]

# Min-max scale one column at a time so that only a single column-sized
# temporary is alive at any moment.
for column in norm_cols:
    temp[column] = minmax_scale(temp[column])

# Display the frame to check the normalization result.
temp

Unnamed: 0,가입자 일련번호,성별코드,연령대코드,요양일수,입내원일수,심결가산율,심결요양급여비용총액,심결본인부담금,심결보험자부담금,총처방일수,...,주상병코드_O,주상병코드_P,주상병코드_Q,주상병코드_R,주상병코드_S,주상병코드_T,주상병코드_U,주상병코드_Z,시도코드_do,시도코드_si
0,334176,1.0,0.882353,0.004115,0.001190,0.500000,0.000055,0.000071,0.000050,0.033333,...,0,0,0,0,0,0,0,0,1,0
1,334176,1.0,0.882353,0.004115,0.001190,0.500000,0.000055,0.000071,0.000050,0.033333,...,0,0,0,0,0,0,0,0,1,0
2,334209,1.0,0.764706,0.004115,0.001190,0.500000,0.000055,0.000071,0.000050,0.066667,...,0,0,0,0,0,0,0,0,1,0
3,334265,1.0,0.823529,0.004115,0.001190,0.500000,0.000055,0.000071,0.000050,0.066667,...,0,0,0,0,0,0,0,0,1,0
4,335323,1.0,0.764706,0.004115,0.001190,0.500000,0.000055,0.000071,0.000050,0.033333,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11733708,995389,0.0,0.117647,0.008230,0.010714,1.000000,0.005665,0.001727,0.005785,0.000000,...,0,0,0,0,0,0,0,0,1,0
11733709,996077,1.0,0.117647,0.008230,0.003571,0.833333,0.003978,0.002101,0.003969,0.000000,...,0,0,0,0,1,0,0,0,1,0
11733710,997217,0.0,0.117647,0.004115,0.107143,1.000000,0.002877,0.002850,0.002729,0.000000,...,0,0,0,0,0,0,0,0,1,0
11733711,997217,0.0,0.117647,0.045267,0.120238,1.000000,0.023609,0.014582,0.023329,0.000000,...,0,0,0,0,0,0,0,0,1,0


In [3]:
# Keep the subscriber serial number aside, then remove it from the feature
# frame so the identifier is not used as an input feature downstream.
id_column = "가입자 일련번호"
patient_id = temp[id_column]
del temp[id_column]

## PCA 시행

In [4]:
# Project the features onto 5 principal components. The count was chosen
# heuristically from a scree plot (where the explained variance levels off).
n_components = 5

# random_state pins the result: depending on the scikit-learn version, PCA's
# automatic solver selection may use a randomized SVD for data of this size,
# which would otherwise make the components (and everything clustered on top
# of them) differ from run to run. It has no effect with deterministic solvers.
pca = PCA(n_components=n_components, random_state=42)
# NOTE: the variable name (spelling included) is kept unchanged so that any
# existing reference to it still resolves.
principle_componenets = pca.fit_transform(temp)

# Wrap the projection in a new DataFrame with columns PC1..PC5; the labels are
# derived from n_components so the two cannot drift apart.
df_pc = pd.DataFrame(
    data=principle_componenets,
    columns=[f"PC{i}" for i in range(1, n_components + 1)],
)

# Display the new DataFrame.
df_pc

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,-0.611024,-0.425336,-0.036266,-0.531216,0.777838
1,-0.611024,-0.425336,-0.036266,-0.531216,0.777838
2,-0.609728,-0.416502,-0.069056,-0.505395,0.748821
3,-0.610378,-0.420892,-0.052601,-0.518640,0.763840
4,-0.609724,-0.416555,-0.069176,-0.504725,0.747801
...,...,...,...,...,...
11733708,-0.601380,0.594784,-0.107858,-0.249057,-0.713766
11733709,-0.595948,-0.364836,-0.278877,-0.148622,-0.243852
11733710,-0.606600,0.595205,-0.071691,-0.072977,-0.169822
11733711,-0.608169,0.690763,-0.657745,0.589877,0.053516


In [5]:
# Free memory: drop the reference to the feature frame `temp` and run a
# garbage-collection pass. gc.collect() is the cell's last expression, so its
# return value is what the notebook displays for this cell.
del temp
gc.collect()

0

## KMeans Clustering 시행

In [6]:
# Configure KMeans with 16 clusters (count chosen heuristically with the
# elbow method).
# - random_state fixes the centroid initialisation so the cluster labels that
#   get saved later are reproducible across runs.
# - n_init is set explicitly because the library default differs between
#   scikit-learn releases (10 in older releases, "auto" in newer ones); 10
#   matches the older default.
model = KMeans(n_clusters=16, n_init=10, random_state=42)

# Run the clustering on the principal-component frame.
model.fit(df_pc)

KMeans(n_clusters=16)

In [7]:
# Attach each row's cluster label to the principal-component frame.
df_pc["cluster"] = model.labels_

# Persist the fitted clustering object to disk.
with open("/Users/younyung.gene/__DSL__/EDA/_pc5_16clusters_/KMeans_16.pkl", "wb") as model_file:
    pickle.dump(model, model_file)


In [8]:
# Free memory: drop the reference to the fitted model (it has already been
# pickled and its labels copied into df_pc) and run a garbage-collection pass.
# gc.collect() is the cell's last expression, so its return value is what the
# notebook displays for this cell.
del model
gc.collect()

0

In [9]:
# Re-attach the subscriber serial numbers to the clustered rows.
# The rows of df_pc are in the same order as the rows the IDs were taken from,
# so the values are assigned by position. Assigning the Series itself would
# align on index labels instead, which silently produces NaN or misassigned
# IDs whenever the two indexes are not identical.
df_pc["가입자 일련번호"] = patient_id.to_numpy()

# Save the clustering result (principal components, cluster label, ID).
df_pc.to_csv("/Users/younyung.gene/__DSL__/EDA/_pc5_16clusters_/KMeans_16.csv", index=False)

In [10]:
# Completion marker for the final cell of the notebook.
print("Done!!")

Done!!
