## Quest
- NSC_BND_M20 대상(from mongo)
- 목표변수 : 범주형, 설명변수 : 5개 선택
- train_test_split() 이용
- classification_report() 사용(각각 train, test set) 평가 의견 첨부
- option : NSC_BND_M20_preprocss.ipynb 이용가능

### 전처리

In [1]:
import pymongo as mg
import pandas as pd
import re

In [2]:
client = mg.MongoClient(host='mongodb://localhost:27017')

In [3]:
database = client['db_NHIS']
collection = database['merged_collection6']

In [4]:
cursor = collection.find({})
list_BND_M20 = list(cursor)

In [5]:
df_BND_M20 = pd.DataFrame(list_BND_M20)
df_BND_M20[:2]

Unnamed: 0,_id,RN_INDI,BTH_YYYY,DTH_YYYYMM,COD1,COD2,RN_KEY,RN_INST,MDCARE_STRT_DT,FORM_CD,...,OPRTN_YN,MDCARE_DD_CNT,VSHSP_DD_CNT,TOT_PRSC_DD_CNT,MCARE_RSLT_TYPE,EDC_ADD_RT,ED_RC_TOT_AMT,EDC_SBA,EDC_INSUR_BRDN_AMT,STD_YYYY
0,64e5d0efe31b2222fb5bb2b7,294364,1921LE,200707.0,T08-T14,W00-W19,2002080298703,26452,20020830,3,...,0,1,1,3,5.0,0.15,10590,3000,7590,2002
1,64e5d0efe31b2222fb5bb2b8,294364,1921LE,200707.0,T08-T14,W00-W19,2002090466395,63715,20020910,3,...,0,1,1,2,5.0,0.15,16520,4950,11570,2002


#### 목표변수와 설명변수 추출
- 목표변수 : 'SICK_SYM1' - 주요 질병 증상
- 설명 변수 :
'TOT_PRSC_DD_CNT' - 총 처방전 공제 횟수
'MCARE_RSLT_TYPE' - 의료 보험 결과 유형
'EDC_ADD_RT' - 의료 보험 부가율
'ED_RC_TOT_AMT' - 의료 보험 청구 총액
'EDC_SBA' - 의료 보험 자기 부담금

In [6]:
df_BND_M20.columns

Index(['_id', 'RN_INDI', 'BTH_YYYY', 'DTH_YYYYMM', 'COD1', 'COD2', 'RN_KEY',
       'RN_INST', 'MDCARE_STRT_DT', 'FORM_CD', 'MCARE_SUBJ_CD', 'SICK_SYM1',
       'OFIJ_TYPE', 'OPRTN_YN', 'MDCARE_DD_CNT', 'VSHSP_DD_CNT',
       'TOT_PRSC_DD_CNT', 'MCARE_RSLT_TYPE', 'EDC_ADD_RT', 'ED_RC_TOT_AMT',
       'EDC_SBA', 'EDC_INSUR_BRDN_AMT', 'STD_YYYY'],
      dtype='object')

In [7]:
df_BND_M20_ex = df_BND_M20[['SICK_SYM1', 'TOT_PRSC_DD_CNT', 'MCARE_RSLT_TYPE', 'EDC_ADD_RT', 'ED_RC_TOT_AMT', 'EDC_SBA']]
df_BND_M20_ex 

Unnamed: 0,SICK_SYM1,TOT_PRSC_DD_CNT,MCARE_RSLT_TYPE,EDC_ADD_RT,ED_RC_TOT_AMT,EDC_SBA
0,L028,3,5.0,0.15,10590,3000
1,J030,2,5.0,0.15,16520,4950
2,J030,3,5.0,0.15,27620,9000
3,K291,3,5.0,0.15,12920,3000
4,M545,0,5.0,0.15,14230,3000
...,...,...,...,...,...,...
137158,J209,0,1.0,0.15,11280,2300
137159,J209,0,1.0,0.15,19640,3700
137160,J209,0,1.0,0.15,13470,2800
137161,J209,0,1.0,0.15,13530,2600


In [8]:
df_BND_M20_ex.isnull().sum()

SICK_SYM1             0
TOT_PRSC_DD_CNT       0
MCARE_RSLT_TYPE    3205
EDC_ADD_RT            0
ED_RC_TOT_AMT         0
EDC_SBA               0
dtype: int64

In [9]:
df_BND_M20_ex[:2]

Unnamed: 0,SICK_SYM1,TOT_PRSC_DD_CNT,MCARE_RSLT_TYPE,EDC_ADD_RT,ED_RC_TOT_AMT,EDC_SBA
0,L028,3,5.0,0.15,10590,3000
1,J030,2,5.0,0.15,16520,4950


In [10]:
def convertSICK_SYM(sick_sym) : 
    if len(sick_sym) < 3 :
        return None
    else : 
        return sick_sym[:3]

In [11]:
convertSICK_SYM('J209'), convertSICK_SYM('F_') 

('J20', None)

In [12]:
df_BND_M20_ex['SICK_SYM2'] = df_BND_M20_ex['SICK_SYM1'].apply(convertSICK_SYM)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_BND_M20_ex['SICK_SYM2'] = df_BND_M20_ex['SICK_SYM1'].apply(convertSICK_SYM)


In [13]:
df_BND_M20_ex = df_BND_M20_ex.dropna()

In [14]:
df_BND_M20_ex['SICK_SYM2'].value_counts()

J20    8890
I10    7638
J03    4168
M54    3708
J06    3382
       ... 
G24       1
G26       1
J70       1
S49       1
P28       1
Name: SICK_SYM2, Length: 728, dtype: int64

#### SICK_SYM1의 values 전처리
- 머신러닝을 하기 위해서는 숫자로만 이루어져 있어야함

In [15]:
df_BND_M20_ex[:2]

Unnamed: 0,SICK_SYM1,TOT_PRSC_DD_CNT,MCARE_RSLT_TYPE,EDC_ADD_RT,ED_RC_TOT_AMT,EDC_SBA,SICK_SYM2
0,L028,3,5.0,0.15,10590,3000,L02
1,J030,2,5.0,0.15,16520,4950,J03


In [16]:
target_list = df_BND_M20_ex['SICK_SYM2'].value_counts().index.to_list()
target_list

['J20',
 'I10',
 'J03',
 'M54',
 'J06',
 'M17',
 'J30',
 'E11',
 'J01',
 'J02',
 'K29',
 'J00',
 'L23',
 'H10',
 'J04',
 'J45',
 'M51',
 'M75',
 'B35',
 'J32',
 'H66',
 'N18',
 'A09',
 'K21',
 'M79',
 'S33',
 'J21',
 'J40',
 'H04',
 'H52',
 'M48',
 'H65',
 'L50',
 'M47',
 'I11',
 'M13',
 'M50',
 'N30',
 'N40',
 'J31',
 'L02',
 'K25',
 'H25',
 'M25',
 'K59',
 'E78',
 'H60',
 'M65',
 'S93',
 'L30',
 'J18',
 'H00',
 'K52',
 'S63',
 'S61',
 'H16',
 'S83',
 'K58',
 'J34',
 'M19',
 'I20',
 'M77',
 'L20',
 'S01',
 'I84',
 'S13',
 'I63',
 'M81',
 'L03',
 'E14',
 'M72',
 'B02',
 'L24',
 'H81',
 'M15',
 'S43',
 'J22',
 'L21',
 'J36',
 'E03',
 'H40',
 'R10',
 'K27',
 'S52',
 'H35',
 'C22',
 'S62',
 'E05',
 'N34',
 'L84',
 'L04',
 'G44',
 'S23',
 'J35',
 'C16',
 'K76',
 'K12',
 'H11',
 'R51',
 'M10',
 'N41',
 'R07',
 'B30',
 'L25',
 'I50',
 'A08',
 'R42',
 'L08',
 'N39',
 'R50',
 'S92',
 'B07',
 'L40',
 'K30',
 'K26',
 'C90',
 'G43',
 'E04',
 'L29',
 'M23',
 'H36',
 'S82',
 'M53',
 'G47',
 'B00',


In [17]:
target_list.index('L02'), target_list.index('J03') #확인용

(40, 2)

In [18]:
def applyIndexNumber(sick_sym_3):
    indexNumber = target_list.index(sick_sym_3)
    return indexNumber

In [19]:
df_BND_M20_ex['SICK_SYM3'] = df_BND_M20_ex['SICK_SYM2'].apply(applyIndexNumber)
df_BND_M20_ex[:2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_BND_M20_ex['SICK_SYM3'] = df_BND_M20_ex['SICK_SYM2'].apply(applyIndexNumber)


Unnamed: 0,SICK_SYM1,TOT_PRSC_DD_CNT,MCARE_RSLT_TYPE,EDC_ADD_RT,ED_RC_TOT_AMT,EDC_SBA,SICK_SYM2,SICK_SYM3
0,L028,3,5.0,0.15,10590,3000,L02,40
1,J030,2,5.0,0.15,16520,4950,J03,2


#### 정형화 단계

In [20]:
from sklearn.model_selection import train_test_split
target = df_BND_M20_ex['SICK_SYM1']
features = df_BND_M20_ex[['TOT_PRSC_DD_CNT', 'MCARE_RSLT_TYPE', 'EDC_ADD_RT', 'ED_RC_TOT_AMT', 'EDC_SBA']]

In [21]:
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=111)
features_train.shape, target_train.shape, features_test.shape, target_test.shape

((92072, 5), (92072,), (30691, 5), (30691,))

#### 모델학습

In [22]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(features_train, target_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
model.coef_, model.intercept_

(array([[-3.96447188e-09, -4.56952325e-10, -6.38878853e-11,
         -1.04443720e-05, -2.72269837e-06],
        [-4.13932455e-09, -7.24505011e-10, -7.37941030e-11,
         -1.14743389e-05, -3.02323768e-06],
        [-4.13859290e-09, -7.24366489e-10, -7.37800251e-11,
         -1.15462532e-05, -3.04604584e-06],
        ...,
        [-4.19438975e-09, -6.61163228e-10, -6.24128448e-11,
         -9.41603262e-06, -2.31071102e-06],
        [-4.12308210e-09, -7.07779623e-10, -7.12793246e-11,
         -1.13930227e-05, -3.06597969e-06],
        [-4.13147830e-09, -5.44103421e-10, -4.64964565e-11,
         -8.93755551e-06, -2.31244380e-06]]),
 array([-3.82693456e-10, -4.48886334e-10, -4.48803091e-10, ...,
        -3.83929577e-10, -4.32113584e-10, -2.66527030e-10]))

#### 예측

In [24]:
df_BND_M20_ex[10:15]

Unnamed: 0,SICK_SYM1,TOT_PRSC_DD_CNT,MCARE_RSLT_TYPE,EDC_ADD_RT,ED_RC_TOT_AMT,EDC_SBA,SICK_SYM2,SICK_SYM3
11,J00,3,1.0,0.25,12240,6120,J00,11
12,J010,4,5.0,0.15,8060,3000,J01,8
13,J304,1,5.0,0.15,31050,9310,J30,6
14,J304,5,5.0,0.15,43430,15000,J30,6
15,J303,0,5.0,0.15,9950,3000,J30,6


In [25]:
model.predict(features_train[10:15])

array(['I639', 'I69300', 'I69300', 'C900', 'I69300'], dtype=object)

In [26]:
model.predict_proba(features_train[10:15])

array([[0.00029278, 0.00029032, 0.00029015, ..., 0.00029535, 0.00029045,
        0.00029638],
       [0.00028203, 0.00027894, 0.00027872, ..., 0.00028525, 0.00027913,
        0.00028659],
       [0.00016088, 0.0001553 , 0.00015492, ..., 0.00016683, 0.00015564,
        0.00016936],
       [0.00026322, 0.00025924, 0.00025897, ..., 0.00026728, 0.00025953,
        0.00026912],
       [0.0002559 , 0.00025161, 0.00025131, ..., 0.00026038, 0.00025187,
        0.00026226]])

#### 평가

In [27]:
target_train_predict = model.predict(features_train)
target_train_predict.shape # target_train.shape 동일

(92072,)

In [28]:
from sklearn.metrics import accuracy_score # 정확도

In [29]:
accuracy_score(target_train, target_train_predict) # 교내 시험

0.01325049960900165

In [30]:
target_test_perdict = model.predict(features_test)
target_test_perdict.shape # target_test.shape 동일

(30691,)

In [31]:
accuracy_score(target_test, target_test_perdict) # 교외 시험

0.013098302433938287

In [32]:
from sklearn.metrics import classification_report

In [33]:
print(classification_report(target_train, target_train_predict))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         A04       0.00      0.00      0.00         5
        A044       0.00      0.00      0.00         1
        A048       0.00      0.00      0.00         1
        A049       0.00      0.00      0.00       104
         A05       0.00      0.00      0.00         4
        A058       0.00      0.00      0.00         1
        A059       0.00      0.00      0.00        15
        A062       0.00      0.00      0.00         1
         A08       0.00      0.00      0.00         9
        A083       0.00      0.00      0.00        25
        A084       0.00      0.00      0.00       101
        A085       0.00      0.00      0.00        21
         A09       0.00      0.00      0.00       268
        A090       0.00      0.00      0.00       273
        A099       0.00      0.00      0.00       320
        A419       0.00      0.00      0.00         4
         A49       0.00      0.00      0.00         1
        A491       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
print(classification_report(target_test, target_test_perdict))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        A048       0.00      0.00      0.00         1
        A049       0.00      0.00      0.00        33
        A059       0.00      0.00      0.00         7
        A069       0.00      0.00      0.00         1
         A08       0.00      0.00      0.00         4
        A083       0.00      0.00      0.00         5
        A084       0.00      0.00      0.00        33
        A085       0.00      0.00      0.00         4
         A09       0.00      0.00      0.00        92
        A090       0.00      0.00      0.00       108
        A099       0.00      0.00      0.00       100
        A488       0.00      0.00      0.00         1
        A491       0.00      0.00      0.00         1
        A493       0.00      0.00      0.00         2
        A499       0.00      0.00      0.00         1
         A59       0.00      0.00      0.00         3
        A690       0.00      0.00      0.00         2
        A878       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


#### 오차 행렬

In [35]:
from sklearn.metrics import confusion_matrix ,precision_score , recall_score , f1_score

In [36]:
confusion_matrix(target_train, target_train_predict)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [37]:
precision_score(target_train,target_train_predict)

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [None]:
recall_score(target_train,target_train_predict)