In [41]:
import pandas as pd
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM
from tensorflow.keras.layers import Dropout, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils.data_utils import pad_sequences
from keras.utils import to_categorical
import nltk
from nltk.corpus import stopwords
import re
import numpy as np

### used_data 필요한 피처들을 합친 데이터 프레임
- findings + 5개 label 피처 합침
---
merged_label_conditons 파일
14개의 피처를 5개의 관련 질환으로 재분류 한 라벨 데이터
- 폐 질환, 순환기 질환, 흉막 관련 질환, 관절 관련 질환, 발견 질환 없음
- label로 사용할 데이터 -> num_classes = 5
---
fidngs_and_impression 파일
흉부영상에서 발견한 특이점을 기록한 findings 피쳐가 있는 파일
- findings 피처를 텍스트 전처리한 다음 입력 값으로 사용
- input_value

In [14]:
base_dir = './data'
categorical_label_data_path = base_dir + '/merged_label_conditions.csv'
ehr_csv_data_path = base_dir + '/findings_and_impression.csv'

categorical_data = pd.read_csv(categorical_label_data_path)
text_data = pd.read_csv(ehr_csv_data_path)

finding_data = text_data[['study_id', 'Findings']]
used_data = categorical_data.merge(finding_data, how='inner')

used_data.info()

Unnamed: 0,study_id,LungConditions,CardiacAndCirculatoryConditions,PleuralConditions,SkeletalConditions,NormalConditions
0,50414267,0,0,0,0,1
1,53189527,0,0,0,0,1
2,53911762,0,0,0,0,1
3,56699142,0,0,0,0,1
4,57375967,1,0,0,0,0


1. 결측치 제거
2. 특수문자 제거
3. 불용어 제거

In [22]:
## 결측치 제거
used_data = used_data.dropna()

## 리스트 형식으로 변환
findings_lst = []
used_data['Findings'].astype('str')
for string in used_data['Findings']:
    findings_lst.append(string)

## 특수문자 제거
def re_text_preprocessing(lst):
     re_cleaned_list = []
     for text in lst:
          text = text.lower()
          words = text.split()
          refine_text = ""
          for word in words:
               refine_text += word + ' '
          re_cleaned_text = re.sub(r'[^\w\s]', '', refine_text)
          re_cleaned_list.append(re_cleaned_text)
     return re_cleaned_list

filtered_fidindgs = re_text_preprocessing(findings_lst)

nltk.download('stopwords')

## 불용어 제거
stop_words = set(stopwords.words('english'))
def remove_stopwords(texts):
    rm_sw_text = ""
    words = texts.split()
    filtered_text = ""
    for word in words:
        if word not in stop_words:
            filtered_text += word + ' '
    rm_sw_text += filtered_text
    return rm_sw_text

removed_sw_findings = []
for idx in range(len(filtered_fidindgs)):
    removed_sw_findings.append(remove_stopwords(filtered_fidindgs[idx]))

there is no focal consolidation pleural effusion or pneumothorax bilateral nodular opacities that most likely represent nipple shadows the cardiomediastinal silhouette is normal clips project over the left lung potentially within the breast the imaged upper abdomen is unremarkable chronic deformity of the posterior left sixth and seventh ribs are noted 
focal consolidation pleural effusion pneumothorax bilateral nodular opacities likely represent nipple shadows cardiomediastinal silhouette normal clips project left lung potentially within breast imaged upper abdomen unremarkable chronic deformity posterior left sixth seventh ribs noted 


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 라벨 데이터 원 - 핫 인코딩

In [39]:
## 원핫 인코딩
labels = ['LungConditions', 'CardiacAndCirculatoryConditions', 'PleuralConditions', 
          'SkeletalConditions', 'NormalConditions']

multiple_labels = pd.get_dummies(used_data[labels])

<class 'pandas.core.frame.DataFrame'>
Index: 14509 entries, 0 to 22188
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   LungConditions                   14509 non-null  int64
 1   CardiacAndCirculatoryConditions  14509 non-null  int64
 2   PleuralConditions                14509 non-null  int64
 3   SkeletalConditions               14509 non-null  int64
 4   NormalConditions                 14509 non-null  int64
dtypes: int64(5)
memory usage: 680.1 KB


### dmis-lab biobert를 이용한 단어 토큰화 진행

In [33]:
from transformers import AutoTokenizer, TFAutoModel
from sklearn.model_selection import train_test_split

# 의료계에서 사용하는 토큰화 모델
tokenizer_model_name = 'dmis-lab/biobert-base-cased-v1.2'

tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name)
vocab_size = tokenizer.vocab_size

tokens_list = []
for text in removed_sw_findings:
    tokens = tokenizer.encode(text, max_length=512)
    tokens_list.append(tokens)

tokens_len = []
for text_len in removed_sw_findings:
    tokens_len.append(len(text_len))

print(max(tokens_len))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1469


입력값과 라벨 최종 생성 완료

In [40]:
max_seq = max(tokens_len)

tokens_list_padded = pad_sequences(tokens_list, padding='pre', maxlen=max_seq)
input_value = tokens_list_padded

label = multiple_labels

print(input_value.shape)
print(label.shape)

(14509, 1469)
(14509, 5)


In [43]:
seed = 42

input_train, input_test, label_train, label_test = train_test_split(input_value, label, test_size=0.2, random_state=seed)

print(input_train.shape, input_test.shape, label_train.shape, label_test.shape)

(11607, 1469) (2902, 1469) (11607, 5) (2902, 5)


In [44]:
class sub_lstm(tf.keras.Model):
    def __init__(self, vocab_size, embedding_units, lstm_units):
        super(sub_lstm, self).__init__()

        self.embedding_layer = Embedding(vocab_size, embedding_units)

        self.lstm_layer = LSTM(lstm_units)

        self.output_layer = Dense(1, activation='sigmoid')

    def call(self, inputs):

        inputs = tf.cast(inputs, dtype=tf.float32)

        x = self.embedding_layer(inputs)

        x = self.lstm_layer(x)

        outputs = self.output_layer(x)

        return outputs     