In [1]:
import nltk
from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec
import re
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('wordnet')
nltk.download('punkt')

def preprocess(text):
    text = text.lower() # Lowercase
    text = re.sub(r'[^\w\s]',' ',text) # Remove punctuation
    text = re.sub(r'\s+', ' ', text) # Remove extra spaces
    translator = str.maketrans('', '', '_%')
    text = text.translate(translator)
    return text.strip()
def lemmertize(texts):
   #texts input type: list of string
   wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
   lemmertize_texts = []
   
   for text in texts:
    words = preprocess(text).split(' ')
    lemmertize_texts.append(' '.join([wordnet_lemmatizer.lemmatize(word,pos ='v') for word in words]))

   #return lemmertized texts
   return lemmertize_texts
class Topic_Allocate():
  def __init__(self):
    self = self

  def cbow_fit (self, text_data, window_size = 4):
    texts = text_data
    #split into words
    texts = [text.split() for text in texts]

    #embeddind words
    word2vec = Word2Vec(texts, min_count = 1, window =  window_size, size = self.vector_size)

    # create dictionar
    self.dictionary = sorted(list(word2vec.wv.vocab))
    self.w2v = word2vec.wv

  def cbow_w2v(self, word):
    try:
      return self.w2v[word]
    except:
      return np.zeros(self.vector_size)  

  def doc2vec (self, text_data, window_size = 4, vector_size = 200, segment_size = 10, data_enrichment = 1, fit = False):
      
    self.segment_size = segment_size
    #lemmertize texts 
    texts = lemmertize(text_data)
    
    #lean vocabulary
    if fit:
      self.vector_size = vector_size
      self.cbow_fit(texts, window_size)

    #calculate tf_idf
    vectorizer = TfidfVectorizer(token_pattern= r'([a-zA-Z0-9µl½¼ménièreºfü]{1,})')
    vector = vectorizer.fit_transform(texts)
    
    #transform texts into matrixs
    # ts2vec = np.zeros((len(texts), text_matrix_size, self.vector_size))
    doc2vec = []
    for idx,text in enumerate(texts):
      #get vocab in text and sort alphabetically
      words = sorted(list(set(text.split())))
      words = np.array(words, dtype = type('a'))
      
      #cbow_matrix with each row correspond to each word in cbow vector form
      cbow_matrix = np.array([self.cbow_w2v(word) for word in words])

      #calculate tf_idf 
      text_vector = np.array(vectorizer.transform([text]).todense().tolist()[0])
     
      
      #remove zero entries
      text_vector  = text_vector[text_vector != 0]
  
      #combine tf_idf with cbow by multiply each cbow vector by its tf_idf
      
      cbow_tfidf_matrix = np.diag(text_vector) @ cbow_matrix
      #remove zero rows
      cbow_tfidf_matrix = cbow_tfidf_matrix[np.any(cbow_tfidf_matrix, axis = 1)]
  
      #compress words into segments 
      n = cbow_tfidf_matrix.shape[0]
      if data_enrichment > segment_size:
        data_enrichment = 1
        print('data_enrichment cannot be greater than segment_size')
      
      if n == 0:
        t2v = np.zeros((1,self.vector_size))
      elif n <= segment_size:
        t2v = np.mean(cbow_tfidf_matrix[ : n], axis = 0).reshape(1, self.vector_size)
      else:
        step = int(segment_size / data_enrichment)
        end = n - segment_size
        t2v = np.vstack([np.mean(cbow_tfidf_matrix[i : i + segment_size], axis = 0) for i in range(0, end, step)])
      
        #adjust rows remaining at the end of the matrix 
        if (n % segment_size) != 0:
          t2v = np.vstack((t2v, np.mean(cbow_tfidf_matrix[n - (n % segment_size) : n], axis = 0)))
      doc2vec.append(t2v)
     
    return doc2vec


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Medical Notes Classification

Medical notes is an useful information source for patient data extraction. Notes classification is also an important task in Medical NLP domain. There are many techniques to solve this problem ranging from traditional method (Logistic Regression, SVM,...) to the state-of-the-art models (Transformer).

The below code block is the baseline model for a text classification problem in medical domain.

* Input: the corpus of medical transcriptions.
* Output: the type of each notes.

In this problem, we try to classify five labels:
* Surgery
* Consult - History and Phy.
* Cardiovascular / Pulmonary
* Orthopedic
* Others

The train-test split was also defined, please don't change our split.

Metric to evaluate: `f1_macro`

# Baseline Model Result


0.3729330560342061

                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.35      0.39      0.37       148
    Consult - History and Phy.       0.32      0.06      0.10       207
                    Orthopedic       0.39      0.14      0.21       142
                         Other       0.66      0.74      0.70      1055
                       Surgery       0.43      0.57      0.49       435

                      accuracy                           0.56      1987
                     macro avg       0.43      0.38      0.37      1987
                  weighted avg       0.54      0.56      0.53      1987

# Library & Data Loading

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics


os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


### PLEASE DON'T CHANGE ANYTHING IN THIS SECTION ###
DATA = "https://github.com/socd06/private_nlp/raw/master/data/mtsamples.csv"

filtered_labels = [
    "Surgery",
    "Consult - History and Phy.",
    "Cardiovascular / Pulmonary",
    "Orthopedic",
]
data = pd.read_csv(DATA, usecols=['medical_specialty', 'transcription']).dropna()
data.columns = ['labels', 'text']
data['labels'] = [i.strip() if (i.strip() in filtered_labels) else 'Other' for i in data.labels.to_list()]
train, test = train_test_split(data, test_size=0.4, stratify=data.labels, random_state=0)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
### END ###

# Preprocessing

# My Model


In [3]:
#encode labels 
le = LabelEncoder()
train['labels'] = le.fit_transform(train.labels)
test['labels'] = le.transform(test.labels)

### Encode Text


In [4]:
#create model
model = Topic_Allocate()

In [5]:
vector_size = 500
segment_size = 20
data_enrichment = 3
#encode texts into matrix
X_train = np.asarray(model.doc2vec(train['text'], vector_size = vector_size, segment_size = segment_size, data_enrichment = data_enrichment, fit = True))
X_test = np.asarray(model.doc2vec(test['text'], vector_size = vector_size, segment_size = segment_size, data_enrichment = data_enrichment))


  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)


In [6]:
# onehot labels
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

y_train = train['labels']

# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
Y_train = onehot_encoder.fit_transform(np.array(y_train).reshape(-1, 1))
Y_test = onehot_encoder.transform(np.array(test['labels']).reshape(-1,1))

# invert first example
inverted = le.inverse_transform([np.argmax(Y_train[0, :])])
print(inverted)

['Consult - History and Phy.']


### Train with LSTM

#### sequence -> vec

In [7]:
#%% import library
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
# from keras.metrics.cate

In [8]:
max_size = np.amax(np.array([x.shape[0] for x in X_train]))
def fill_zeros(x, Vector_size):
    try:
        missing = max_size - x.shape[0]
        fill_in = np.zeros((missing, Vector_size))
        return np.vstack((fill_in, x))
    except:
        return np.zeros((max_size, Vector_size))
func = lambda x: fill_zeros(x, 500)
X_train_lstm_s2v = np.array([func(x) for x in X_train])
X_test_lstm_s2v = np.array([func(x) for x in X_test])


## Pytorch

In [9]:
# pytorch mlp for regression
# from numpy import vstack
# from numpy import sqrt
# from pandas import read_csv
from sklearn.metrics import mean_squared_error
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import Tensor, nn
# from torch.optim import SGD
# from torch.nn import MSELoss
# from torch.nn.init import xavier_uniform_
from torch import optim
from sklearn.metrics import confusion_matrix
import math
import keras.backend as K
# import tensorflow as tf




In [10]:
from torch.utils import data
import torch
class talosix_dataset(data.Dataset):
    def __init__(self, text_data, text_label):
        super().__init__()
        #text_data is a np.ndarray
        self.size = text_data.shape[0]
        self.data = torch.from_numpy(np.double(text_data))
        self.label = torch.from_numpy(np.double(text_label))
    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        data_point = self.data[idx]
        data_label = self.label[idx]
        return data_point, data_label

train_set = talosix_dataset(X_train_lstm_s2v, Y_train)
test_set = talosix_dataset(X_test_lstm_s2v, Y_test)

In [11]:
import torch
print("Using torch", torch.__version__)
torch.manual_seed(42)  # Setting the seed
gpu_avail = torch.cuda.is_available()
print(f"Is the GPU available? {gpu_avail}")
device = torch.device("cuda") 

print("Device", device)

Using torch 1.9.0+cu111
Is the GPU available? True
Device cuda


In [12]:
from torch.autograd import Variable 

class LSTM(nn.Module):
    def __init__(self, num_classes, input_size, num_layers):
        super(LSTM, self).__init__()
        self.num_classes = num_classes #number of classes
        self.num_layers = num_layers   #number of layers
        self.input_size = input_size   #input size
        self.hidden_size = 50          #hidden state

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

        self.lstm1 = nn.LSTM(input_size=input_size, hidden_size = self.hidden_size,
                          num_layers=num_layers, batch_first=True, dropout = 0.2) #lstm

        self.lstm2 =  nn.LSTM(input_size = self.hidden_size, hidden_size = 50, batch_first = True, dropout = 0.2)
        self.fc1 = nn.Linear(50, num_classes) #fully connected last layer
    
    def forward(self, x):
        self.out1, (hn1, cn1) = self.lstm1(x) #lstm with input, hidden, and internal state
        self.out2, (hn2, cn2) = self.lstm2(self.out1)

        hn2 = hn2.view(-1, 50) #reshaping the data for Dense layer next
        out = self.relu(hn2)
        out = self.fc1(out) #first Dense
        out = self.softmax(out) #relu

        return out 

In [13]:
class f1_loss(nn.Module):
  def __init__(self, weight = None):
    super(f1_loss, self).__init__()
    self.weight = weight


  def forward(self, y_pred, y_true):

    tp = torch.sum(y_true*y_pred, dim=0)
    # tn = torch.sum((1-y_true)*(1-y_pred), dim=0)
    fp = torch.sum((1-y_true)*y_pred, dim=0)
    fn = torch.sum(y_true*(1-y_pred), dim=0)

    p = tp / (tp + fp + 1e-10)
    r = tp / (tp + fn + 1e-10)

    f1 = 2*p*r / (p+r+1e-10)
    f1 = torch.where(torch.isnan(f1), torch.zeros_like(f1), f1)
    if self.weight != None:
      f1 = f1 * torch.Tensor(self.weight).cuda()

    return 1 - torch.mean(f1)

In [14]:
def train (num_epochs, model, loaders, loss_func, lr, wd = 0):
    model.train()

    total_step = len(loaders)

    for epoch in range(num_epochs):
        for i, (x, y) in enumerate(loaders):

            x, y = x.type(torch.float).to('cuda'), y.type(torch.float).to('cuda')
            
            out = model(x)
            loss = loss_func(out, y)

            # optimizer = optim.Adam(model.parameters(), lr = lr)
            optimizer = optim.Adam(model.parameters(), lr = lr/(2**(epoch//10)), weight_decay = wd)
            # optimizer = optim.Adam(model.parameters(), lr = lr/(epoch//10 + 1))
            # optimizer = optim.Adam(model.parameters(), lr = lr/((epoch+1)//math.sqrt( 0.6 * (epoch + 1)  )))


            # optimizer = optim.Adam(model.parameters())

            optimizer.zero_grad()

            loss.backward()

            optimizer.step()

            if (i+1) % total_step == 0:
                Y_pred = model(torch.Tensor(X_train_lstm_s2v).to('cuda'))
                check1 = torch.argmax(Y_pred, dim = 1, keepdim= True).cpu()
                ytrain1 = np.argmax(Y_train, axis = 1)

                Y_pred = model(torch.Tensor(X_test_lstm_s2v).to('cuda'))
                check2 = torch.argmax(Y_pred, dim = 1, keepdim= True).cpu()
                ytrain2 = np.argmax(Y_test, axis = 1)

                print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, f1: {:.4f}, val_f1: {:.4f}' 
                       .format(epoch + 1, num_epochs, i + 1, total_step, loss.item(), metrics.f1_score(ytrain1, check1, average='macro'), metrics.f1_score(ytrain2, check2, average='macro')))
                pass
        
        pass
    
    
    pass

            
    

In [15]:
batch_size = 16

# Create data loaders.
train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle = True)
test_dataloader = DataLoader(test_set, batch_size=batch_size)

In [16]:
loss_fn = f1_loss(weight = [0.32, 0.22, 0.32, 0.04, 0.1])

In [17]:
model = LSTM(5, 500, 1).cuda()
train(50, model, train_dataloader, loss_fn, 0.001)

  "num_layers={}".format(dropout, num_layers))


Epoch [1/50], Step [187/187], Loss: 0.9879, f1: 0.2187, val_f1: 0.2241
Epoch [2/50], Step [187/187], Loss: 0.9972, f1: 0.3067, val_f1: 0.3184
Epoch [3/50], Step [187/187], Loss: 0.9966, f1: 0.3000, val_f1: 0.3386
Epoch [4/50], Step [187/187], Loss: 0.9964, f1: 0.3278, val_f1: 0.3455
Epoch [5/50], Step [187/187], Loss: 0.9868, f1: 0.3134, val_f1: 0.3223
Epoch [6/50], Step [187/187], Loss: 0.9810, f1: 0.2512, val_f1: 0.2521
Epoch [7/50], Step [187/187], Loss: 0.9698, f1: 0.3207, val_f1: 0.3497
Epoch [8/50], Step [187/187], Loss: 0.9696, f1: 0.2471, val_f1: 0.2682
Epoch [9/50], Step [187/187], Loss: 0.9474, f1: 0.3850, val_f1: 0.4060
Epoch [10/50], Step [187/187], Loss: 0.9961, f1: 0.4252, val_f1: 0.4435
Epoch [11/50], Step [187/187], Loss: 0.9308, f1: 0.4291, val_f1: 0.4459
Epoch [12/50], Step [187/187], Loss: 0.9846, f1: 0.3939, val_f1: 0.4279
Epoch [13/50], Step [187/187], Loss: 0.9750, f1: 0.3776, val_f1: 0.3888
Epoch [14/50], Step [187/187], Loss: 0.9516, f1: 0.4574, val_f1: 0.4601
E

In [18]:
Y_pred = model(torch.Tensor(X_test_lstm_s2v).to('cuda'))
check = torch.argmax(Y_pred, dim = 1, keepdim= True).cpu()
ytrain = np.argmax(Y_test, axis = 1)
print(confusion_matrix(check, ytrain))
print(metrics.f1_score(ytrain, check, average='macro'))
print(metrics.classification_report(ytrain, check, target_names=list(le.classes_)))

[[ 60   2   0  38  32]
 [ 14 136  11 269   3]
 [  0   0  65  40  76]
 [ 36  69  51 486  93]
 [ 38   0  15 222 231]]
0.45800640497135675
                            precision    recall  f1-score   support

Cardiovascular / Pulmonary       0.45      0.41      0.43       148
Consult - History and Phy.       0.31      0.66      0.43       207
                Orthopedic       0.36      0.46      0.40       142
                     Other       0.66      0.46      0.54      1055
                   Surgery       0.46      0.53      0.49       435

                  accuracy                           0.49      1987
                 macro avg       0.45      0.50      0.46      1987
              weighted avg       0.54      0.49      0.50      1987





In [19]:
model = LSTM(5, 500, 1).cuda()
train(35, model, train_dataloader, loss_fn, 0.001, 2e-7)

  "num_layers={}".format(dropout, num_layers))


Epoch [1/35], Step [187/187], Loss: 0.9902, f1: 0.2094, val_f1: 0.2117
Epoch [2/35], Step [187/187], Loss: 0.9821, f1: 0.3016, val_f1: 0.3025
Epoch [3/35], Step [187/187], Loss: 0.9873, f1: 0.1648, val_f1: 0.1582
Epoch [4/35], Step [187/187], Loss: 0.9978, f1: 0.3244, val_f1: 0.3440
Epoch [5/35], Step [187/187], Loss: 0.9753, f1: 0.1911, val_f1: 0.2243
Epoch [6/35], Step [187/187], Loss: 0.9743, f1: 0.3561, val_f1: 0.3564
Epoch [7/35], Step [187/187], Loss: 0.9718, f1: 0.3963, val_f1: 0.4112
Epoch [8/35], Step [187/187], Loss: 0.9861, f1: 0.3518, val_f1: 0.3794
Epoch [9/35], Step [187/187], Loss: 0.9765, f1: 0.3915, val_f1: 0.4139
Epoch [10/35], Step [187/187], Loss: 0.9734, f1: 0.3417, val_f1: 0.3579
Epoch [11/35], Step [187/187], Loss: 0.9999, f1: 0.4036, val_f1: 0.4211
Epoch [12/35], Step [187/187], Loss: 0.9568, f1: 0.3916, val_f1: 0.4103
Epoch [13/35], Step [187/187], Loss: 0.9512, f1: 0.3902, val_f1: 0.4196
Epoch [14/35], Step [187/187], Loss: 0.9920, f1: 0.4180, val_f1: 0.4425
E

In [20]:
Y_pred = model(torch.Tensor(X_test_lstm_s2v).to('cuda'))
check = torch.argmax(Y_pred, dim = 1, keepdim= True).cpu()
ytrain = np.argmax(Y_test, axis = 1)
print(confusion_matrix(check, ytrain))
print(metrics.f1_score(ytrain, check, average='macro'))
print(metrics.classification_report(ytrain, check, target_names=list(le.classes_)))

[[ 53   2   0  37  24]
 [ 11 114  13 200   2]
 [  0   0  61  46  52]
 [ 37  90  19 490  47]
 [ 47   1  49 282 310]]
0.46782354313026264
                            precision    recall  f1-score   support

Cardiovascular / Pulmonary       0.46      0.36      0.40       148
Consult - History and Phy.       0.34      0.55      0.42       207
                Orthopedic       0.38      0.43      0.41       142
                     Other       0.72      0.46      0.56      1055
                   Surgery       0.45      0.71      0.55       435

                  accuracy                           0.52      1987
                 macro avg       0.47      0.50      0.47      1987
              weighted avg       0.58      0.52      0.52      1987





In [21]:
!pip install torchinfo




In [22]:
from torchinfo import summary

summary(model, (2979, 121, 500))



Layer (type:depth-idx)                   Output Shape              Param #
LSTM                                     --                        --
├─LSTM: 1-1                              [2979, 121, 50]           110,400
├─LSTM: 1-2                              [2979, 121, 50]           20,400
├─ReLU: 1-3                              [2979, 50]                --
├─Linear: 1-4                            [2979, 5]                 255
├─Softmax: 1-5                           [2979, 5]                 --
Total params: 131,055
Trainable params: 131,055
Non-trainable params: 0
Total mult-adds (G): 47.15
Input size (MB): 720.92
Forward/backward pass size (MB): 288.49
Params size (MB): 0.52
Estimated Total Size (MB): 1009.93