# Word Embeddings: numerical representations to classify clinical text

### Jocelyn Dunstan y Fabián Villena, CIMT, Uchile

Modules loading

In [2]:
import numpy as np #linear algebra
import nltk #natural language processing tools
import gensim #neural word embedding training
import re #regular expressions
import logging #verbosity (code telling you what's going on)
import csv #data loading
import sklearn.ensemble #classifier
import sklearn.metrics #performance assessment
import sklearn.model_selection #train-test split

Downloading extra data for nltk

In [4]:
nltk.download('stopwords') # connectors and other non-semantic rich expressions
nltk.download('punkt') # sentence tokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jocelyn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jocelyn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Adding some verbosity to the embedding training session, i.e., the coding telling you what is doing

In [6]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Loading spanish stopwords list from nltk

In [7]:
stopwords = nltk.corpus.stopwords.words('spanish')

In [8]:
set(stopwords)

{'a',
 'al',
 'algo',
 'algunas',
 'algunos',
 'ante',
 'antes',
 'como',
 'con',
 'contra',
 'cual',
 'cuando',
 'de',
 'del',
 'desde',
 'donde',
 'durante',
 'e',
 'el',
 'ella',
 'ellas',
 'ellos',
 'en',
 'entre',
 'era',
 'erais',
 'eran',
 'eras',
 'eres',
 'es',
 'esa',
 'esas',
 'ese',
 'eso',
 'esos',
 'esta',
 'estaba',
 'estabais',
 'estaban',
 'estabas',
 'estad',
 'estada',
 'estadas',
 'estado',
 'estados',
 'estamos',
 'estando',
 'estar',
 'estaremos',
 'estará',
 'estarán',
 'estarás',
 'estaré',
 'estaréis',
 'estaría',
 'estaríais',
 'estaríamos',
 'estarían',
 'estarías',
 'estas',
 'este',
 'estemos',
 'esto',
 'estos',
 'estoy',
 'estuve',
 'estuviera',
 'estuvierais',
 'estuvieran',
 'estuvieras',
 'estuvieron',
 'estuviese',
 'estuvieseis',
 'estuviesen',
 'estuvieses',
 'estuvimos',
 'estuviste',
 'estuvisteis',
 'estuviéramos',
 'estuviésemos',
 'estuvo',
 'está',
 'estábamos',
 'estáis',
 'están',
 'estás',
 'esté',
 'estéis',
 'estén',
 'estés',
 'fue',
 'f

Some self-defined functions

In [9]:
def normalizer(text): #normalizes a given string to lowercase and changes all vowels to their base form
    text = text.lower() #string lowering
    text = re.sub(r'[^A-Za-zñáéíóú]', ' ', text) #replaces every punctuation with a space
    text = re.sub('á', 'a', text) #replaces special vowels to their base forms
    text = re.sub('é', 'e', text)
    text = re.sub('í', 'i', text)
    text = re.sub('ó', 'o', text)
    text = re.sub('ú', 'u', text)
    return text

In [10]:
def vectorizer(text, model): #returns a vector representation from a list of words and a given model
    vectors = []
    for i in text:
        try:
            vectors.append(model.wv[i])
        except:
            pass
    return(np.mean(vectors,axis=0))

## Word Embedding

Loading the corpus dataset and taking each line to an array of sentences. We are using the whole Aysen's waiting list as a corpus

In [11]:
sentences = []
with open('corpus.txt', encoding='utf-8') as file: #use utf-8 to preserve special characters
    for line in file:
        sentences.append((line.rstrip())) #use strip to remove \n (newline)

Taking every sentence and normalizing it

In [12]:
normalized_sentences = [normalizer(sentence) for sentence in sentences]

Tokenizing every sentence into words

In [13]:
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in normalized_sentences]

Removing stopwords

In [14]:
without_stopwords_sentences = []
for sentence in tokenized_sentences:
    without_stopwords_sentence = [word for word in sentence if word not in stopwords]
    without_stopwords_sentences.append(without_stopwords_sentence)

Training the neural word embeddings word2vec

In [15]:
model = gensim.models.Word2Vec(without_stopwords_sentences)

2019-06-03 17:32:07,875 : INFO : collecting all words and their counts
2019-06-03 17:32:07,878 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-06-03 17:32:07,895 : INFO : PROGRESS: at sentence #10000, processed 26393 words, keeping 2174 word types
2019-06-03 17:32:07,910 : INFO : PROGRESS: at sentence #20000, processed 53049 words, keeping 2795 word types
2019-06-03 17:32:07,930 : INFO : PROGRESS: at sentence #30000, processed 80100 words, keeping 3189 word types
2019-06-03 17:32:07,953 : INFO : PROGRESS: at sentence #40000, processed 107473 words, keeping 3528 word types
2019-06-03 17:32:07,973 : INFO : PROGRESS: at sentence #50000, processed 134970 words, keeping 3783 word types
2019-06-03 17:32:08,011 : INFO : PROGRESS: at sentence #60000, processed 162957 words, keeping 3986 word types
2019-06-03 17:32:08,037 : INFO : PROGRESS: at sentence #70000, processed 191225 words, keeping 4140 word types
2019-06-03 17:32:08,064 : INFO : PROGRESS: at sentence #

Retrieving the most similar words to a given word

In [16]:
model.wv.most_similar('cancer')

2019-06-03 17:32:26,294 : INFO : precomputing L2-norms of word weight vectors


[('avanzado', 0.8856569528579712),
 ('metastasis', 0.881154477596283),
 ('adenocarcinoma', 0.872443437576294),
 ('infiltrante', 0.8395851850509644),
 ('adenoma', 0.8390409350395203),
 ('cervix', 0.8359759449958801),
 ('fallecido', 0.8250153064727783),
 ('oseas', 0.8198772668838501),
 ('padre', 0.8187509775161743),
 ('significado', 0.8186618089675903)]

Test your own words!

## Classifier: Can we classify the medical specialty from the diagnosis?

Loading the raw dataset and extracting the features and labels

In [17]:
diagnostics = [] #classifier raw features
specialties = [] #classifier raw labels
with open('data.csv', encoding='utf-8') as file:
    data = csv.DictReader(file)
    for row in data:
        diagnostics.append(row['diagnostic'])
        specialties.append(row['specialty'])

Preprocessing

In [18]:
diagnostics_normalized = [normalizer(diagnostic) for diagnostic in diagnostics]

In [19]:
diagnostics_tokenized = [nltk.word_tokenize(diagnostic) for diagnostic in diagnostics_normalized]

In [20]:
diagnostics_wihout_stopwords = []
for diagnostic in diagnostics_tokenized:
    diagnostic_wihout_stopwords = [word for word in diagnostic if word not in stopwords]
    diagnostics_wihout_stopwords.append(diagnostic_wihout_stopwords)

Creating an empty matrix to store the encoded features

In [21]:
diagnostics_matrix = np.zeros((len(diagnostics_wihout_stopwords), len(model.wv['cancer'])))

Filling the matrix with the vectorized diagnostics

In [22]:
for i,diagnostic in enumerate(diagnostics_wihout_stopwords):
    vector = vectorizer(diagnostic,model)
    diagnostics_matrix[i,] = vector

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Creating an empty vector to store the encoded labels

In [23]:
specialties_vector = np.zeros((len(specialties), 1))

Filling the vector with the encoded specialties

In [24]:
for i,specialty in enumerate(specialties): #OFTALMOLOGIA is encoded as a 1.0 and TRAUMATOLOGIA as a 2.0
    if specialty == 'OFTALMOLOGIA':
        specialties_vector[i,] = 1
    else:
        specialties_vector[i,] = 2

Concatenating the encoded features ans labels

In [25]:
data_matrix = np.concatenate([diagnostics_matrix,specialties_vector], axis=1)

Removing NAs from the matrix

In [26]:
data_matrix_without_nan = data_matrix[~np.isnan(data_matrix).any(axis=1)]

Splitting the dataset into training and testing subsets

In [27]:
diagnostics_train, diagnostics_test, specialties_train, specialties_test = sklearn.model_selection.train_test_split(
    data_matrix_without_nan[:,:100],
    data_matrix_without_nan[:,100],
    test_size=0.33,
    random_state=42
)

Training a Random Forest Classifier

In [28]:
classifier = sklearn.ensemble.RandomForestClassifier()

In [29]:
classifier.fit(diagnostics_train,specialties_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Making predictions over the testing subset and measuring their performance

In [30]:
predictions = classifier.predict(diagnostics_test)
print(sklearn.metrics.classification_report(predictions, specialties_test))

             precision    recall  f1-score   support

        1.0       0.78      0.98      0.87      5613
        2.0       0.98      0.82      0.89      8542

avg / total       0.90      0.88      0.88     14155



- Precision = TP/(TP+FP) # accuracy of positive predictions 
- Recall = TP/(TP+FN) # sensitivity or true positive rate
- F1-score = 2 x (precision x recall)/(precision + recall)

## Manual testing of the classifier

In [50]:
def specialtyClassifier(diagnostic):
    try:
        stringNorm = normalizer(diagnostic)
        stringTokenized = nltk.word_tokenize(stringNorm)
        stringVec = vectorizer(stringTokenized,model)
        result = classifier.predict(stringVec.reshape(1, -1))[0]
        if result==1.0: 
            return('Oftalmología')
        if result==2.0: 
            return('Traumatología')
    except:
        return(np.nan)

In [51]:
specialtyClassifier('vicio de refracción')

'Oftalmología'

In [52]:
specialtyClassifier('cadera')

'Traumatología'

Try your own!

## fastText

This is another embedding model available in gensim that deals well with words that do not appear in the corpus. Let's see if it works well!

In [62]:
from gensim.models import FastText

In [63]:
modelF = FastText(without_stopwords_sentences, size=100)

2019-06-03 20:36:59,272 : INFO : collecting all words and their counts
2019-06-03 20:36:59,278 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-06-03 20:36:59,290 : INFO : PROGRESS: at sentence #10000, processed 26393 words, keeping 2174 word types
2019-06-03 20:36:59,304 : INFO : PROGRESS: at sentence #20000, processed 53049 words, keeping 2795 word types
2019-06-03 20:36:59,317 : INFO : PROGRESS: at sentence #30000, processed 80100 words, keeping 3189 word types
2019-06-03 20:36:59,331 : INFO : PROGRESS: at sentence #40000, processed 107473 words, keeping 3528 word types
2019-06-03 20:36:59,354 : INFO : PROGRESS: at sentence #50000, processed 134970 words, keeping 3783 word types
2019-06-03 20:36:59,369 : INFO : PROGRESS: at sentence #60000, processed 162957 words, keeping 3986 word types
2019-06-03 20:36:59,386 : INFO : PROGRESS: at sentence #70000, processed 191225 words, keeping 4140 word types
2019-06-03 20:36:59,427 : INFO : PROGRESS: at sentence #

We trained again the classifier using fastText embeddings

In [64]:
diagnostics_matrixF = np.zeros((len(diagnostics_wihout_stopwords), len(model.wv['cancer'])))

Filling the matrix with the vectorized diagnostics

In [65]:
for i,diagnostic in enumerate(diagnostics_wihout_stopwords):
    vector = vectorizer(diagnostic,modelF)
    diagnostics_matrixF[i,] = vector

Creating an empty vector to store the encoded labels

In [66]:
specialties_vectorF = np.zeros((len(specialties), 1))

Filling the vector with the encoded specialties

In [67]:
for i,specialty in enumerate(specialties): #OFTALMOLOGIA is encoded as a 1.0 and TRAUMATOLOGIA as a 2.0
    if specialty == 'OFTALMOLOGIA':
        specialties_vector[i,] = 1
    else:
        specialties_vector[i,] = 2

Concatenating the encoded features ans labels

In [68]:
data_matrixF = np.concatenate([diagnostics_matrixF,specialties_vector], axis=1)

Removing NAs from the matrix

In [69]:
data_matrix_without_nanF = data_matrixF[~np.isnan(data_matrixF).any(axis=1)]

Splitting the dataset into training and testing subsets

In [70]:
diagnostics_trainF, diagnostics_testF, specialties_trainF, specialties_testF = sklearn.model_selection.train_test_split(
    data_matrix_without_nanF[:,:100],
    data_matrix_without_nanF[:,100],
    test_size=0.33,
    random_state=42
)

Training a Random Forest Classifier

In [71]:
classifierF = sklearn.ensemble.RandomForestClassifier()

In [72]:
classifierF.fit(diagnostics_trainF,specialties_trainF)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Making predictions over the testing subset and measuring their performance

In [75]:
predictionsF = classifierF.predict(diagnostics_testF)
print(sklearn.metrics.classification_report(predictionsF, specialties_testF))

             precision    recall  f1-score   support

        1.0       0.79      0.98      0.87      5681
        2.0       0.98      0.82      0.89      8498

avg / total       0.90      0.88      0.89     14179



In [76]:
def specialtyClassifierF(diagnostic):
    try:
        stringNorm = normalizer(diagnostic)
        stringTokenized = nltk.word_tokenize(stringNorm)
        stringVec = vectorizer(stringTokenized,modelF)
        result = classifierF.predict(stringVec.reshape(1, -1))[0]
        if result==1.0: 
            return('Oftalmología')
        if result==2.0: 
            return('Traumatología')
    except:
        return(np.nan)

In [84]:
specialtyClassifier('fractura')

'Traumatología'

In [81]:
specialtyClassifierF('fracturaa')

'Traumatología'

In [82]:
modelF.wv.most_similar('cancer')

2019-06-03 20:42:57,481 : INFO : precomputing L2-norms of word weight vectors
2019-06-03 20:42:57,495 : INFO : precomputing L2-norms of ngram weight vectors


[('carcinoma', 0.7958588600158691),
 ('adenocarcinoma', 0.7862062454223633),
 ('av', 0.7563788890838623),
 ('cartilago', 0.7142010927200317),
 ('malagro', 0.7035268545150757),
 ('masa', 0.7021448612213135),
 ('campo', 0.7020975351333618),
 ('microadenoma', 0.6904839277267456),
 ('cabelludo', 0.6831401586532593),
 ('adenoma', 0.6822806000709534)]

This is an example that different embeddings have different performance, and one can do a qualitative and a quantitative evaluation of models and training corpus

This piece of code will not run in the cloud, but you can try more complete embeddings in your computer. 

For example, you can download the Spanish Billion Word Corpus from https://github.com/dccuchile/spanish-word-embeddings

model = gensim.models.KeyedVectors.load_word2vec_format('sbw_vectors.bin', binary=True)
model.wv.most_similar("cancer", topn=25