# Word Embeddings and Classifiers

Modules loading

In [1]:
import numpy as np #linear algebra
import nltk #natural language processing tools
import gensim #neural word embedding training
import re #regular expressions
import logging #verbosity
import csv #data loading
import sklearn.ensemble #classifier
import sklearn.metrics #performance assessment
import sklearn.model_selection #train-test split



Downloading extra data for nltk

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

Adding some verbosity to the embedding training session

In [3]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Loading spanish stopwords list from nltk

In [4]:
stopwords = nltk.corpus.stopwords.words('spanish')

Some utility functions

In [5]:
def normalizer(text): #normalizes a given string to lowercase and changes all vowels to their base form
    text = text.lower() #string lowering
    text = re.sub(r'[^A-Za-zñáéíóú]', ' ', text) #replaces every punctuation with a space
    text = re.sub('á', 'a', text) #replaces special vowels to their base forms
    text = re.sub('é', 'e', text)
    text = re.sub('í', 'i', text)
    text = re.sub('ó', 'o', text)
    text = re.sub('ú', 'u', text)
    return text

In [29]:
def vectorizer(text, model): #returns a vector representation from a list of words and a given model
    vectors = []
    for i in text:
        try:
            vectors.append(model.wv[i])
        except:
            pass
    return(np.mean(vectors,axis=0))

## Word Embedding

Loading the corpus dataset and taking each line to an array of sentences

In [7]:
sentences = []
with open('corpus.txt', encoding='utf-8') as file: #use utf-8 to preserve special characters
    for line in file:
        sentences.append((line.rstrip())) #use strip to remove \n (newline)

Taking every sentence and normalizing it

In [8]:
normalized_sentences = [normalizer(sentence) for sentence in sentences]

Tokenizing every sentence into words

In [9]:
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in normalized_sentences]

Removing stopwords

In [10]:
without_stopwords_sentences = []
for sentence in tokenized_sentences:
    without_stopwords_sentence = [word for word in sentence if word not in stopwords]
    without_stopwords_sentences.append(without_stopwords_sentence)

Training the neural word embeddings model

In [11]:
model = gensim.models.Word2Vec(without_stopwords_sentences)

2019-06-03 12:53:26,507 : INFO : collecting all words and their counts
2019-06-03 12:53:26,510 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-06-03 12:53:26,554 : INFO : PROGRESS: at sentence #10000, processed 26393 words, keeping 2174 word types
2019-06-03 12:53:26,583 : INFO : PROGRESS: at sentence #20000, processed 53049 words, keeping 2795 word types
2019-06-03 12:53:26,607 : INFO : PROGRESS: at sentence #30000, processed 80100 words, keeping 3189 word types
2019-06-03 12:53:26,667 : INFO : PROGRESS: at sentence #40000, processed 107473 words, keeping 3528 word types
2019-06-03 12:53:26,700 : INFO : PROGRESS: at sentence #50000, processed 134970 words, keeping 3783 word types
2019-06-03 12:53:26,748 : INFO : PROGRESS: at sentence #60000, processed 162957 words, keeping 3986 word types
2019-06-03 12:53:26,778 : INFO : PROGRESS: at sentence #70000, processed 191225 words, keeping 4140 word types
2019-06-03 12:53:26,805 : INFO : PROGRESS: at sentence #

Retrieving the most similar words to a given word

In [12]:
model.wv.most_similar('cancer')

2019-06-03 12:53:34,272 : INFO : precomputing L2-norms of word weight vectors


[('metastasis', 0.888931393623352),
 ('avanzado', 0.871686577796936),
 ('adenoma', 0.8714392185211182),
 ('adenocarcinoma', 0.8571346998214722),
 ('infiltrante', 0.8364208340644836),
 ('cervix', 0.8173503279685974),
 ('indeterminado', 0.8064922094345093),
 ('oseas', 0.804007887840271),
 ('evacuador', 0.802149772644043),
 ('significado', 0.8013938069343567)]

## Classifier

Loading the raw dataset and extracting the features and labels

In [13]:
diagnostics = [] #classifier raw features
specialties = [] #classifier raw labels
with open('data.csv', encoding='utf-8') as file:
    data = csv.DictReader(file)
    for row in data:
        diagnostics.append(row['diagnostic'])
        specialties.append(row['specialty'])

Preprocessing

In [14]:
diagnostics_normalized = [normalizer(diagnostic) for diagnostic in diagnostics]

In [15]:
diagnostics_tokenized = [nltk.word_tokenize(diagnostic) for diagnostic in diagnostics_normalized]

In [16]:
diagnostics_wihout_stopwords = []
for diagnostic in diagnostics_tokenized:
    diagnostic_wihout_stopwords = [word for word in diagnostic if word not in stopwords]
    diagnostics_wihout_stopwords.append(diagnostic_wihout_stopwords)

Creating an empty matrix to store the encoded features

In [17]:
diagnostics_matrix = np.zeros((len(diagnostics_wihout_stopwords), len(model.wv['cancer'])))

Filling the matrix with the vectorized diagnostics

In [18]:
for i,diagnostic in enumerate(diagnostics_wihout_stopwords):
    vector = vectorizer(diagnostic,model)
    diagnostics_matrix[i,] = vector

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Creating an empty vector to store the encoded labels

In [19]:
specialties_vector = np.zeros((len(specialties), 1))

Filling the vector with the encoded specialties

In [30]:
for i,specialty in enumerate(specialties): #OFTALMOLOGIA is encoded as a 1.0 and TRAUMATOLOGIA as a 2.0
    if specialty == 'OFTALMOLOGIA':
        specialties_vector[i,] = 1
    else:
        specialties_vector[i,] = 2

Concatenating the encoded features ans labels

In [21]:
data_matrix = np.concatenate([diagnostics_matrix,specialties_vector], axis=1)

Removing NAs from the matrix

In [22]:
data_matrix_without_nan = data_matrix[~np.isnan(data_matrix).any(axis=1)]

Splitting the dataset into training and testing subsets

In [23]:
diagnostics_train, diagnostics_test, specialties_train, specialties_test = sklearn.model_selection.train_test_split(
    data_matrix_without_nan[:,:100],
    data_matrix_without_nan[:,100],
    test_size=0.33,
    random_state=42
)

Training a Random Forest Classifier

In [24]:
classifier = sklearn.ensemble.RandomForestClassifier()

In [25]:
classifier.fit(diagnostics_train,specialties_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Making predictions over the testing subset and measuring their performance

In [26]:
predictions = classifier.predict(diagnostics_test)
print(sklearn.metrics.classification_report(predictions, specialties_test))

              precision    recall  f1-score   support

         1.0       0.78      0.98      0.87      5614
         2.0       0.98      0.82      0.89      8541

   micro avg       0.88      0.88      0.88     14155
   macro avg       0.88      0.90      0.88     14155
weighted avg       0.90      0.88      0.88     14155



Manual testing of the classifier

In [27]:
def specialtyClassifier(diagnostic):
    try:
        stringNorm = normalizer(diagnostic)
        stringTokenized = nltk.word_tokenize(stringNorm)
        stringVec = vectorizer(stringTokenized,model)
        result = classifier.predict(stringVec.reshape(1, -1))[0]
        return(result)
    except:
        return(np.nan)

In [28]:
specialtyClassifier('vicio de refracción')

1.0