## Text vectorization with the FastText embedding model

Facebook provides general-purpose pre-trained word embedding models at [https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md](https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md)


When a word is **out of vocabulary** for Facebook's model, there are 2 strategies:
*  if the word corresponds to a **named entity** that we consider relevant to the classification, it is encoded as the vector of its hypernym/named entity, which is "known" by the FastText model
  * all identified drug names => **'médicament'**
  * all identified active ingredient names => **'ingrédient'**
  * all numerical tokens (eg: 10, 50, ..)=> **'nombre'**
  * all time duration tokens (eg: 10 jours, ...) => **'durée'**
  * all time duration tokens (eg: 10 jours, ...) => **'temps'**
  * all time tokens (eg: 5h, ...) => **'temps'**
  * all weight tokens (eg: 5mg, ...) => **'poids'**
  * all volume tokens (eg: 5ml, ...) => **'volume'**
* the fall-back is to encode remaining unknown words into a random numerical vector  

For the first strategy, we add a small variation to the hypernym vector, using deterministic pseudo-random values seeded with the word itself: this ensures that, for instance, a given drug name always gets the same vector, and that this vector stays close to the vector of its named entity ("médicament").

This specific vectorization scheme is persisted alongside the Facebook model at [../../pretrained_models/fasttext_embedding_extension.txt](../../pretrained_models/fasttext_embedding_extension.txt)

In [1]:
import pandas as pd
import numpy as np

# Number of dimensions of an embedding vector (i.e. len(embeddings_index['oui'])).
nbDims = 300

# Training set; its 'question' column is what the tokenizer is fitted on later.
XTrain = pd.read_csv(
    '../../data/staging_data/mispelling_fixed_clean_input_train.csv',
    sep=',',
)

In [2]:
import os, re, csv, math, codecs
from tqdm import tqdm

# Maps each token of the pretrained model to its vector (float32 numpy array).
embeddings_index = {}
# The context manager closes the file even when a line fails to parse.
with codecs.open('../../pretrained_models/fasttext/wiki.fr.vec', encoding='utf-8') as f:
    for lineNumber, line in enumerate(tqdm(f)):
        values = line.rstrip().rsplit(' ')
        if lineNumber == 0 and len(values) == 2:
            # The .vec text format starts with a "<vocabulary size> <dimension>"
            # header: it is not a word and must not end up in the index.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

1152466it [02:22, 8097.45it/s]


In [3]:
def computeEmbeddingBounds(vectors, dims, maxLimit=100.):
    '''Return the per-dimension (minValues, maxValues) of a collection of vectors.

    vectors  -- iterable of 1-D numerical sequences; entries whose length is not
                ``dims`` are ignored
    dims     -- expected number of coordinates per vector
    maxLimit -- coordinates greater than or equal to this value are treated as
                outliers and ignored for the maximum (they still count for the
                minimum)

    Every dimension is seeded from the data itself, so a dimension whose values
    are all positive (or all negative) gets its real bounds instead of being
    clamped against 0. A dimension that never receives a usable value is reported
    as 0. NaN coordinates are ignored.
    '''
    minValues = np.full(dims, np.inf)
    maxValues = np.full(dims, -np.inf)
    for vector in vectors:
        vector = np.asarray(vector, dtype='float64')
        if vector.shape != (dims,):
            continue  # malformed entry
        # fmin/fmax skip NaN instead of propagating it
        np.fmin(minValues, vector, out=minValues)
        np.fmax(maxValues, np.where(vector < maxLimit, vector, -np.inf), out=maxValues)
    # dimensions that were never updated fall back to 0
    minValues[minValues == np.inf] = 0.
    maxValues[maxValues == -np.inf] = 0.
    return minValues, maxValues


# compute the min/max values per dimension for the random vector
if False: # switch to True to trigger the min/max recomputation
    minValues, maxValues = computeEmbeddingBounds(embeddings_index.values(), nbDims, maxLimit=100.)
    # save the embedding space boundaries into a file to avoid intensive recomputation
    pd.DataFrame(minValues).to_csv('../../data/staging_data/fasttext_minValues.txt', index=None, header=None)
    pd.DataFrame(maxValues).to_csv('../../data/staging_data/fasttext_maxValues.txt', index=None, header=None)

In [4]:
# Reload the per-dimension bounds from the staging files: each file is read
# without a header and its first (only used) column is kept as a numpy array.
minVectorValues = pd.read_csv('../../data/staging_data/fasttext_minValues.txt', header=None)[0].values
maxVectorValues = pd.read_csv('../../data/staging_data/fasttext_maxValues.txt', header=None)[0].values

In [5]:
from keras.preprocessing.text import Tokenizer

# Fit a Keras tokenizer, configured with num_words=vocabulary_size, on the
# 'question' column of the training set.
vocabulary_size = 10000
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(XTrain['question'])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
import re
from collections import Counter

def words(text):
    '''Return, in order, the lower-cased runs of word characters found in text.'''
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]

# Per-dimension width of the embedding space (max - min of the stored bounds).
# The former np.zeros(nbDims) initialisation was overwritten right away, so it is dropped.
vectorAmplitudes = maxVectorValues - minVectorValues

def addNoise2Vector(word, vector, randomVectorAmplitudes):
    '''Return vector plus a small pseudo-random variation that depends only on word.

    word                   -- seed of the variation: the same word always yields
                              the same noise
    vector                 -- base vector (sequence of numbers)
    randomVectorAmplitudes -- per-dimension amplitude a; the noise of a dimension
                              is drawn uniformly in [-a, a]. It must have the same
                              length as vector.

    Returns the noisy vector as a list of floats.

    The noise comes from a private generator seeded with the word, so the state
    of the module-level random generator (and any seed the caller set on it) is
    left untouched.
    '''
    # imported locally so the function does not depend on another cell's import
    import random

    wordRandom = random.Random(word)
    noise = np.array([wordRandom.uniform(-amplitude, amplitude)
                      for amplitude in randomVectorAmplitudes])
    return (np.array(vector) + noise).tolist()

# Token counts of the drug and ingredient name lists, tokenized with words().
# Each file is read inside a context manager so its handle is closed right away.
# NOTE(review): the files are decoded with the platform default encoding - confirm
# it matches how they were written.
with open('../../data/staging_data/drug_names.txt') as drugFile:
    drugNames = Counter(words(drugFile.read()))
with open('../../data/staging_data/ingredient_names.txt') as ingredientFile:
    ingredientNames = Counter(words(ingredientFile.read()))

In [7]:
import random

# word -> vector (list of floats) for the tokenizer words missing from embeddings_index
custom_embeddings = {}

ingredientCount = 0
drugCount = 0

random.seed(1971)
# Dedicated generator for the fall-back vectors: addNoise2Vector may re-seed the
# module-level generator with each word, which would make the fixed seed above
# ineffective for the vectors drawn below.
fallbackRandom = random.Random(1971)

# 0.25% random variation
randomVectorAmplitudes = vectorAmplitudes * 0.0025

# hypernym vectors, looked up once instead of once per out-of-vocabulary word
drugVector = embeddings_index.get('médicament')
ingredientVector = embeddings_index.get('ingrédient')

for word in tokenizer.word_index:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None and len(embedding_vector) > 0:
        continue  # the word already has a pretrained vector

    # out of vocab
    if word in drugNames:
        custom_embeddings[word] = addNoise2Vector(word, drugVector, randomVectorAmplitudes)
        drugCount += 1
    elif word in ingredientNames:
        custom_embeddings[word] = addNoise2Vector(word, ingredientVector, randomVectorAmplitudes)
        ingredientCount += 1
    else:
        # random vector, drawn uniformly inside the per-dimension bounds
        custom_embeddings[word] = [
            float(fallbackRandom.uniform(minVectorValues[dimIndex], maxVectorValues[dimIndex]))
            for dimIndex in range(nbDims)
        ]

In [8]:
# Whatever is left in the extension once the drug and ingredient matches are
# subtracted was encoded with the random fall-back.
randomCount = len(custom_embeddings) - (drugCount + ingredientCount)
summaryTemplate = "drugCount={0} ; ingredientCount={1} ; randomCount={2}"
print(summaryTemplate.format(drugCount, ingredientCount, randomCount))

drugCount=673 ; ingredientCount=9 ; randomCount=796


Roughly half of the out-of-vocabulary words (682 out of 1478) are not encoded randomly: drug and ingredient entities are represented as semantically close data points in the embedding space

In [9]:
# %load ../utils/serializer.py

import csv

def saveEmbeddingVector(vectors, fileName, encoding=None):
    ''' save a dict of numerical array

    vectors  -- dict mapping a key to an iterable of numbers
    fileName -- path of the CSV file to (over)write
    encoding -- text encoding of the file; None (the default) keeps the platform
                default. Pass an explicit value such as 'utf-8' to get the same
                file on every platform when keys hold non-ASCII characters.

    One row is written per entry: the key, then the values joined with ','
    into a single CSV field.
    '''
    # the with statement closes the file: no explicit close() is needed
    with open(fileName, 'w', newline='', encoding=encoding) as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows(
            [key, ",".join(str(i) for i in value)]
            for key, value in vectors.items()
        )

def loadEmbeddingVector(fileName, encoding=None):
    ''' load a dict of numerical array

    fileName -- path of a CSV file whose rows hold a key followed by one field
                containing the ','-joined values
    encoding -- text encoding of the file; None (the default) keeps the platform
                default

    Returns a dict mapping each key to a list of floats. Blank rows are skipped
    and an empty value field yields an empty list. A row that does not have
    exactly two fields, or a value that is not a number, raises ValueError.
    '''
    vectors = {}
    # newline='' lets the csv module do its own newline handling
    with open(fileName, 'r', newline='', encoding=encoding) as csv_file:
        for row in csv.reader(csv_file):
            if not row:
                continue  # blank line
            key, packedValues = row
            vectors[key] = [float(x) for x in packedValues.split(',')] if packedValues else []
    return vectors

In [10]:
# Persist the custom vectors (the embedding extension) to the file system.
extensionFileName = '../../pretrained_models/fasttext_embedding_extension.txt'
saveEmbeddingVector(custom_embeddings, extensionFileName)