## Misspelling Corrector
Fix the training text by substituting each identified misspelled word with its suggested fix: the word->fixed_word maps are generated by this [notebook](mispelling_map_builder.ipynb) and materialized in the staging_data CSV files.
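This notebook produces the [mispelling_fixed_clean_input_train.csv](../../data/staging_data/mispelling_fixed_clean_input_train.csv) file, as well as its stemmed counterpart, `mispelling_fixed_stemmed_clean_input_train.csv`, in the same directory.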

In [1]:
# First, load all the misspelling fix maps
import csv

def loadFixMap(fileName):
    """Load a misspelled-word -> corrected-word map from a two-column CSV file.

    Each non-empty row must hold exactly two fields: the misspelled word and
    its replacement. When the same misspelled word appears on several rows,
    the last row wins.

    Args:
        fileName: path of the CSV file to read.

    Returns:
        A dict mapping each misspelled word to its replacement.

    Raises:
        ValueError: if a non-empty row does not contain exactly two fields.
        OSError: if the file cannot be opened.
    """
    fixMap = {}
    # The context manager guarantees the file is closed; newline='' lets the
    # csv module handle line endings itself, as the csv documentation advises.
    with open(fileName, 'r', newline='') as csvFile:
        for row in csv.reader(csvFile):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one).
                continue
            k, v = row
            fixMap[k] = v
    return fixMap

# Load the three correction maps (drug names, ingredient names, general
# words) from their staging CSV files, in that order.
drugFixMap, ingredientFixMap, generalFixMap = (
    loadFixMap(csvPath)
    for csvPath in (
        '../../data/staging_data/mispelled_drug_names.csv',
        '../../data/staging_data/mispelled_ingredient_names.csv',
        '../../data/staging_data/mispelled_general_words.csv',
    )
)

In [2]:
import pandas as pd
import numpy as np
import re

def applyFixMap(text, fixMap):
    """Replace every whole-word occurrence of each key of fixMap in text.

    A key counts as a whole word when it is neither immediately preceded nor
    immediately followed by a character of the class [a-zA-Z0-9éèàôî]; the
    start and the end of the text both count as word boundaries. Keys are
    matched literally and replacements are inserted literally. Maps entries
    are applied one after the other, in the map's iteration order, each on
    the result of the previous substitution.

    Args:
        text: the string to correct.
        fixMap: dict mapping a misspelled word to its replacement.

    Returns:
        The corrected string.
    """
    wordChars = "a-zA-Z0-9éèàôî"
    # Lookarounds check the boundaries without consuming them, so a word at
    # the very end of the text, or repeated right after itself, is still fixed.
    notPrecededByWordChar = "(?<![" + wordChars + "])"
    notFollowedByWordChar = "(?![" + wordChars + "])"
    for word, fixedWord in fixMap.items():
        if not word:
            # An empty key would match at every boundary between separators.
            continue
        # re.escape: the key is plain text, not a regular expression.
        pattern = notPrecededByWordChar + re.escape(word) + notFollowedByWordChar
        # A callable replacement keeps backslashes in fixedWord literal
        # instead of having them parsed as template escapes/group references.
        text = re.sub(pattern, lambda _match, fix=fixedWord: fix, text)
    return text

def fixText(text):
    """Return text corrected with the drug, ingredient and general fix maps.

    The three module-level maps are applied in that order, each one working
    on the output of the previous one.
    """
    fixed = text
    for fixMap in (drugFixMap, ingredientFixMap, generalFixMap):
        fixed = applyFixMap(fixed, fixMap)
    return fixed

def fixTexts(inputFileName, outputFileName):
    """Correct the 'question' column of a CSV file and write the result.

    The input file is read with ';' as separator. The output file is written
    with the pandas default separator and without the index column.

    Args:
        inputFileName: path of the ';'-separated CSV file to read; it must
            contain a 'question' column.
        outputFileName: path of the CSV file to write.
    """
    XTrain = pd.read_csv(inputFileName, sep=';')
    # Empty cells are read by pandas as NaN (a float), which the regex
    # substitution cannot process; na_action='ignore' leaves them untouched.
    XTrain['question'] = XTrain['question'].map(fixText, na_action='ignore')
    XTrain.to_csv(outputFileName, index=False)

In [3]:
# Correct the plain cleaned training set, then its stemmed variant.
fixTexts(
    inputFileName='../../data/staging_data/clean_input_train.csv',
    outputFileName='../../data/staging_data/mispelling_fixed_clean_input_train.csv',
)
fixTexts(
    inputFileName='../../data/staging_data/stemmed_clean_input_train.csv',
    outputFileName='../../data/staging_data/mispelling_fixed_stemmed_clean_input_train.csv',
)