## Text Cleaner
Use regular expressions and the NLTK stopwords corpus to clean up the text.

In [35]:
import re
import nltk
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
    
# Translation table that deletes every ASCII punctuation character.
# str.translate needs a mapping built by str.maketrans; passing the raw
# string.punctuation string would instead be indexed by code point and
# silently remap control characters (e.g. "\n" -> "+") without removing
# any punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Lazily-populated cache of the NLTK French stop words, so the corpus is
# read once rather than on every cleanText call.
_FRENCH_STOPWORDS = None


def _french_stopwords():
    """Return the NLTK French stop words as a frozenset, loading them once."""
    global _FRENCH_STOPWORDS
    if _FRENCH_STOPWORDS is None:
        _FRENCH_STOPWORDS = frozenset(stopwords.words("french"))
    return _FRENCH_STOPWORDS


def cleanText(text):
    """Normalize a French sentence into a lower-case, space-separated string.

    Steps, in order: expand a few abbreviations, drop unwanted symbols,
    strip digits glued to words (``6jours`` -> ``jours``), delete ASCII
    punctuation, lower-case, and discard French stop words as well as
    tokens shorter than three characters.

    Args:
        text: the raw sentence, as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces, or ``" "`` (one space)
        when nothing survives the cleaning, so the result is never empty.
    """
    # Expand abbreviations and normalize question marks.
    text = re.sub(r"\?\?", "?", text)
    text = re.sub(r"qu'", "que ", text)
    text = re.sub(r"n°", "numéro", text)
    # Detach a question mark from the preceding word so the two words around
    # it are not merged once punctuation is deleted.
    text = re.sub(r"([^\s])\?", r"\1 ?", text)

    # Prune out undesired tokens.
    # \b keeps words that merely end in "les" (e.g. "pilules") intact.
    text = re.sub(r"\bles ", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"[\"()]", "", text)
    text = re.sub(r"-", " ", text)
    # Drops the mark together with the single letter before it, which
    # removes elided articles ("l'homme" -> "homme").
    # NOTE(review): this also eats the last letter before "®"/"™"
    # ("Doliprane®" -> "Dolipran") — confirm whether that is intended.
    text = re.sub(r"([A-Za-zèéàê])('|’|®|™)", "", text)

    # Mixed numbers+letters like "6jours" or "jours6": keep only the letters.
    text = re.sub(r"([0-9]+)([A-Za-zèéàêô]+)", r"\2", text)
    text = re.sub(r"([A-Za-zèéàêô]+)([0-9]+)", r"\1", text)

    # Remove punctuation.
    text = text.translate(_PUNCTUATION_TABLE)

    # Lower-case, tokenize on whitespace, then drop stop words and very
    # short tokens.
    stops = _french_stopwords()
    tokens = [w for w in text.lower().split() if w not in stops and len(w) >= 3]

    # No stemming: FastText can cope with morphological variability.

    # Never return an empty string; fall back to a single space.
    return " ".join(tokens) or " "

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jacques\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
import pandas as pd
# Load the raw training set (semicolon-separated CSV).
XTrain = pd.read_csv('../../data/POSOS/input_train.csv', sep=';')

# Widen column display so long questions are not truncated when inspected.
pd.set_option("display.max_colwidth", 300)

# Clean every question in place; cleanText takes a single string, so it can
# be passed to map directly.
XTrain['question'] = XTrain['question'].map(cleanText)

# Persist the cleaned data set, without the index column.
XTrain.to_csv('../../data/staging_data/clean_input_train.csv', index=None, sep=";")
