## Text Cleaner
Use custom regular expressions and the NLTK stop-word list to clean up the text.

In [111]:
import re
import nltk
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally before it is read.
nltk.download('stopwords')

# French Snowball stemmer; cleanText() applies it when useStemming is true.
stemmer = SnowballStemmer('french')

# Sentence punctuation that cleanText() replaces with a space: . ! ? : ; , "
# Raw strings avoid the invalid-escape warnings the non-raw literal produced;
# the single capturing group is kept so findall()/split() behave as before.
punctationRegEx = re.compile(r'([.!?:;,"])')
# One or more consecutive ASCII digits.
numberRegEx = re.compile(r"([0-9]+)")
    
def cleanText(text, useStemming=False, useStopWords=True, removePunctuation=True):
    """Normalise a French free-text string.

    The stages run in this order: lower-casing, expansion of a few
    abbreviations, spacing and collapsing of sentence punctuation, replacement
    of separator symbols by spaces, removal of single-letter elisions
    (l', d', ...), removal of digits, then the optional punctuation,
    stop-word and stemming stages, and finally whitespace compaction.

    Args:
        text: String to clean.
        useStemming: When true, replace every word by its stem using the
            module-level `stemmer`.
        useStopWords: When true, drop every token contained in the
            module-level `stopWords` collection, which must be defined
            before the call.
        removePunctuation: When true, replace the characters matched by the
            module-level `punctationRegEx` with spaces.

    Returns:
        The cleaned string, or a single space when nothing is left.
    """
    text = text.lower()

    # Expand abbreviations into their canonical form.
    # Both apostrophe styles are accepted, like in the elision step below.
    text = re.sub(r"qu['’]", "que ", text)
    text = re.sub(r"qu ", "que ", text)
    text = re.sub(r"n°", "numéro ", text)
    text = re.sub(r"bcp", "beaucoup", text)
    # The dots are escaped so that only the literal abbreviation matches.
    text = re.sub(r"s\.v\.p", "s'il vous plait", text)
    # \b keeps words that merely end in "tt" (e.g. "watt") intact.
    text = re.sub(r"\btt ", "tout ", text)
    text = re.sub(r"\+", " et ", text)

    # Sticky question mark: insert a space in front of it.
    text = re.sub(r"([^\s])[?]", r"\1 ?", text)
    # Runs of ?, . or ! of any length collapse to their last character plus a space.
    text = re.sub(r"(\?|\.|!){2,}", r"\1 ", text)

    # Separators and special characters become spaces.
    text = re.sub(r"[®™°/\-*•=()%{}]", " ", text)

    # Drop single-letter elisions such as l', m', d' (straight or curly
    # apostrophe); the text is already lower-cased at this point.
    text = re.sub(r"\s[a-z][’']", " ", text)
    text = re.sub(r"^[a-z][’']", "", text)

    # Remove numbers.
    text = numberRegEx.sub("", text)

    if removePunctuation:
        text = punctationRegEx.sub(" ", text)

    # Stop words are filtered before stemming, so the lookup sees whole words.
    if useStopWords:
        text = " ".join(word for word in text.split() if word not in stopWords)

    if useStemming:
        text = " ".join(stemmer.stem(word) for word in text.split())

    # Compact successive whitespace characters into one space.
    text = re.sub(r"\s+", " ", text)

    # Never return an empty string; callers get a single space instead.
    if not text:
        text = " "

    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jacques\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [112]:
import pandas as pd

def cleanTexts(intputFileName, outputFileName, excludeStopWords, removePunctuation, useStemming):
    """Clean the 'question' column of a ';'-separated CSV file and save the result.

    Args:
        intputFileName: Path of the CSV file to read; it must contain a
            'question' column.
        outputFileName: Path the cleaned CSV file is written to (';'
            separator, no index column).
        excludeStopWords: Iterable of words that must be kept even though
            they are French stop words, or None to use the full NLTK list.
        removePunctuation: Forwarded to cleanText().
        useStemming: Forwarded to cleanText().
    """
    # cleanText() looks `stopWords` up at module level, so the set built here
    # has to be published as a global; as a local it would be invisible there.
    global stopWords
    stopWords = set(stopwords.words("french"))

    if excludeStopWords is not None:
        # difference_update() ignores words that are not in the set, whereas
        # remove() would raise KeyError on the first unknown word.
        stopWords.difference_update(excludeStopWords)

    XTrain = pd.read_csv(intputFileName, sep=';')
    # Keyword arguments are required here: passed positionally,
    # removePunctuation would land in cleanText()'s third parameter
    # (useStopWords) and the punctuation flag would be ignored.
    XTrain['question'] = XTrain['question'].map(
        lambda text: cleanText(
            text,
            useStemming=useStemming,
            removePunctuation=removePunctuation,
        )
    )
    XTrain.to_csv(outputFileName, index=False, sep=";")

In [113]:
# Both runs read the same raw training file; only the destination and the
# cleaning options differ.
for _cleaningRun in (
    {
        'outputFileName': '../../data/staging_data/stemmed_clean_input_train.csv',
        # 'ou', 'et' and 'avec' stay in the text: later text statistics need them
        'excludeStopWords': ['ou', 'et', 'avec'],
        'removePunctuation': False,
        'useStemming': True,
    },
    {
        'outputFileName': '../../data/staging_data/clean_input_train.csv',
        'excludeStopWords': None,
        'removePunctuation': False,
        'useStemming': False,
    },
):
    cleanTexts(intputFileName='../../data/POSOS/input_train.csv', **_cleaningRun)