In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer

import enchant
import re
import sys
import ipdb
import unidecode

from replacers import SpellingReplacer

In [2]:

# Load the training questions (X) and their labels (y); both files are ';'-separated.
# NOTE(review): the rendered column index shows the first label of X with an invisible
# leading character (presumably a BOM kept from the CSV) — later cells index by that
# exact label, so it is left untouched here.
X = pd.read_csv('DATA/input_train.csv', sep=";")
y = pd.read_csv('DATA/output_train.csv', sep=";")

features = X.columns
# `unique` is a method: it has to be called to obtain the array of distinct labels.
# Without the parentheses `targets` would only hold the bound method object.
targets = y['intention'].unique()

In [3]:
# Display the column labels of the training inputs.
features

Index(['﻿ID', 'question'], dtype='object')

In [4]:
# Preview the first rows of the training inputs.
X.head()

Unnamed: 0,﻿ID,question
0,0,"bonjour, je m suis trompé de forum pour ma qu..."
1,1,est ce que le motilium me soulagera contre les...
2,2,mon médecin m'a prescrit adenyl. au 2ème cache...
3,3,Est-ce qu'il existe une forme adaptée aux enfa...
4,4,mon medecin me soigne pour une rhino pha...


In [5]:
# Load the domain lexicons (one entry per line) and set up the NLP helpers.
def _read_lexicon(path):
    """Return the lines of the text file at `path`, without their line terminators.

    The file is opened in a `with` block so the handle is closed deterministically,
    and only the trailing newline is stripped: a final line that lacks a newline
    keeps its last character (slicing with `[:-1]` would truncate it).
    """
    with open(path, "r") as handle:
        return [line.rstrip("\n") for line in handle]


medicaments = _read_lexicon("DATA/medicaments.txt")
symptomes = _read_lexicon("DATA/symptomes.txt")
maladies = _read_lexicon("DATA/maladies.txt")
# Dosage units matched as stand-alone tokens.
posologies = ["mg", "ml", "ch", "g", "l"]
stop_words = stopwords.words('french')
replacer = SpellingReplacer(dict_name='fr_FR')
stemmer = FrenchStemmer()

In [6]:
# Progress indicator: number of questions to process
total = X['question'].shape[0]

# List of all the preprocessing done 

Almost in order :
- remove **punctuation**
- remove all **stop words**
- test if word is in list of **drugs**
- test if word is a **posology**
- test if word is a **disease**
- test if word is a **symptom**
- test if word is an **hour**
- test if word is an **age**
- test if word is **ordinal number**
- correct as many **typos** as possible with pyenchant: 
    from the list of suggestions, take the most probable one, 
    test whether it is a stop word, 
    and replace the word only if that suggestion is longer than 1 letter 
- **Stemming** of words 
- remove **accents**
- remove **short words** of 2 characters or fewer

In [7]:
# Patterns recognised by clean(); compiled once instead of being rebuilt on every call.
_HEURE_RE = re.compile(r'[0-9]{1,2}h[0-9]{,2}')
_ORDINAL_RE = re.compile(r'[0-9/]+(er|ème|ère|eme|ere)')
_AGE_RE = re.compile(r'[0-9]+(ans|an)')
_POSOLOGIE_RE = re.compile(r'[0-9]+(mg|g|l|ml|ch)')


def clean(word):
    """Normalise a single token.

    The token is lower-cased, then handled by the first rule that applies:

    1. listed in `medicaments`                      -> "<MEDICAMENT>"
    2. listed in `posologies` or a number + unit    -> "<POSOLOGIE>"
    3. listed in `maladies`                         -> "<MALADIE>"
    4. listed in `symptomes`                        -> "<SYMPTOME>"
    5. hour pattern (e.g. "12h30")                  -> "<HEURE>"
    6. age pattern (e.g. "3ans")                    -> "<AGE>"
    7. ordinal pattern (e.g. "2ème")                -> "<ORDINAL>"
    8. otherwise: spell-correct with `replacer`, then stem with `stemmer`.

    Patterns are applied with `match`, i.e. anchored at the start of the token only.
    Each rule that fires increments the matching entry of the module-level `count`
    dict, which must exist before this function is called.

    Parameters
    ----------
    word : str
        One whitespace-delimited token.

    Returns
    -------
    str
        The placeholder or the stemmed word, passed through `unidecode.unidecode`.
    """
    word = word.lower()
    if word in medicaments:
        word = "<MEDICAMENT>"
        count['medicaments'] += 1
    elif word in posologies or _POSOLOGIE_RE.match(word):
        word = "<POSOLOGIE>"
        count['posologies'] += 1
    elif word in maladies:
        word = "<MALADIE>"
        count['maladies'] += 1
    elif word in symptomes:
        word = "<SYMPTOME>"
        count['symptomes'] += 1
    elif _HEURE_RE.match(word):
        word = "<HEURE>"
        count['heures'] += 1
    elif _AGE_RE.match(word):
        word = "<AGE>"
        count['ages'] += 1
    elif _ORDINAL_RE.match(word):
        word = "<ORDINAL>"
        count['ordinal'] += 1
    else:
        suggestion = replacer.replace(word)
        if suggestion != word:
            count['corrections'] += 1
            # A suggestion may contain several parts separated by non-word characters;
            # only the last part is considered as the replacement.
            parts = re.sub(r'[^\w]', ' ', suggestion).split()
            # Guard against a suggestion without any word character, which would
            # otherwise raise IndexError on the last-element lookup.
            candidate = parts[-1].lower() if parts else ''
            # Keep the original word when the candidate is a single letter or a
            # stop word; the stop-word test is done on the lower-cased form so that
            # capitalised suggestions are recognised too.
            if len(candidate) > 1 and candidate not in stop_words:
                word = candidate
        word = stemmer.stem(word)
    return unidecode.unidecode(word)

def preprocess(row):
    """Turn one row of the questions frame into a cleaned, space-joined string.

    Steps:
    - replace every non-word character of `row['question']` with a space;
    - drop stop words, compared case-insensitively so that capitalised stop
      words (e.g. at the start of a sentence) are removed as well;
    - normalise each remaining token with `clean`;
    - drop results of 2 characters or fewer and results starting with a digit.

    Every 100th row ID a progress line is printed, using the module-level `total`.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide 'question' and the ID column.

    Returns
    -------
    str
        The preprocessed sentence.
    """
    sentence = re.sub(r'[^\w]', ' ', row['question'])
    # str.split() never yields empty strings, so no extra emptiness test is needed.
    word_list = [
        clean(word)
        for word in sentence.split()
        if word.lower() not in stop_words
    ]
    preprocessed_sentence = ' '.join(
        word
        for word in word_list
        if len(word) > 2 and not re.match(r'[0-9]', word)
    )
    # The ID column label carries an invisible leading U+FEFF in the loaded frame;
    # it is spelled as an escape here so it is visible, and plain 'ID' is accepted
    # as a fallback in case the frame is loaded without that character.
    row_id = row['\ufeffID'] if '\ufeffID' in row else row['ID']
    if row_id % 100 == 0:
        print(" row {} / {}".format(row_id, total), end='\r')
    return preprocessed_sentence

def output_format(row):
    """Return the item stored under key/position 0 of `row`."""
    first_item = row[0]
    return first_item

In [8]:
# Reset the per-category counters that clean() increments, then preprocess every row of X.
count = dict.fromkeys(
    ['medicaments', 'posologies', 'maladies', 'symptomes', "corrections", "ages", "heures", "ordinal"],
    0,
)
# First 100 rows; note that the apply below runs on the full frame X, not on this subset.
X_sub = X.iloc[:100]
X_clean = X.apply(preprocess, axis=1)
print(count)

{'heures': 172, 'ages': 41, 'corrections': 6657, 'medicaments': 9942, 'maladies': 1833, 'symptomes': 311, 'ordinal': 184, 'posologies': 659}


In [9]:
# Spot-check the cleaned text of one question (entry 9).
X_clean[9]

'prend <MEDICAMENT> fin premi plaquet premi fois copain pilul donc bien efficac des debut <SYMPTOME> poitrin impress peu gross auss plus <MALADIE> bas vent dois minkiet simpl period adapt'

In [10]:
# Write the cleaned questions to a ';'-separated CSV, labelling the value column 'question'.
output_path = 'DATA/clean_data/cleaning_plus_stemming_input_train.csv'
X_clean.to_csv(output_path, sep=';', header=['question'])