In [1]:
import pandas as pd
import spacy

import enchant
import re
import sys
import ipdb
import unidecode

from replacers import SpellingReplacer
from nltk.stem.snowball import FrenchStemmer

In [2]:
# Data source path 
data_source_path = "../../DATA/"

In [3]:
X = pd.read_csv(data_source_path+'input_train.csv', sep=";")
y = pd.read_csv(data_source_path+'output_train.csv', sep=";")

features = X.columns
targets = y['intention'].unique

In [4]:
features

Index(['﻿ID', 'question'], dtype='object')

In [5]:
X.head()

Unnamed: 0,﻿ID,question
0,0,"bonjour, je m suis trompé de forum pour ma qu..."
1,1,est ce que le motilium me soulagera contre les...
2,2,mon médecin m'a prescrit adenyl. au 2ème cache...
3,3,Est-ce qu'il existe une forme adaptée aux enfa...
4,4,mon medecin me soigne pour une rhino pha...


In [6]:
#import the list of medicaments
medicaments = [x[:-1] for x in open(data_source_path+"../DATA/medicaments.txt", "r").readlines()]
symptomes = [x[:-1] for x in open(data_source_path+"symptomes.txt", "r").readlines()]
maladies = [x[:-1] for x in open(data_source_path+"maladies.txt", "r").readlines()]
posologies = ["mg", "ml", "ch", "g", "l"]
replacer = SpellingReplacer(dict_name='fr_FR')
french_stemmer = FrenchStemmer()
lemmatizer = spacy.load('fr')

stop_words = [ 'ce', 'ces', 'de', 'des', 'du', 'elle', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'ma', 'mais', 'me', 'mes', 'moi', 'mon', 'nos', 'notre', 'nous', 'on', 'sa', 'se', 'ses', 'son', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'm', 'n', 's', 't', 'y']
etre = ['été','étée','étées', 'étés', 'étant', 'étante','étants', 'étantes','suis', 'es','est', 'sommes','êtes','sont', 'serai', 'seras', 'sera', 'serons',  'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent']
avoir = ['ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions',  'aviez', 'avaient', 'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez', 'eussent']

In [7]:
# Indicateur d'avancement 
total = len(X['question'])

# List of all the preprocessing done 

Almost in order :
- remove **ponctuation**
- remove all **stop words**
- test if word is in list of **drugs**
- test if word is a **posology**
- test if word is a **disease**
- test if word is a **symptom**
- test if word is an **hour**
- test if word is an **age**
- test if word is **ordinal number**
- replace a maximum of **typo mistakes** with pyenchant : 
    from a list of suggestions, take the most probable one, 
    test if it is a stop word 
    replace the word only if the most probable suggestion is more than 1 letter 
- **lemmatizing** of words 
- remove **accents**
- remove **short words** of less than 2 letters

In [8]:
def clean(word):
    word = word.lower()
    heure_re = r'[0-9]{1,2}h[0-9]{,2}$'
    ordinal_re = r'[0-9]+(er|ème|ère|eme|ere)$'
    age_re = r'[0-9]+(ans|an)$'
    posologie_re = r'[0-9]+(mg|g|l|ml|ch|milligramme|milligrammes|milligramm)$'
    if word in etre: 
        word = 'être'
    elif word in avoir:
        word = 'avoir'
    elif word == '+':
        word = 'plus'
    elif word == '=':
        word = 'égal'    
    elif word in medicaments:
        word = "médicament" #"<MEDICAMENT>" 
        count['medicaments']+=1
    elif word in posologies: 
        word = "composition" #"<POSOLOGIE>"
        count['doses']+=1
    elif re.match(posologie_re,word):
        word = "composition" #"<COMPOSITION>" 
        count['doses']+=1
    elif word in maladies:
        word = "maladie" #"<MALADIE>"
        count['maladies']+=1
    elif word in symptomes:
        word = "symptôme" #"<SYMPTOME>" 
        count['symptomes']+=1
    elif re.match(heure_re,word):
        word = "heure" #"<HEURE>" 
        count['heures']+=1
    elif word == 'DCI':
        word = "dénomination commune internationale"
    elif re.match(age_re,word):
        word = "âge" #"<AGE>" 
        count['ages']+=1
    elif re.match(ordinal_re,word):
        word = "ordinal" #"<ORDINAL>"
        count['ordinal']+=1
    else:
        suggestion = replacer.replace(word)
        if suggestion != word: 
            count['corrections']+=1
            valid_suggestion = re.sub(r'[^\w]', ' ', suggestion).split()
            if valid_suggestion[len(valid_suggestion)-1] not in stop_words:
                word = valid_suggestion[len(valid_suggestion)-1].lower()
    word = french_stemmer.stem(word)
    #return unidecode.unidecode(word)
    return word
    
def preprocess(row):
    sentence = ''.join([re.sub(r'[^\w\+=]', ' ', word) for word in row['question']]) 
    word_list = [clean(word) for word in sentence.split() if (word not in stop_words)]
    #lemma_list = [unidecode.unidecode(word.lemma_) for word in lemmatizer(' '.join(word for word in word_list))]
    preprocessed_sentence = ' '.join(word for word in word_list if(not re.match( r'^[0-9]*$',word)) )
    
    if row['﻿ID']%100 == 0:
        print(" row {} / {}".format(row['﻿ID'], total),end='\r')
    return preprocessed_sentence

def output_format(row):
    return row[0]

In [9]:
count = {'medicaments' : 0,'doses': 0,'maladies': 0,'symptomes' :0, "corrections": 0, "ages":0, "heures":0, "ordinal":0}
X_sub = X.iloc[:10]
X_clean = X.apply(preprocess, axis = 1)
print(count)

{'heures': 168, 'doses': 648, 'corrections': 6686, 'symptomes': 311, 'ages': 40, 'ordinal': 180, 'maladies': 1830, 'medicaments': 9937}


In [10]:
X_clean[2]

'médecin médic prescr médic au ordinal cachet malad têt terribl et au ordinal malad symptôm froid chaleur intens dan têt symptôm fourmill dan levr supérieur difficult à respir des arrêt médic tous le symptôm avoir disparu cel être déjà arriv à quelqu'

# Do the same for testing

In [11]:
X_test = pd.read_csv(data_source_path+'input_test.csv', sep=";")
total = len(X_test['question']) + len(X['question'])
count = {'medicaments' : 0,'doses': 0,'maladies': 0,'symptomes' :0, "corrections": 0, "ages":0, "heures":0, "ordinal":0}
X_test_clean = X_test.apply(preprocess, axis = 1)
print(count)

{'heures': 39, 'doses': 172, 'corrections': 1589, 'symptomes': 70, 'ages': 10, 'ordinal': 33, 'maladies': 450, 'medicaments': 2515}


# Saving

In [12]:
strat = "stemming"

In [13]:
X_clean.to_csv(data_source_path+'input_train/'+strat+'/clean_input_to_train.csv', sep=';', header = ['question'])
X_test_clean.to_csv(data_source_path+'input_test/'+strat+'/clean_input_to_test.csv', sep=';', header = ['question'])