## PREPROCESSING

### Imports

In [5]:
import pandas as pd
import numpy as np
import re, spacy, unidecode, warnings
from tqdm import tqdm, tqdm_notebook
from common_words import common_words

In [6]:
%%time
warnings.filterwarnings('ignore')
nlp = spacy.load("fr_core_news_md")
tqdm.pandas(tqdm_notebook)

CPU times: user 27.1 s, sys: 1.25 s, total: 28.4 s
Wall time: 29.7 s


### Database loading

In [7]:
%%time
df = pd.read_pickle('../data/data.pkl')

CPU times: user 1.62 s, sys: 1.09 s, total: 2.71 s
Wall time: 3.94 s


### Preprocessing function

In [6]:
def preprocessing(text):
    tokens = re.sub('\W', ' ', text)
    doc = nlp(text)
    tokens = [unidecode.unidecode(str(token).strip().lower()) for token in doc if len(token) > 2 and str(token).strip() != '']
    tokens = [token for token in tokens if not nlp.vocab[token].is_stop and token not in common_words]
    tokens = [tokens[i] for i in range(len(tokens)) if tokens[i].isalpha() or tokens[i - 1] == 'article']
    return tokens

In [11]:
df['tokens'] = df.CONTENU.progress_map(preprocessing)

100%|██████████| 67210/67210 [6:19:48<00:00,  2.95it/s]   


In [12]:
df.to_pickle('./text_tokens.pkl')

In [2]:
def preprocessing_text(text):
    tokens = re.sub('\W', ' ', text)
    doc = nlp(text)
    tokens = [unidecode.unidecode(str(token).strip().lower()) for token in doc if str(token).strip() != '']
    tokens = [token for token in tokens if not nlp.vocab[token].is_stop]
    return tokens

In [9]:
test = preprocessing_text(df.CONTENU[0])

In [10]:
test

['arret',
 'no',
 '13/',
 '129',
 '16',
 'avril',
 '2013',
 'assistance',
 'educative',
 'andrea',
 'y',
 '...',
 'kevin',
 'y',
 '...',
 'date',
 'decision',
 'attaquee',
 ':',
 '27',
 'novembre',
 '2012',
 'decision',
 'attaquee',
 ':',
 'jugement',
 'juridiction',
 ':',
 'juge',
 'enfants',
 'brestcour',
 'appel',
 'rennes',
 'chambre',
 'speciale',
 'mineurs',
 'arret',
 'prononce',
 'mise',
 'disposition',
 'greffe',
 '16',
 'avril',
 '2013',
 'chambre',
 'speciale',
 'mineurs',
 'composition',
 'cour',
 ':',
 'debats',
 'audience',
 '22',
 'mars',
 '2013',
 'delibere',
 ':',
 'madame',
 'karine',
 'pontchateau',
 ',',
 'conseiller',
 'delegue',
 'protection',
 'enfance',
 'designee',
 'ordonnance',
 'president',
 'cour',
 'appel',
 'rennes',
 'date',
 '13',
 'juillet',
 '2012',
 ',',
 'presidant',
 'audience',
 ',',
 'mme',
 'raymonde',
 'letourneur',
 '-',
 'baffert',
 ',',
 'presidente',
 'chambre',
 ',',
 'm.',
 'pascal',
 'pedron',
 ',',
 'conseiller',
 ',',
 'ministere',
 'p

In [37]:
def preprocessing_text(text):
    text = re.sub('\W', ' ', text).lower().split()
    text = [unidecode.unidecode(word.strip()) for word in text if len(word) > 1]
    test = [word for word in text if not nlp.vocab[word].is_stop]
    return text

In [38]:
df['text_tokens'] = df.CONTENU.progress_map(preprocessing_text)

100%|██████████| 67210/67210 [09:28<00:00, 118.13it/s] 


In [41]:
df.text_tokens[0]

['arret',
 'no',
 '13',
 '129',
 'du',
 '16',
 'avril',
 '2013',
 'assistance',
 'educative',
 'andrea',
 'kevin',
 'date',
 'de',
 'la',
 'decision',
 'attaquee',
 '27',
 'novembre',
 '2012',
 'decision',
 'attaquee',
 'jugement',
 'juridiction',
 'juge',
 'des',
 'enfants',
 'de',
 'brestcour',
 'appel',
 'de',
 'rennes',
 'chambre',
 'speciale',
 'des',
 'mineurs',
 'arret',
 'prononce',
 'par',
 'mise',
 'disposition',
 'au',
 'greffe',
 'le',
 '16',
 'avril',
 '2013',
 'par',
 'la',
 'chambre',
 'speciale',
 'des',
 'mineurs',
 'composition',
 'de',
 'la',
 'cour',
 'lors',
 'des',
 'debats',
 'audience',
 'du',
 '22',
 'mars',
 '2013',
 'et',
 'du',
 'delibere',
 'madame',
 'karine',
 'pontchateau',
 'conseiller',
 'delegue',
 'la',
 'protection',
 'de',
 'enfance',
 'designee',
 'par',
 'ordonnance',
 'du',
 'premier',
 'president',
 'de',
 'la',
 'cour',
 'appel',
 'de',
 'rennes',
 'en',
 'date',
 'du',
 '13',
 'juillet',
 '2012',
 'presidant',
 'audience',
 'mme',
 'raymonde'

In [42]:
nlp.vocab['et'].is_stop

True