## PREPROCESSING

### Imports

In [1]:
import pandas as pd
import numpy as np
import re, spacy, unidecode, warnings
from tqdm import tqdm, tqdm_notebook
from common_words import common_words

In [2]:
%%time
# Silence every warning for the rest of the kernel session (global side effect).
warnings.filterwarnings('ignore')
# Load the French `fr_core_news_md` spaCy pipeline; `preprocessing` relies on
# this module-level `nlp` object.
nlp = spacy.load("fr_core_news_md")
# Register `progress_map` / `progress_apply` on pandas objects.
# The previous call passed `tqdm_notebook` positionally; tqdm forwards positional
# arguments to the bar constructor (where it became the unused `iterable`), so
# it never selected the notebook widget. Calling without arguments keeps the
# same plain-text bar without the misleading argument.
tqdm.pandas()

CPU times: user 27.6 s, sys: 869 ms, total: 28.4 s
Wall time: 29.5 s


### Database loading

In [4]:
%%time
# Load the dataframe from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle('../data/data.pkl')

CPU times: user 1.5 s, sys: 770 ms, total: 2.27 s
Wall time: 3.04 s


In [5]:
# Draw 10 rows for a quick dry run of the tokeniser.
# `random_state` pins the draw so a fresh "Restart & Run All" selects the same
# rows and reproduces the outputs shown below.
sample = df.sample(n=10, random_state=42)

In [6]:
def preprocessing(text):
    """Turn one document into a list of normalised, filtered tokens.

    Steps, in order:
      1. Tokenise ``text`` with the module-level spaCy pipeline ``nlp``.
      2. Keep tokens longer than 2 characters whose stripped text is not
         empty; strip, lowercase and ASCII-fold them with ``unidecode``.
      3. Drop tokens flagged ``is_stop`` in ``nlp.vocab`` and tokens listed in
         ``common_words``.
      4. Keep purely alphabetic tokens, plus any token that immediately
         follows the token ``'article'`` (so a non-alphabetic token right
         after "article" survives).

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving tokens, in document order.
    """
    # NOTE(review): the previous version computed re.sub('\W', ' ', text) but
    # discarded the result, so `nlp` always received the raw text. The dead
    # call is removed here, which leaves the output unchanged; confirm whether
    # the punctuation-stripped text was meant to be fed to `nlp` instead.
    doc = nlp(text)

    normalised = [
        unidecode.unidecode(str(token).strip().lower())
        for token in doc
        if len(token) > 2 and str(token).strip() != ''
    ]

    # NOTE(review): the stop-word lookup runs on the accent-stripped form, so
    # accented stop words may no longer match the vocabulary — verify.
    content = [
        token for token in normalised
        if not nlp.vocab[token].is_stop and token not in common_words
    ]

    # `i > 0` prevents index -1 from wrapping around to the last token: without
    # it, a non-alphabetic first token was kept whenever the document happened
    # to end with 'article'.
    return [
        token for i, token in enumerate(content)
        if token.isalpha() or (i > 0 and content[i - 1] == 'article')
    ]

In [7]:
# Dry run: tokenise only the 10 sampled documents before the full corpus.
test = sample['CONTENU'].progress_map(preprocessing)
# Number of tokens produced for the first sampled document.
len(test.iloc[0])

100%|██████████| 10/10 [00:05<00:00,  1.82it/s]


1426

In [11]:
# Tokenise every document and store the result in a new `tokens` column.
# Long-running: the recorded run took roughly 6h20 for 67,210 rows.
df['tokens'] = df['CONTENU'].progress_map(preprocessing)

100%|██████████| 67210/67210 [6:19:48<00:00,  2.95it/s]   


In [12]:
# Persist the dataframe, including the new `tokens` column, to a pickle in the
# current working directory so the tokenisation does not have to be re-run.
df.to_pickle('./text_tokens.pkl')

In [13]:
# List the working directory; presumably a check that text_tokens.pkl was written.
!ls

CLEANING.ipynb      PREPROCESSING.ipynb common_words.py     test.pkl
EDA.ipynb           [1m[36m__pycache__[m[m         doc2vec.ipynb       text_tokens.pkl


Unnamed: 0,ID,NUMERO_AFFAIRE,DATE_DEC,NATURE,SIEGE_APPEL,CONTENU,SCT,tokens
0,JURITEXT000027340904,13/00024,2013-04-16,ARRET,RENNES,\n ARRET No 13/ 129 \n du 16 Avril 2013 \n \...,,"[assistance, educative, andrea, kevin, attaque..."
1,JURITEXT000034005466,16/00553,2017-01-30,ARRET,BASSE_TERRE,\n \n VS-BR \n COUR D'APPEL DE BASSE-TERRE \...,,"[basse, terre, deferee, refere, pointe, pitre,..."
2,JURITEXT000033292077,16/00044,2016-10-21,ARRET,LIMOGES,\n \n No \n Dossier no 16/ 44 \n \n \n COUR ...,,"[limoges, stephane, limoges, colomer, limoges,..."
3,JURITEXT000006942297,2002/36774,2003-09-15,,PARIS,N° Répertoire Général : 02/36774 \n Sur appe...,"CONTRAT DE TRAVAIL, EXECUTION - Salaire - Paie...","[creteil, industrie, page, pages, cause, gille..."
4,JURITEXT000025232527,10/01738,2011-11-08,ARRET,ANGERS,\n COUR D'APPEL D'ANGERS Chambre Sociale \n...,,"[angers, mans, enregistree, jose, coulaines, m..."
...,...,...,...,...,...,...,...,...
67205,JURITEXT000028331961,11/03002,2013-12-10,ARRET,ANGERS,\n COUR D'APPEL d'ANGERS Chambre Sociale \n...,,"[angers, premiere, formation, paritaire, mans,..."
67206,JURITEXT000020268944,08/01403,2009-01-14,ARRET,BORDEAUX,\n Dossier n 08 / 01403 \n SB \n Arrêt no : ...,,"[christophe, correctionnelle, correctionnel, b..."
67207,JURITEXT000030302139,14/01848,2015-02-23,ARRET,BASSE_TERRE,\n \n \n VF-BR \n \n COUR D'APPEL DE BASSE-TE...,,"[basse, terre, deferee, securite, guadeloupe, ..."
67208,JURITEXT000037677084,18/129551,2018-11-29,ARRET,AIX_PROVENCE,COUR D'APPEL d'AIX-EN-PROVENCE [...] \n \n No...,,"[aix, provence, portalis, dbvb, plaidant, hann..."


In [16]:
# Reload the pickle written earlier in this notebook as a round-trip check.
# NOTE(review): unpickling can execute arbitrary code; only load files produced
# by a trusted run.
data2 = pd.read_pickle('text_tokens.pkl')