# Text Preprocessing

generate integer-indexed sentences, pos-tags and named entity tags, dictionaries for converting, etc, and save as `npy` binaries.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers.wrappers import Bidirectional
from keras.layers import Input, LSTM, Dropout, Embedding,Dense
from keras_contrib.layers import CRF
import json
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize 

Using TensorFlow backend.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sujiv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def from_ad_to_dataframe(line, nb_line):
    """Fonction qui à partir d'une ligne d'un fichier Json de créer une dataframe à 
    trois colonnes. La première colonne correspond au numéro de l'annonce, la seconde 
    contient les mots de l'annonce et la troisième les positions du mot dans l'annonce.""" 
    Vect_word=(word_tokenize(eval(line.strip().replace('\xa0',' '))["text"])) # Tokenisation
    nb_sent_list=list(map(int, nb_line*np.ones(len(Vect_word)))) # Numéro annonce
    # Position
    offset = 0                                                                  
    list_pos=list()
    for token in Vect_word:
        offset = eval(line.strip().replace('\xa0',' '))["text"].find(token, offset)
        list_pos.append([offset, offset+len(token)])
        offset += len(token)
    # Creation de la dataframe
    data={'Ad#':nb_sent_list,'Words':Vect_word,'Pos':list_pos}
    df=pd.DataFrame(data)
    return df

def clean_text(text):
    """Fonction qui permet de corriger les annotations qui surlignent un espace blanc"""
    if text[0]==' ':      
        if text[-1]==' ':
            return 3
        return 1
    elif text[-1]==' ':
        return 2
    else :
        return 0
    
def from_line_to_list_label(line):
    """Fonction qui permet de sortir les informations des labels (text, label et positions)
    à partir d'une ligne du fichier json"""
    list_word_label=list()
    for i in range(len(eval(line)["labels"])):
        start=eval(line)["labels"][i][0]
        end=eval(line)["labels"][i][1]
        label=eval(line)["labels"][i][2]
        if (clean_text(eval(line.strip().replace('\xa0',' '))["text"][start:end])==0):
            list_word_label.append([eval(line.strip().replace('\xa0',' '))["text"][start:end],label,start,end])
        elif (clean_text(eval(line.strip().replace('\xa0',' '))["text"][start:end])==1):
            list_word_label.append([eval(line.strip().replace('\xa0',' '))["text"][(start+1):end],label,start+1,end])
        elif (clean_text(eval(line.strip().replace('\xa0',' '))["text"][start:end])==2):
            list_word_label.append([eval(line.strip().replace('\xa0',' '))["text"][start:(end-1)],label,start,end-1])
        else:
            list_word_label.append([eval(line.strip().replace('\xa0',' '))["text"][(start+1):(end-1)],label,start+1,end-1])
    return list_word_label

def column_tag(vect_word,list_word_pos_label):
    """Colonne contenant les labels pour chaque mot d'une annonce"""
    list_tag=["O"]*len(vect_word["Pos"])
    for i in range(len(vect_word["Pos"])):
        for elmt in list_word_pos_label:
            if vect_word["Pos"][i][0]==elmt[2] and vect_word["Pos"][i][1]<=elmt[3]:
                list_tag[i]="B-"+elmt[1]
            elif vect_word["Pos"][i][0]>elmt[2] and vect_word["Pos"][i][1]<=elmt[3]:
                list_tag[i]="I-"+elmt[1]
    return list_tag

# Lecture du fichier Json
cnt = 1 # Numéro annonce
for i in range(1,6):
    with open('data\\doccano\\bdd'+str(i)+'.json1', encoding="utf-8") as fp:
        line = fp.readline()
        if i==1:
            df=from_ad_to_dataframe(line.replace('null','"null"'),cnt)
            list_word_pos_label=from_line_to_list_label(line.replace('null','"null"'))
            list_tag=column_tag(df,list_word_pos_label)
            df["Tag"]=list_tag
        else :
            df_ad=from_ad_to_dataframe(line.replace('null','"null"'),cnt)
            list_word_pos_label=from_line_to_list_label(line.replace('null','"null"'))
            list_tag=column_tag(df_ad,list_word_pos_label)
            df_ad["Tag"]=list_tag
            df=df.append(df_ad, ignore_index = True)
        while line:
           #print("Line {}: {}".format(cnt, line.strip()))
            if cnt!=1:
                df_ad=from_ad_to_dataframe(line.replace('null','"null"'),cnt)
                list_word_pos_label=from_line_to_list_label(line.replace('null','"null"'))
                list_tag=column_tag(df_ad,list_word_pos_label)
                df_ad["Tag"]=list_tag
                df=df.append(df_ad, ignore_index = True)
            line = fp.readline()
            cnt += 1

In [3]:
df

Unnamed: 0,Ad#,Words,Pos,Tag
0,1,Situé,"[0, 5]",O
1,1,à,"[6, 7]",O
2,1,6,"[8, 9]",O
3,1,stations,"[10, 18]",O
4,1,de,"[19, 21]",O
...,...,...,...,...
50303,587,inclus,"[187, 193]",O
50304,587,dans,"[194, 198]",O
50305,587,le,"[199, 201]",O
50306,587,loyer,"[202, 207]",O


In [4]:
# function to get vocab, maxvocab
# takes sents : list (tokenized lists of sentences)
# takes maxvocab : int (maximum vocab size incl. UNK, PAD
# takes stoplist : list (words to ignore)
# returns vocab_dict (word to index), inv_vocab_dict (index to word)
def get_vocab(sent_toks, maxvocab=10000, min_count=1, stoplist=[], unk='UNK', pad='PAD', verbose=True):
    # get vocab list
    vocab = [word for sent in sent_toks for word in sent]
    sorted_vocab = sorted(Counter(vocab).most_common(), key=lambda x: x[1], reverse=True)
    sorted_vocab = [i for i in sorted_vocab if i[0] not in stoplist and i[0] != unk]
    if verbose:
        print("total vocab:", len(sorted_vocab))
    sorted_vocab = [i for i in sorted_vocab if i[1] >= min_count]
    # reserve for PAD and UNK
    sorted_vocab = [i[0] for i in sorted_vocab[:maxvocab - 2]]
    vocab_dict = {k: v + 1 for v, k in enumerate(sorted_vocab)}
    vocab_dict[unk] = len(sorted_vocab) + 1
    vocab_dict[pad] = 0
    inv_vocab_dict = {v: k for k, v in vocab_dict.items()}

    return vocab_dict, inv_vocab_dict


# function to convert sents to indexed vectors
# takes list : sents (tokenized sentences)
# takes dict : vocab (word to idx mapping)
# returns list of lists of indexed sentences
def index_sents(sent_tokens, vocab_dict, reverse=False, unk_name='UNK', verbose=False):
    vectors = []
    for sent in sent_tokens:
        sent_vect = []
        if reverse:
            sent = sent[::-1]
        for word in sent:
            if word in vocab_dict.keys():
                sent_vect.append(vocab_dict[word])
            else:  # out of max_vocab range or OOV
                sent_vect.append(vocab_dict[unk_name])
        vectors.append(np.asarray(sent_vect))
    vectors = np.asarray(vectors)
    return vectors

In [5]:
# set maximum network vocabulary, test set size
MAX_VOCAB = 5000
TEST_SIZE = 0.20

### read ConLL2002 NER corpus from csv (first save as utf-8!)

In [6]:
data = df

In [7]:
data

Unnamed: 0,Ad#,Words,Pos,Tag
0,1,Situé,"[0, 5]",O
1,1,à,"[6, 7]",O
2,1,6,"[8, 9]",O
3,1,stations,"[10, 18]",O
4,1,de,"[19, 21]",O
...,...,...,...,...
50303,587,inclus,"[187, 193]",O
50304,587,dans,"[194, 198]",O
50305,587,le,"[199, 201]",O
50306,587,loyer,"[202, 207]",O


In [8]:
sentmarks = data["Ad#"].tolist()
sentmarks = [str(s) for s in sentmarks]
sentmarks

['1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3'

In [9]:
words = data["Words"].tolist()
nertags = data["Tag"].tolist()

In [10]:
sentence_text = []
sentence_ners = []

vocab = []

this_snt = []
this_ner = []

for idx, s in enumerate(sentmarks):
    # reset if new sent
    if int(s) == int(sentmarks[idx-1])+1 or idx==0 or idx==len(data)-1:
        # edit: ONLY IF HAS TAG!
        if len(this_snt) > 0 and (int(s) == int(sentmarks[idx-1])+1 or idx==len(data)-1):
            if list(set(this_ner)) != ['O']:
                if idx==len(data)-1:
                    this_snt.append(words[idx].lower())
                    this_ner.append(nertags[idx])
                sentence_text.append(this_snt[:-1])
                sentence_ners.append(this_ner[:-1])
        this_snt = []
        this_ner = []
    
    # add to lists 
    this_snt.append(words[idx].lower())
    this_ner.append(nertags[idx])
    vocab.append(words[idx].lower())

In [11]:
for idx, sent in enumerate(sentence_text[:2]):
    print(sent)
    print(sentence_ners[idx])
    print('')

['situé', 'à', '6', 'stations', 'de', 'la', 'porte', 'de', 'choisy', 'par', 'le', 'bus', '183', ',', 'station', 'malassis', ',', 'futur', 'tramway', '9', ',', 'à', 'proximité', 'des', 'commerces', ',', 'dans', 'une', 'résidence', 'avec', 'espace', 'vert', ',', 'gardien', ',', 'digicode', ',', 'interphone', ',', 'un', 'appartement', 'de', '3', 'pièces', ',', 'au', '3ème', 'étage', ',', 'comprenant', ':', 'entrée', ',', 'séjour', ',', '2', 'chambres', ',', 'cuisine', ',', 'salle', 'de', 'bains', ',', 'dégagement', 'et', 'wc', '.', 'un', 'parking', 'et', 'une', 'cave', 'au', 'sous-sol', '-', 'chauffage', 'et', 'eau', 'chaude', 'collectif']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TRANSPORTS_PROXIMITE', 'I-TRANSPORTS_PROXIMITE', 'O', 'O', 'O', 'O', 'O', 'B-TRANSPORTS_PROXIMITE', 'I-TRANSPORTS_PROXIMITE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-N_PIECES', 'O', 'O', 'O', 'B-N_ETAGE', 'O', 'O', 'O', 'O'

In [12]:
len(vocab)

50308

In [13]:
import matplotlib.pyplot as plt
list_size=list()
for elt in sentence_text:
    list_size.append(len(elt))
plt.hist(list_size, range = (min(list_size), max(list_size)), bins = 20)

(array([ 13.,  68., 147., 119.,  80.,  58.,  44.,  18.,  10.,  15.,   4.,
          5.,   1.,   0.,   0.,   1.,   0.,   0.,   0.,   1.]),
 array([ 13. ,  30.8,  48.6,  66.4,  84.2, 102. , 119.8, 137.6, 155.4,
        173.2, 191. , 208.8, 226.6, 244.4, 262.2, 280. , 297.8, 315.6,
        333.4, 351.2, 369. ]),
 <a list of 20 Patch objects>)

In [14]:
print(np.mean(list_size))
print(np.quantile(list_size, 0.90))
print(np.quantile(list_size, 0.95))
print(np.quantile(list_size, 0.97))
print(np.quantile(list_size, 0.99))

84.97431506849315
134.70000000000005
164.70000000000005
180.52999999999997
209.33999999999992


## get vocabulary and index inputs

We will index each word from 1 according to inverse frequency (most common word is 1, etc.) until the max-vocab size. We will reserve two slots, 0 for the PAD index, and MAX_VOCAB-1 for out-of-vocabulary or unknown words (UNK).

In [15]:
# text vocab dicts
# subtract 2 for UNK, PAD
word2idx, idx2word = get_vocab(sentence_text, MAX_VOCAB-2)

total vocab: 3413


In [16]:
word2idx['UNK']

3414

In [17]:
# NER tag vocab dicts
ner2idx, idx2ner = get_vocab(sentence_ners, len(set(nertags))+2)

total vocab: 45


In [18]:
# index
sentence_text_idx = index_sents(sentence_text, word2idx)
sentence_ners_idx = index_sents(sentence_ners, ner2idx)

## train-test splitting


In [19]:
indices = [i for i in range(len(sentence_text))]

train_idx, test_idx, y_train_ner, y_test_ner = train_test_split(indices, sentence_ners_idx, test_size=TEST_SIZE)

def get_sublist(lst, indices):
    result = []
    for idx in indices:
        result.append(lst[idx])
    return result

X_train_sents = get_sublist(sentence_text_idx, train_idx)
X_test_sents = get_sublist(sentence_text_idx, test_idx)

In [20]:
# network hyperparameters
MAX_LENGTH = 170 # par rapport au graphique taille
MAX_VOCAB = 5000    
WORDEMBED_SIZE = 300 
HIDDEN_SIZE = 400    # LSTM Nodes/Features/Dimension
BATCH_SIZE = 64
DROPOUTRATE = 0.25
MAX_EPOCHS = 8       #

we must 'pad' our input and output sequences to a fixed length due to Tensorflow's fixed-graph representation.

In [21]:
# zero-pad the sequences to max length
print("zero-padding sequences...\n")
X_train_sents = sequence.pad_sequences(X_train_sents, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_test_sents = sequence.pad_sequences(X_test_sents, maxlen=MAX_LENGTH, truncating='post', padding='post')
y_train_ner = sequence.pad_sequences(y_train_ner, maxlen=MAX_LENGTH, truncating='post', padding='post')
y_test_ner = sequence.pad_sequences(y_test_ner, maxlen=MAX_LENGTH, truncating='post', padding='post')

zero-padding sequences...



In [22]:
X_train_sents[2]

array([  43,  128,  152,   63,  211,  138,   24,  392,   11,  108,   27,
         15,   23,   15,   19,   15,   31,   35,   15,   12,  117,   40,
         15,   10,    2,  118,   15,   13,   88,   15,  199,    4,  131,
         15,    7,   37,    4,    5,   46,   11,   66,    3,   58,   64,
          4,  115,   16,  410,   17,  140,    1,  119,   18,    3,   50,
       3228,  225,  194,    3,   94,    2,  100,    3,  116,  761,   16,
         45,  352,    4,  141,  389,   71,  170,    2,  700,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [23]:
# get the size of ner tags
NER_VOCAB = len(list(idx2ner.keys()))

In [24]:
# reshape data for CRF
y_train_ner = y_train_ner[:, :, np.newaxis]
y_test_ner = y_test_ner[:, :, np.newaxis]

In [25]:
# text layers 
txt_input = Input(shape=(MAX_LENGTH,), name='txt_input')
txt_embed = Embedding(MAX_VOCAB, WORDEMBED_SIZE, input_length=MAX_LENGTH,
                      name='txt_embedding', trainable=True, mask_zero=True)(txt_input)

txt_drpot = Dropout(DROPOUTRATE, name='txt_dropout')(txt_embed)

mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                          name='mrg_bidirectional_1')(txt_drpot)#(mrg_cncat)

# extra LSTM layer, if wanted
mrg_drpot = Dropout(DROPOUTRATE, name='mrg_dropout')(mrg_lstml)
mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                          name='mrg_bidirectional_2')(mrg_lstml)

# final linear chain CRF layer
crf = CRF(NER_VOCAB, sparse_target=True)
mrg_chain = crf(mrg_lstml)

model = Model(inputs=[txt_input], outputs=mrg_chain)

model.compile(optimizer='adam',
              loss=crf.loss_function,
              metrics=[crf.accuracy])





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





In [26]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
txt_input (InputLayer)       (None, 170)               0         
_________________________________________________________________
txt_embedding (Embedding)    (None, 170, 300)          1500000   
_________________________________________________________________
txt_dropout (Dropout)        (None, 170, 300)          0         
_________________________________________________________________
mrg_bidirectional_1 (Bidirec (None, 170, 800)          2243200   
_________________________________________________________________
mrg_bidirectional_2 (Bidirec (None, 170, 800)          3843200   
_________________________________________________________________
crf_1 (CRF)                  (None, 170, 47)           39950     
Total params: 7,626,350
Trainable params: 7,626,350
Non-trainable params: 0
_________________________________________________________________


In [27]:
history = model.fit([X_train_sents], y_train_ner,
                    batch_size=BATCH_SIZE,
                    epochs=MAX_EPOCHS//4,
                    verbose=2)


Epoch 1/2
 - 163s - loss: 8.1182 - crf_viterbi_accuracy: 0.6147
Epoch 2/2
 - 174s - loss: 6.5720 - crf_viterbi_accuracy: 0.8715


In [28]:
hist_dict = history.history

In [29]:
preds = model.predict([X_test_sents])

In [30]:
len(preds[0][0])

47

In [31]:
preds = np.argmax(preds, axis=-1)
preds.shape

(117, 170)

In [32]:
trues = np.squeeze(y_test_ner, axis=-1)
trues.shape

(117, 170)

In [33]:
s_preds = [[idx2ner[t] for t in s] for s in preds]

In [34]:
s_trues = [[idx2ner[t] for t in s] for s in trues]

In [35]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
def bio_classification_report(y_true, y_pred):
    """
    from scrapinghub's python-crfsuite example
    
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O', 'PAD'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [36]:
print(bio_classification_report(s_trues, s_preds))

                          precision    recall  f1-score   support

               B-ADRESSE       0.00      0.00      0.00        38
               I-ADRESSE       0.00      0.00      0.00        82
    B-ANNEE_CONSTRUCTION       0.00      0.00      0.00        14
    I-ANNEE_CONSTRUCTION       0.00      0.00      0.00         6
        B-AVEC_ASCENSEUR       0.00      0.00      0.00        33
        I-AVEC_ASCENSEUR       0.00      0.00      0.00        24
B-CHARGES_LOCATAIRE_MOIS       0.00      0.00      0.00        18
           B-CODE_POSTAL       0.00      0.00      0.00         7
           B-COPROPRIETE       0.00      0.00      0.00        14
            B-DATE_DISPO       0.00      0.00      0.00        36
            I-DATE_DISPO       0.00      0.00      0.00        28
        B-DEPOT_GARANTIE       0.00      0.00      0.00        13
        I-DEPOT_GARANTIE       0.00      0.00      0.00         5
             B-EXTERIEUR       0.00      0.00      0.00        60
         

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [37]:
decoded = []
for sent_idx in range(len(X_test_sents[:500])):
    
    this_txt = sequence.pad_sequences([X_test_sents[sent_idx]], maxlen=MAX_LENGTH, truncating='post', padding='post')
    this_pred = model.predict([this_txt])
    this_pred = [np.argmax(p) for p in this_pred[0]]
    np.shape(this_pred)

    word, tru, prd = [], [], []

    # for each word in the sentence...
    for idx, wordid in enumerate(X_test_sents[sent_idx][:len(this_pred)]):

        # decode word
        word.append(idx2word[wordid])
        # decode true NER tag
        tru.append(idx2ner[int(y_test_ner[sent_idx][idx])])
        # decode prediction
        prd.append(idx2ner[this_pred[idx]])

    answ = pd.DataFrame(
    {
        'word': word,
        'true': tru,
        'pred': prd,
        'skip' : [' ' for s in word]
    })
    answ = answ[['word', 'true', 'pred', 'skip']]
    answ = answ.T
    decoded.append(answ)

In [38]:
result = pd.concat(decoded)
result

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,160,161,162,163,164,165,166,167,168,169
word,``,colocation,acceptée,-,a,5,min,de,la,gare,...,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD
true,O,O,O,O,O,O,O,O,O,B-TRANSPORTS_PROXIMITE,...,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD
pred,O,O,O,O,O,O,O,O,O,O,...,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD
skip,,,,,,,,,,,...,,,,,,,,,,
word,nous,vous,proposons,un,appartement,de,3,pièces,situé,à,...,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
skip,,,,,,,,,,,...,,,,,,,,,,
word,a,10,minutes,en,bus,du,rer,b,robinson,et,...,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD
true,O,O,O,O,O,O,B-TRANSPORTS_PROXIMITE,O,O,O,...,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD
pred,O,O,O,O,O,O,O,O,O,O,...,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD,PAD
