# Importação e preparação dos dados

In [1]:
import pandas as pd

# Load the pre-filtered excerpts; the file is pipe-delimited, not comma-delimited.
df = pd.read_csv(
    '../dados/excertos_filtrados500.csv',
    sep='|',
)
# Preview the first rows as a quick sanity check of the columns.
df.head()

Unnamed: 0,COD,DESCR_AREA,filtrado
0,1400,Responsabilidade,voto cuidar auto tomada conta especial instaur...
1,1700,Finanças Públicas,voto cuidar auto solicitação congresso naciona...
2,5700,Responsabilidade,relatório tratar embargo declaração opor exemp...
3,284,Direito Processual,voto relação outro processo judiciais tratar r...
4,298,Pessoal,voto relativo ato envolver senhor caber rememo...


In [2]:
# Show (number of rows, number of columns) of the loaded frame.
df.shape

(13285, 3)

In [3]:
from sklearn.preprocessing import LabelBinarizer

# Distinct area labels, taken as the keys of the per-area row groups.
areas = df.groupby(['DESCR_AREA']).groups.keys()

# Fit the binarizer on those labels (fit() hands back the estimator itself).
lbArea = LabelBinarizer().fit(list(areas))

# Encode every row's area with the fitted binarizer and check the result shape.
y = lbArea.transform(df['DESCR_AREA'])
y.shape

(13285, 10)

In [4]:
from keras.preprocessing.text import Tokenizer
import numpy as np

# Text-vectorisation settings.
max_palavras = 30000   # upper bound on the words the tokenizer keeps (num_words)
limite_texto = 500     # sequence length used later when padding/truncating
dim_vetor = 100        # width of each embedding vector

# Convert the text column to str once and reuse it for both fitting and encoding.
# NOTE(review): astype(str) turns missing values into the literal text 'nan' —
# confirm the column has no NaNs.
textos = df['filtrado'].astype(str)

tokenizer = Tokenizer(num_words=max_palavras)
tokenizer.fit_on_texts(textos)

# Each text becomes a list of integer word indices.
sequences = tokenizer.texts_to_sequences(textos)

word_index = tokenizer.word_index
# Number of rows needed by the embedding matrix: one per known word plus one
# extra row, since Keras word indices start at 1 and index 0 is left unassigned.
# Kept under its own name so the configured cap above is not overwritten.
vocabulario = len(word_index) + 1
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Found 22972 unique tokens.


In [5]:
from keras.preprocessing.sequence import pad_sequences

# Bring every sequence to exactly `limite_texto` indices. The padding and
# truncating modes are spelled out with their default values: shorter texts are
# zero-padded at the front, longer texts keep only their last tokens.
x = pad_sequences(
    sequences,
    maxlen=limite_texto,
    padding='pre',
    truncating='pre',
)

print('Shape of data tensor:', x.shape)

Shape of data tensor: (13285, 500)


In [6]:
from gensim.models import Word2Vec

# Load the previously trained Word2Vec model from disk; its vectors seed the
# embedding matrix built in the next cell.
# NOTE(review): the name `model` is rebound to Keras models in the training
# cells further down, so this cell must be re-run before rebuilding the matrix.
model = Word2Vec.load('../vocabularios/modelo-acordaos.w2v')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [7]:
# Build the initial weight matrix for the Embedding layer: row i receives the
# pre-trained Word2Vec vector of the word whose tokenizer index is i. Rows for
# words missing from the Word2Vec vocabulary stay all-zero.
embedding_matrix = np.zeros((vocabulario, dim_vetor))

# Look words up through the KeyedVectors object (`model.wv`). Membership tests
# and indexing directly on the Word2Vec model are deprecated and were removed
# in gensim 4.
vetores = model.wv

# Use the real vocabulary size instead of relying on the loop variable leaking
# out of the loop (which is undefined when the vocabulary is empty).
total = len(tokenizer.word_index)

ok = 0
for word, i in tokenizer.word_index.items():
    if word in vetores:
        embedding_matrix[i] = vetores[word]
        ok += 1

# Coverage report; guard the percentage against an empty vocabulary.
cobertura = (ok * 100. / total) if total else 0.0
print('Vocabulario:', total)
print('Encontrados no modelo:', ok, '=', cobertura)

Vocabulario: 22972
Encontrados no modelo: 18860 = 82.09994776249347


  import sys
  


# Treinamento

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LSTM, Bidirectional
from keras.optimizers import RMSprop
from keras.layers.core import Dropout

# Baseline: frozen pre-trained embeddings -> GRU(256) -> softmax over the areas.
# NOTE(review): this rebinds `model`, which until now held the Word2Vec model
# loaded earlier; the embedding-matrix cell cannot be re-run after this one
# without reloading it.
model = Sequential([
    Embedding(
        vocabulario,
        dim_vetor,
        weights=[embedding_matrix],
        input_length=limite_texto,
        trainable=False,
    ),
    GRU(256, dropout=0.2, recurrent_dropout=0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs with 20% of the samples held out for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

W1127 13:58:04.142743 140001869367104 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1127 13:58:04.210463 140001869367104 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1127 13:58:04.212082 140001869367104 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1127 13:58:04.219673 140001869367104 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprec

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          2297300   
_________________________________________________________________
gru_1 (GRU)                  (None, 256)               274176    
_________________________________________________________________
dense_1 (Dense)              (None, 10)                2570      
Total params: 2,574,046
Trainable params: 276,746
Non-trainable params: 2,297,300
_________________________________________________________________
Train on 10628 samples, validate on 2657 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [9]:
# Variant: trainable embeddings -> GRU(256) -> Dense(64, relu) -> Dropout(0.2) -> softmax.
model = Sequential([
    Embedding(
        vocabulario,
        dim_vetor,
        weights=[embedding_matrix],
        input_length=limite_texto,
        trainable=True,
    ),
    GRU(256, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs with 20% of the samples held out for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 100)          2297300   
_________________________________________________________________
gru_2 (GRU)                  (None, 256)               274176    
_________________________________________________________________
dense_2 (Dense)              (None, 64)                16448     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                650       
Total params: 2,588,574
Trainable params: 2,588,574
Non-trainable params: 0
_________________________________________________________________
Train on 10628 samples, validate on 2657 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 

In [10]:
# Variant: trainable embeddings -> GRU(256) -> Dense(32, relu) -> Dropout(0.2) -> softmax.
model = Sequential([
    Embedding(
        vocabulario,
        dim_vetor,
        weights=[embedding_matrix],
        input_length=limite_texto,
        trainable=True,
    ),
    GRU(256, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs with 20% of the samples held out for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 100)          2297300   
_________________________________________________________________
gru_3 (GRU)                  (None, 256)               274176    
_________________________________________________________________
dense_4 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                330       
Total params: 2,580,030
Trainable params: 2,580,030
Non-trainable params: 0
_________________________________________________________________
Train on 10628 samples, validate on 2657 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 

In [11]:
# Variant: trainable embeddings -> GRU(256) -> softmax (no intermediate dense layer).
model = Sequential([
    Embedding(
        vocabulario,
        dim_vetor,
        weights=[embedding_matrix],
        input_length=limite_texto,
        trainable=True,
    ),
    GRU(256, dropout=0.2, recurrent_dropout=0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs with 20% of the samples held out for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 100)          2297300   
_________________________________________________________________
gru_4 (GRU)                  (None, 256)               274176    
_________________________________________________________________
dense_6 (Dense)              (None, 10)                2570      
Total params: 2,574,046
Trainable params: 2,574,046
Non-trainable params: 0
_________________________________________________________________
Train on 10628 samples, validate on 2657 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [12]:
# Variant: trainable embeddings -> stacked GRUs (256 returning the full
# sequence, then 64) -> softmax.
model = Sequential([
    Embedding(
        vocabulario,
        dim_vetor,
        weights=[embedding_matrix],
        input_length=limite_texto,
        trainable=True,
    ),
    # The first GRU must emit one output per timestep so the second can consume it.
    GRU(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    GRU(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs with 20% of the samples held out for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 100)          2297300   
_________________________________________________________________
gru_5 (GRU)                  (None, 500, 256)          274176    
_________________________________________________________________
gru_6 (GRU)                  (None, 64)                61632     
_________________________________________________________________
dense_7 (Dense)              (None, 10)                650       
Total params: 2,633,758
Trainable params: 2,633,758
Non-trainable params: 0
_________________________________________________________________
Train on 10628 samples, validate on 2657 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19

In [13]:
# Variant: trainable embeddings -> GRU(512) -> Dense(256, relu) -> Dropout(0.2) -> softmax.
model = Sequential([
    Embedding(
        vocabulario,
        dim_vetor,
        weights=[embedding_matrix],
        input_length=limite_texto,
        trainable=True,
    ),
    GRU(512, dropout=0.2, recurrent_dropout=0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs with 20% of the samples held out for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 500, 100)          2297300   
_________________________________________________________________
gru_7 (GRU)                  (None, 512)               941568    
_________________________________________________________________
dense_8 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 10)                2570      
Total params: 3,372,766
Trainable params: 3,372,766
Non-trainable params: 0
_________________________________________________________________
Train on 10628 samples, validate on 2657 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 

In [14]:
# Variant: trainable embeddings -> GRU(512) -> Dense(128, relu) -> Dropout(0.2) -> softmax.
model = Sequential([
    Embedding(
        vocabulario,
        dim_vetor,
        weights=[embedding_matrix],
        input_length=limite_texto,
        trainable=True,
    ),
    GRU(512, dropout=0.2, recurrent_dropout=0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs with 20% of the samples held out for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 500, 100)          2297300   
_________________________________________________________________
gru_8 (GRU)                  (None, 512)               941568    
_________________________________________________________________
dense_10 (Dense)             (None, 128)               65664     
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 10)                1290      
Total params: 3,305,822
Trainable params: 3,305,822
Non-trainable params: 0
_________________________________________________________________
Train on 10628 samples, validate on 2657 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 

In [15]:
# Variant: trainable embeddings -> GRU(512) -> Dense(64, relu) -> Dropout(0.2) -> softmax.
model = Sequential([
    Embedding(
        vocabulario,
        dim_vetor,
        weights=[embedding_matrix],
        input_length=limite_texto,
        trainable=True,
    ),
    GRU(512, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs with 20% of the samples held out for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 500, 100)          2297300   
_________________________________________________________________
gru_9 (GRU)                  (None, 512)               941568    
_________________________________________________________________
dense_12 (Dense)             (None, 64)                32832     
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 10)                650       
Total params: 3,272,350
Trainable params: 3,272,350
Non-trainable params: 0
_________________________________________________________________
Train on 10628 samples, validate on 2657 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 

In [16]:
# Variant: trainable embeddings -> GRU(512) -> softmax (no intermediate dense layer).
model = Sequential([
    Embedding(
        vocabulario,
        dim_vetor,
        weights=[embedding_matrix],
        input_length=limite_texto,
        trainable=True,
    ),
    GRU(512, dropout=0.2, recurrent_dropout=0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs with 20% of the samples held out for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 500, 100)          2297300   
_________________________________________________________________
gru_10 (GRU)                 (None, 512)               941568    
_________________________________________________________________
dense_14 (Dense)             (None, 10)                5130      
Total params: 3,243,998
Trainable params: 3,243,998
Non-trainable params: 0
_________________________________________________________________
Train on 10628 samples, validate on 2657 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [17]:
# Variant: trainable embeddings -> stacked GRUs (512 returning the full
# sequence, then 128) -> softmax.
model = Sequential([
    Embedding(
        vocabulario,
        dim_vetor,
        weights=[embedding_matrix],
        input_length=limite_texto,
        trainable=True,
    ),
    # The first GRU must emit one output per timestep so the second can consume it.
    GRU(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    GRU(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs with 20% of the samples held out for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 500, 100)          2297300   
_________________________________________________________________
gru_11 (GRU)                 (None, 500, 512)          941568    
_________________________________________________________________
gru_12 (GRU)                 (None, 128)               246144    
_________________________________________________________________
dense_15 (Dense)             (None, 10)                1290      
Total params: 3,486,302
Trainable params: 3,486,302
Non-trainable params: 0
_________________________________________________________________
Train on 10628 samples, validate on 2657 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19

In [18]:
# Variant: trainable embeddings -> LSTM(512) -> softmax.
model = Sequential([
    Embedding(
        vocabulario,
        dim_vetor,
        weights=[embedding_matrix],
        input_length=limite_texto,
        trainable=True,
    ),
    LSTM(512, dropout=0.2, recurrent_dropout=0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs with 20% of the samples held out for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 500, 100)          2297300   
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               1255424   
_________________________________________________________________
dense_16 (Dense)             (None, 10)                5130      
Total params: 3,557,854
Trainable params: 3,557,854
Non-trainable params: 0
_________________________________________________________________
Train on 10628 samples, validate on 2657 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [19]:
# Variant: trainable embeddings -> bidirectional LSTM(512) -> softmax.
model = Sequential([
    Embedding(
        vocabulario,
        dim_vetor,
        weights=[embedding_matrix],
        input_length=limite_texto,
        trainable=True,
    ),
    Bidirectional(LSTM(512, dropout=0.2, recurrent_dropout=0.2)),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs with 20% of the samples held out for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 500, 100)          2297300   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1024)              2510848   
_________________________________________________________________
dense_17 (Dense)             (None, 10)                10250     
Total params: 4,818,398
Trainable params: 4,818,398
Non-trainable params: 0
_________________________________________________________________
Train on 10628 samples, validate on 2657 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
