# Importação e preparação dos dados

In [1]:
import pandas as pd

# Load the pipe-delimited CSV of selected rulings and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='../dados/acordaos-selecionada.csv',
    sep='|',
)
df.head()

Unnamed: 0,acordao,arquivo,texto,areas
0,297/2016-P,547240.txt,TRIBUNAL DE CONTAS DA UNIÃO\tTC 010.084/2015-0...,Responsabilidade
1,366/2016-P,549518.txt,TRIBUNAL DE CONTAS DA UNIÃO\tTC 005.933/2014-5...,Finanças Públicas
2,944/2016-P,554399.txt,TRIBUNAL DE CONTAS DA UNIÃO\tTC 042.038/2012-0...,Responsabilidade
3,30/2016-P,545010.txt,TRIBUNAL DE CONTAS DA UNIÃO\tTC 000.742/2014-7...,Direito Processual
4,55/2016-P,544046.txt,;-;;Wania Lucia Pasquarelli do NascimentoTCUWa...,Pessoal


In [2]:
# Sanity check: (rows, columns) of the frame loaded from the CSV.
df.shape

(10524, 4)

In [3]:
from sklearn.preprocessing import LabelBinarizer

# Distinct, sorted class labels taken straight from the column.
# This replaces `df.groupby(['areas']).groups.keys()`, which built a full
# row-index map per group only to read its keys; grouping by a one-element
# list is also the form for which newer pandas releases may return tuple
# keys (NOTE(review): confirm against the pandas version in use).
# Missing values are dropped, matching groupby's default behaviour.
areas = sorted(df['areas'].dropna().unique())

# Fit the binarizer on the label set, then encode every row as an indicator
# vector with one column per class.
lbArea = LabelBinarizer()
lbArea.fit(areas)
y = lbArea.transform(df['areas'])
y.shape

(10524, 91)

In [4]:
from keras.preprocessing.text import Tokenizer
import numpy as np

# Text-preparation hyper-parameters.
vocabulario = 350000   # cap on the number of word indices the tokenizer may emit
limite_texto = 40000   # tokens kept per document once sequences are padded/truncated
dim_vetor = 100        # size of each embedding vector

# Build the word -> index mapping from the full corpus.
tokenizer = Tokenizer(num_words=vocabulario)
tokenizer.fit_on_texts(df['texto'])

# Convert every document into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(df['texto'])

word_index = tokenizer.word_index
# Size of the embedding table. `word_index` lists every word seen, while
# `texts_to_sequences` only emits indices below `num_words`; index 0 is
# reserved (never assigned to a word). Taking the minimum keeps the table
# from being sized for words that can never appear in `sequences` when the
# cap is lower than the corpus vocabulary. With the current data the result
# is still len(word_index) + 1.
vocabulario = min(vocabulario, len(word_index) + 1)
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Found 344296 unique tokens.


In [5]:
from keras.preprocessing.sequence import pad_sequences

# Bring every document to exactly `limite_texto` token ids. The padding and
# truncating modes are the library defaults ('pre'), written out explicitly:
# short documents are left-padded with zeros and long ones lose their
# leading tokens.
x = pad_sequences(
    sequences=sequences,
    maxlen=limite_texto,
    padding='pre',
    truncating='pre',
)

print('Shape of data tensor:', x.shape)

Shape of data tensor: (10524, 40000)


In [6]:
# Confirm the padded inputs and the binarized labels have the same number of rows.
x.shape, y.shape

((10524, 40000), (10524, 91))

# Treinamento

In [7]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, GlobalMaxPooling1D, Flatten
from keras.optimizers import RMSprop
from keras.layers.core import Dropout

In [8]:
# Experiment 1: a single convolution block followed by a dense classifier.
model = Sequential([
    # Learn a `dim_vetor`-sized vector for every token id in the padded input.
    Embedding(input_dim=vocabulario, output_dim=dim_vetor, input_length=x.shape[1]),
    Conv1D(filters=64, kernel_size=7, activation='relu'),
    MaxPooling1D(pool_size=5),
    Flatten(),
    Dense(units=256, activation='relu'),
    Dropout(rate=0.2),
    # One softmax output per class column in `y`.
    Dense(units=y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs, holding out 20% of the samples for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

W0309 21:10:32.963485 140257104439104 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0309 21:10:32.996397 140257104439104 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0309 21:10:33.011677 140257104439104 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0309 21:10:33.038041 140257104439104 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. P

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40000, 100)        34429700  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 39994, 64)         44864     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 7998, 64)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 511872)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131039488 
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 91)                23387     
Total para

In [9]:
# Experiment 2: two convolution blocks and a deeper dense head.
model = Sequential([
    # Learn a `dim_vetor`-sized vector for every token id in the padded input.
    Embedding(input_dim=vocabulario, output_dim=dim_vetor, input_length=x.shape[1]),
    Conv1D(filters=64, kernel_size=7, activation='relu'),
    MaxPooling1D(pool_size=5),
    Conv1D(filters=32, kernel_size=7, activation='relu'),
    MaxPooling1D(pool_size=5),
    Flatten(),
    Dense(units=1024, activation='relu'),
    Dropout(rate=0.2),
    Dense(units=128, activation='relu'),
    Dropout(rate=0.2),
    # One softmax output per class column in `y`.
    Dense(units=y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs, holding out 20% of the samples for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 40000, 100)        34429700  
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 39994, 64)         44864     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 7998, 64)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 7992, 32)          14368     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 1598, 32)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 51136)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 1024)              52364288  
__________

In [10]:
# Experiment 3: narrower convolutions, a smaller dense head and an explicit
# RMSprop learning rate of 5e-3.
model = Sequential([
    # Learn a `dim_vetor`-sized vector for every token id in the padded input.
    Embedding(input_dim=vocabulario, output_dim=dim_vetor, input_length=x.shape[1]),
    Conv1D(filters=32, kernel_size=7, activation='relu'),
    MaxPooling1D(pool_size=5),
    Conv1D(filters=32, kernel_size=7, activation='relu'),
    # Library-default pool size for the second pooling stage.
    MaxPooling1D(),
    Flatten(),
    Dense(units=256, activation='relu'),
    Dropout(rate=0.2),
    # One softmax output per class column in `y`.
    Dense(units=y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(lr=5e-3),
    metrics=["categorical_accuracy"],
)
model.summary()

# Train for 20 epochs, holding out 20% of the samples for validation.
history = model.fit(
    x,
    y,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 40000, 100)        34429700  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 39994, 32)         22432     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 7998, 32)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 7992, 32)          7200      
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 3996, 32)          0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 127872)            0         
_________________________________________________________________
dense_6 (Dense)              (None, 256)               32735488  
__________

KeyboardInterrupt: 