# Obtenção e organização dos dados

In [1]:
import pandas as pd

# Read the semicolon-separated excerpts file, then keep only the two columns
# used by the rest of the notebook: the area label and the excerpt text.
df = pd.read_csv(
    '../dados/jurisprudencia_selecionada_excertos.CSV',
    sep=';',
)
df = df[['DESCR_AREA', 'TEXTO_EXCERTO']]

# Preview the first rows.
df.head()

Unnamed: 0,DESCR_AREA,TEXTO_EXCERTO
0,Responsabilidade,Voto:Cuidam os autos de tomada de contas espec...
1,Finanças Públicas,Voto:Cuidam os autos de Solicitação do Congres...
2,Responsabilidade,Relatório:Trata-se de embargos de declaração o...
3,Direito Processual,Voto:8. Em relação a outros processos judiciai...
4,Pessoal,Voto:11. Relativamente ao ato envolvendo a Sra...


In [2]:
# (rows, columns) of the frame after keeping DESCR_AREA and TEXTO_EXCERTO.
df.shape

(13285, 2)

In [3]:
from sklearn.preprocessing import LabelBinarizer

# Distinct area labels, taken from the group keys of DESCR_AREA.
areas = df.groupby(['DESCR_AREA']).groups.keys()

# Fit the binarizer on the distinct labels (fit returns the estimator itself),
# then encode the label of every row as an indicator vector.
lbArea = LabelBinarizer().fit(list(areas))
y = lbArea.transform(df['DESCR_AREA'])

# Show the learned class order and the shape of the encoded target.
(lbArea.classes_, y.shape)

(array(['Competência do TCU', 'Contrato Administrativo', 'Convênio',
        'Desestatização', 'Direito Processual', 'Finanças Públicas',
        'Gestão Administrativa', 'Licitação', 'Pessoal',
        'Responsabilidade'], dtype='<U23'), (13285, 10))

# Pré-processamento

In [4]:
# Columns of the score table: one per class (in binarizer order) followed by
# the three aggregate entries read from the classification report.
colunas_scores = [*lbArea.classes_, 'accuracy', 'macro avg', 'weighted avg']

In [7]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Announce the step, then load the saved Word2Vec model from disk; its word
# vectors (modelo.wv) are what the embedding-matrix cell looks words up in.
print('\tCarregamento do modelo de acordaos...')
modelo = Word2Vec.load('../vocabularios/modelo-acordaos.w2v')

	Carregamento do modelo de acordaos...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Empty table that the cross-validation cell fills with one row per fold.
df_scores = pd.DataFrame()

# Sequence length used as `maxlen` when padding, and number of columns of
# each embedding row (presumably the vector size of `modelo` -- confirm).
limite_texto = 2000
dim_vetor = 100

print('\tTokenizacao e montagem de sequencias...')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['TEXTO_EXCERTO'])
# One extra row beyond the number of known words; Keras' Tokenizer indexes
# words starting at 1, which leaves row 0 unused by any word.
vocabulario = 1 + len(tokenizer.word_index)

sequences = tokenizer.texts_to_sequences(df['TEXTO_EXCERTO'])

print('\tMontagem da matriz de embeddings...')
embedding_matrix = np.zeros((vocabulario, dim_vetor))
for palavra, indice in tokenizer.word_index.items():
    # Words missing from the Word2Vec vocabulary keep their all-zero row.
    if palavra not in modelo.wv:
        continue
    embedding_matrix[indice] = modelo.wv[palavra]

	Tokenizacao e montagem de sequencias...
	Montagem da matriz de embeddings...


# Treinamentos com Cross-validation

In [9]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, GlobalMaxPooling1D, Flatten, GRU
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint

from keras import backend as K


def _construir_modelo():
    """Build and compile the Embedding -> Conv1D -> MaxPooling1D -> Conv1D -> GRU
    -> Dense(softmax) classifier trained in every fold.

    Reads the notebook-level `vocabulario`, `dim_vetor`, `limite_texto`,
    `embedding_matrix` and `y`. The embedding layer is initialised with
    `embedding_matrix` and left trainable.

    Returns:
        The compiled Keras `Sequential` model.
    """
    model = Sequential()
    model.add(Embedding(vocabulario, dim_vetor, input_length=limite_texto, trainable=True,  weights=[embedding_matrix]))
    model.add(Conv1D(32, 7, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Conv1D(32, 7, activation='relu'))
    model.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(),  metrics=['categorical_accuracy'])
    return model


# Integer id of every class, in the same order as lbArea.classes_. Passing it
# as `labels` keeps `target_names` aligned even if some class does not occur
# in a given validation fold (without it classification_report raises when
# the number of observed classes differs from len(target_names)).
ids_classes = list(range(len(lbArea.classes_)))

kfold = KFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, val_index) in enumerate(kfold.split(df)):
    str_fold = 'Fold ' + str(fold)
    print()
    print()
    print(str_fold + ':')
    print('\tDefinicao de valores de entrada e saida da rede...')

    # KFold.split yields positional indices, so select rows by position
    # (.iloc); .loc would only be correct for a default 0..n-1 index.
    df_train = df.iloc[train_index]
    df_val = df.iloc[val_index]

    sequences_train = tokenizer.texts_to_sequences(df_train['TEXTO_EXCERTO'])
    sequences_val = tokenizer.texts_to_sequences(df_val['TEXTO_EXCERTO'])

    x_train = pad_sequences(sequences_train, maxlen=limite_texto)
    x_val = pad_sequences(sequences_val, maxlen=limite_texto)

    y_train = lbArea.transform(df_train['DESCR_AREA'])
    y_val = lbArea.transform(df_val['DESCR_AREA'])

    print('\tTreinamento da rede...')
    # Drop the previous fold's graph/state before building a fresh model so
    # memory does not grow with every fold.
    K.clear_session()
    model = _construir_modelo()

    # Keep only the weights of the epoch with the best validation accuracy.
    # NOTE(review): the same validation fold is used both to pick the best
    # epoch and to report scores, so the reported scores may be optimistic.
    checkpoint_filename = 'weights' + str(fold) + '.hdf5'
    checkpointer = ModelCheckpoint(filepath=checkpoint_filename, monitor='val_categorical_accuracy', verbose=1, save_best_only=True)
    model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val), verbose=1, shuffle=False, callbacks=[checkpointer])

    print('\n\tAvaliacao do melhor modelo and registro dos scores...')
    model.load_weights(checkpoint_filename)
    # argmax over the softmax output replaces the deprecated predict_classes.
    y_val_pred = np.argmax(model.predict(x_val, verbose=1), axis=-1)
    # Position of the 1 in each indicator row, i.e. the integer class id.
    y_val_i = np.argmax(y_val, axis=1)
    report = classification_report(y_val_i, y_val_pred, labels=ids_classes, target_names=lbArea.classes_, output_dict = True)
    # Computed directly so it does not depend on the report exposing an
    # 'accuracy' entry, which is not guaranteed when `labels` is given.
    accuracy = float(np.mean(y_val_i == y_val_pred))
    for col in colunas_scores:
        if col == 'accuracy':
            f = accuracy
        else:
            f = report[col]['f1-score']
        df_scores.loc[str_fold,col] = f



Fold 0:
	Definicao de valores de entrada e saida da rede...


W0322 13:36:41.725331 140548442859328 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0322 13:36:41.736631 140548442859328 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0322 13:36:41.741533 140548442859328 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0322 13:36:41.781068 140548442859328 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprec

	Treinamento da rede...


W0322 13:36:42.761628 140548442859328 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0322 13:36:42.886315 140548442859328 deprecation.py:506] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0322 13:36:43.323782 140548442859328 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0322 13:36:43.424240 140548442859328 deprecation.py:323] From /home/leonardo/anaconda3/envs/

Train on 11956 samples, validate on 1329 samples
Epoch 1/10

Epoch 00001: val_categorical_accuracy improved from -inf to 0.52897, saving model to weights0.hdf5
Epoch 2/10

Epoch 00002: val_categorical_accuracy improved from 0.52897 to 0.62904, saving model to weights0.hdf5
Epoch 3/10

Epoch 00003: val_categorical_accuracy improved from 0.62904 to 0.67645, saving model to weights0.hdf5
Epoch 4/10

Epoch 00004: val_categorical_accuracy improved from 0.67645 to 0.70128, saving model to weights0.hdf5
Epoch 5/10

Epoch 00005: val_categorical_accuracy improved from 0.70128 to 0.71407, saving model to weights0.hdf5
Epoch 6/10

Epoch 00006: val_categorical_accuracy improved from 0.71407 to 0.73138, saving model to weights0.hdf5
Epoch 7/10

Epoch 00007: val_categorical_accuracy improved from 0.73138 to 0.74567, saving model to weights0.hdf5
Epoch 8/10

Epoch 00008: val_categorical_accuracy improved from 0.74567 to 0.74793, saving model to weights0.hdf5
Epoch 9/10

Epoch 00009: val_categorical_a


Epoch 00009: val_categorical_accuracy did not improve from 0.78555
Epoch 10/10

Epoch 00010: val_categorical_accuracy did not improve from 0.78555

	Avaliacao do melhor modelo and registro dos scores...


Fold 3:
	Definicao de valores de entrada e saida da rede...
	Treinamento da rede...
Train on 11956 samples, validate on 1329 samples
Epoch 1/10

Epoch 00001: val_categorical_accuracy improved from -inf to 0.57261, saving model to weights3.hdf5
Epoch 2/10

Epoch 00002: val_categorical_accuracy improved from 0.57261 to 0.61400, saving model to weights3.hdf5
Epoch 3/10

Epoch 00003: val_categorical_accuracy improved from 0.61400 to 0.64108, saving model to weights3.hdf5
Epoch 4/10

Epoch 00004: val_categorical_accuracy improved from 0.64108 to 0.69752, saving model to weights3.hdf5
Epoch 5/10

Epoch 00005: val_categorical_accuracy improved from 0.69752 to 0.72460, saving model to weights3.hdf5
Epoch 6/10

Epoch 00006: val_categorical_accuracy improved from 0.72460 to 0.74643, saving mod

Epoch 7/10

Epoch 00007: val_categorical_accuracy did not improve from 0.77937
Epoch 8/10

Epoch 00008: val_categorical_accuracy improved from 0.77937 to 0.78313, saving model to weights5.hdf5
Epoch 9/10

Epoch 00009: val_categorical_accuracy improved from 0.78313 to 0.79292, saving model to weights5.hdf5
Epoch 10/10

Epoch 00010: val_categorical_accuracy did not improve from 0.79292

	Avaliacao do melhor modelo and registro dos scores...


Fold 6:
	Definicao de valores de entrada e saida da rede...
	Treinamento da rede...
Train on 11957 samples, validate on 1328 samples
Epoch 1/10

Epoch 00001: val_categorical_accuracy improved from -inf to 0.56401, saving model to weights6.hdf5
Epoch 2/10

Epoch 00002: val_categorical_accuracy improved from 0.56401 to 0.61521, saving model to weights6.hdf5
Epoch 3/10

Epoch 00003: val_categorical_accuracy improved from 0.61521 to 0.68449, saving model to weights6.hdf5
Epoch 4/10

Epoch 00004: val_categorical_accuracy improved from 0.68449 to 0.69654,

Epoch 5/10

Epoch 00005: val_categorical_accuracy did not improve from 0.73720
Epoch 6/10

Epoch 00006: val_categorical_accuracy improved from 0.73720 to 0.74774, saving model to weights8.hdf5
Epoch 7/10

Epoch 00007: val_categorical_accuracy did not improve from 0.74774
Epoch 8/10

Epoch 00008: val_categorical_accuracy improved from 0.74774 to 0.75000, saving model to weights8.hdf5
Epoch 9/10

Epoch 00009: val_categorical_accuracy improved from 0.75000 to 0.76732, saving model to weights8.hdf5
Epoch 10/10

Epoch 00010: val_categorical_accuracy did not improve from 0.76732

	Avaliacao do melhor modelo and registro dos scores...


Fold 9:
	Definicao de valores de entrada e saida da rede...
	Treinamento da rede...
Train on 11957 samples, validate on 1328 samples
Epoch 1/10

Epoch 00001: val_categorical_accuracy improved from -inf to 0.60392, saving model to weights9.hdf5
Epoch 2/10

Epoch 00002: val_categorical_accuracy improved from 0.60392 to 0.67620, saving model to weights9.hdf5
Epoc

  'precision', 'predicted', average, warn_for)


# Exportação dos resultados

In [10]:
# Mean and standard deviation of every score column across the rows of
# df_scores, gathered as the two columns 'mean' and 'std'.
df_medias = pd.DataFrame({
    'mean': df_scores.mean(),
    'std': df_scores.std(),
})

# Display transposed: one row per statistic, one column per score.
df_medias.T

Unnamed: 0,Competência do TCU,Contrato Administrativo,Convênio,Desestatização,Direito Processual,Finanças Públicas,Gestão Administrativa,Licitação,Pessoal,Responsabilidade,accuracy,macro avg,weighted avg
mean,0.500756,0.629869,0.51817,0.252624,0.751895,0.56083,0.584903,0.836726,0.951267,0.739474,0.778774,0.632651,0.773068
std,0.106851,0.04145,0.054284,0.160289,0.019285,0.053011,0.100507,0.019427,0.009269,0.027705,0.015527,0.024342,0.015854


In [11]:
# Write the transposed summary (rows 'mean' and 'std') to a Latin-1 encoded CSV
# in the current working directory.
df_medias.T.to_csv('scores_excerto_original.csv', encoding = 'Latin1')