# Obtenção e organização dos dados

In [1]:
import pandas as pd

# Read the pipe-separated rulings file, keep only the identifier, the label
# column and the pre-filtered text, and force the text column to str so the
# tokenizer always receives strings.
df = (
    pd.read_csv('../dados/acordaos-unicos-filtrados-6000.csv', sep='|')
    .loc[:, ['acordao', 'areas', 'filtrado_6000']]
    .astype({'filtrado_6000': str})
)
df.head()

Unnamed: 0,acordao,areas,filtrado_6000
0,297/2016-P,Responsabilidade,tribunal conta união tc grupo classe plenário ...
1,366/2016-P,Finanças Públicas,tribunal conta união tc grupo classe ii plenár...
2,944/2016-P,Responsabilidade,tribunal conta união tc grupo classe plenário ...
3,30/2016-P,Direito Processual,tribunal conta união tc grupo classe plenário ...
4,55/2016-P,Pessoal,wania lucia pasquarelli nascimentotcuwania luc...


In [2]:
# Sanity check: display the (rows, columns) of the loaded frame.
df.shape

(9739, 3)

In [3]:
from sklearn.preprocessing import LabelBinarizer

# Distinct values of the 'areas' column, used as the label set.
# A scalar grouper is used instead of a one-element list: it yields the same
# plain string keys here, whereas newer pandas releases are moving list
# groupers toward tuple keys, which would corrupt the labels handed to the
# binarizer.
areas = df.groupby('areas').groups.keys()

# Fit the binarizer on the label set, then one-hot encode every row's label.
lbArea = LabelBinarizer()
lbArea.fit(list(areas))
y = lbArea.transform(df['areas'])

# Display the learned class order and the shape of the encoded target matrix.
lbArea.classes_, y.shape

(array(['Competência do TCU', 'Contrato Administrativo', 'Convênio',
        'Desestatização', 'Direito Processual', 'Finanças Públicas',
        'Gestão Administrativa', 'Licitação', 'Pessoal',
        'Responsabilidade'], dtype='<U23'), (9739, 10))

# Pré-processamento

In [4]:
# Columns of the per-fold score table: one per class (in binarizer order)
# followed by the aggregate entries of the classification report.
colunas_scores = [*lbArea.classes_, 'accuracy', 'macro avg', 'weighted avg']

In [5]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Load the previously saved Word2Vec model from disk.
caminho_modelo = '../vocabularios/modelo-acordaos-50.w2v'
print('\tCarregamento do modelo de acordaos...')
modelo = Word2Vec.load(caminho_modelo)

	Carregamento do modelo de acordaos...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Table that the cross-validation loop fills with one row of scores per fold.
df_scores = pd.DataFrame()

# Sequence length used for padding and width of each embedding vector.
limite_texto = 6000
dim_vetor = 50

print('\tTokenizacao e montagem de sequencias...')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['filtrado_6000'])
# One extra slot because tokenizer indices start at 1 (row 0 stays all zeros).
vocabulario = 1 + len(tokenizer.word_index)

sequences = tokenizer.texts_to_sequences(df['filtrado_6000'])

print('\tMontagem da matriz de embeddings...')
embedding_matrix = np.zeros((vocabulario, dim_vetor))
vetores = modelo.wv
for palavra, indice in tokenizer.word_index.items():
    # Words missing from the Word2Vec vocabulary keep their zero row.
    if palavra not in vetores:
        continue
    embedding_matrix[indice] = vetores[palavra]

Using TensorFlow backend.


	Tokenizacao e montagem de sequencias...
	Montagem da matriz de embeddings...


# Treinamentos com Cross-validation

In [7]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, GlobalMaxPooling1D, Flatten, GRU
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint

# Keras backend handle, used below to reset graph/session state between folds.
from keras import backend as K

# 10-fold cross-validation of an Embedding -> GRU -> softmax classifier.
# For every fold: build a fresh model, train for 10 epochs while checkpointing
# the weights with the best validation accuracy, reload those weights, and
# store the per-class F1 scores plus the aggregate scores in `df_scores`.
# NOTE(review): the same validation fold is used both to pick the best epoch
# and to report the scores, so the reported numbers may be optimistic.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

# Integer ids of every class, in the binarizer's column order.
indices_classes = np.arange(len(lbArea.classes_))

for fold, (train_index, val_index) in enumerate(kfold.split(df)):
    str_fold = 'Fold ' + str(fold)
    print()
    print()
    print(str_fold + ':')
    print('\tDefinicao de valores de entrada e saida da rede...')

    # Drop the previous fold's model from the backend so graph/session state
    # does not accumulate across the ten models built by this loop.
    K.clear_session()

    # KFold yields positional indices, so select rows by position (iloc),
    # which stays correct even if df does not carry a default RangeIndex.
    df_train = df.iloc[train_index]
    df_val = df.iloc[val_index]

    sequences_train = tokenizer.texts_to_sequences(df_train['filtrado_6000'])
    sequences_val = tokenizer.texts_to_sequences(df_val['filtrado_6000'])

    x_train = pad_sequences(sequences_train, maxlen=limite_texto)
    x_val = pad_sequences(sequences_val, maxlen=limite_texto)

    y_train = lbArea.transform(df_train['areas'])
    y_val = lbArea.transform(df_val['areas'])

    print('\tTreinamento da rede...')
    model = Sequential()
    # Embedding layer starts from the Word2Vec vectors and is fine-tuned.
    model.add(Embedding(vocabulario, dim_vetor, input_length=limite_texto, trainable=True, weights=[embedding_matrix]))
    model.add(GRU(256, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['categorical_accuracy'])

    # Keep only the weights of the epoch with the best validation accuracy.
    checkpoint_filename = 'weights' + str(fold) + '.hdf5'
    checkpointer = ModelCheckpoint(filepath=checkpoint_filename, monitor='val_categorical_accuracy', verbose=1, save_best_only=True)
    model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val), verbose=1, shuffle=False, callbacks=[checkpointer])

    print('\n\tAvaliacao do melhor modelo and registro dos scores...')
    model.load_weights(checkpoint_filename)

    # Class id = position of the largest softmax output / of the one-hot 1.
    # argmax over predict() replaces Sequential.predict_classes, which newer
    # Keras versions no longer provide.
    y_val_pred = np.argmax(model.predict(x_val, verbose=1), axis=1)
    y_val_i = np.argmax(y_val, axis=1)

    # Passing `labels` keeps the report aligned with `target_names` even when
    # a rare class is absent from both the truth and the predictions of a fold.
    report = classification_report(
        y_val_i,
        y_val_pred,
        labels=indices_classes,
        target_names=lbArea.classes_,
        output_dict=True,
    )

    # Accuracy is computed directly: some scikit-learn releases report
    # 'micro avg' instead of 'accuracy' when `labels` is given explicitly.
    acuracia = float(np.mean(y_val_pred == y_val_i))

    for col in colunas_scores:
        if col == 'accuracy':
            f = acuracia
        else:
            f = report[col]['f1-score']
        df_scores.loc[str_fold, col] = f



Fold 0:
	Definicao de valores de entrada e saida da rede...


W0319 17:46:58.701393 139883022051136 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0319 17:46:58.712806 139883022051136 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0319 17:46:58.714306 139883022051136 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0319 17:46:58.845228 139883022051136 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprec

	Treinamento da rede...


W0319 17:46:59.947525 139883022051136 deprecation.py:506] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0319 17:47:00.149991 139883022051136 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0319 17:47:00.672403 139883022051136 deprecation.py:323] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0,

Train on 8765 samples, validate on 974 samples
Epoch 1/10

Epoch 00001: val_categorical_accuracy improved from -inf to 0.28953, saving model to weights0.hdf5
Epoch 2/10

Epoch 00002: val_categorical_accuracy improved from 0.28953 to 0.29877, saving model to weights0.hdf5
Epoch 3/10

Epoch 00003: val_categorical_accuracy improved from 0.29877 to 0.34805, saving model to weights0.hdf5
Epoch 4/10

Epoch 00004: val_categorical_accuracy improved from 0.34805 to 0.44353, saving model to weights0.hdf5
Epoch 5/10

Epoch 00005: val_categorical_accuracy improved from 0.44353 to 0.57187, saving model to weights0.hdf5
Epoch 6/10

Epoch 00006: val_categorical_accuracy improved from 0.57187 to 0.61602, saving model to weights0.hdf5
Epoch 7/10

Epoch 00007: val_categorical_accuracy improved from 0.61602 to 0.63142, saving model to weights0.hdf5
Epoch 8/10

Epoch 00008: val_categorical_accuracy improved from 0.63142 to 0.63142, saving model to weights0.hdf5
Epoch 9/10

Epoch 00009: val_categorical_acc

  'precision', 'predicted', average, warn_for)


	Treinamento da rede...
Train on 8765 samples, validate on 974 samples
Epoch 1/10

Epoch 00001: val_categorical_accuracy improved from -inf to 0.29671, saving model to weights1.hdf5
Epoch 2/10

Epoch 00002: val_categorical_accuracy improved from 0.29671 to 0.31930, saving model to weights1.hdf5
Epoch 3/10

Epoch 00003: val_categorical_accuracy improved from 0.31930 to 0.32238, saving model to weights1.hdf5
Epoch 4/10

Epoch 00004: val_categorical_accuracy improved from 0.32238 to 0.36448, saving model to weights1.hdf5
Epoch 5/10

Epoch 00005: val_categorical_accuracy improved from 0.36448 to 0.50205, saving model to weights1.hdf5
Epoch 6/10

Epoch 00006: val_categorical_accuracy improved from 0.50205 to 0.57803, saving model to weights1.hdf5
Epoch 7/10

Epoch 00007: val_categorical_accuracy improved from 0.57803 to 0.63963, saving model to weights1.hdf5
Epoch 8/10

Epoch 00008: val_categorical_accuracy improved from 0.63963 to 0.65092, saving model to weights1.hdf5
Epoch 9/10

Epoch 00

Epoch 9/10

Epoch 00009: val_categorical_accuracy improved from 0.64374 to 0.65708, saving model to weights3.hdf5
Epoch 10/10

Epoch 00010: val_categorical_accuracy improved from 0.65708 to 0.66940, saving model to weights3.hdf5

	Avaliacao do melhor modelo and registro dos scores...


Fold 4:
	Definicao de valores de entrada e saida da rede...


  'precision', 'predicted', average, warn_for)


	Treinamento da rede...
Train on 8765 samples, validate on 974 samples
Epoch 1/10

Epoch 00001: val_categorical_accuracy improved from -inf to 0.27515, saving model to weights4.hdf5
Epoch 2/10

Epoch 00002: val_categorical_accuracy improved from 0.27515 to 0.30390, saving model to weights4.hdf5
Epoch 3/10

Epoch 00003: val_categorical_accuracy improved from 0.30390 to 0.31725, saving model to weights4.hdf5
Epoch 4/10

Epoch 00004: val_categorical_accuracy improved from 0.31725 to 0.39528, saving model to weights4.hdf5
Epoch 5/10

Epoch 00005: val_categorical_accuracy improved from 0.39528 to 0.55236, saving model to weights4.hdf5
Epoch 6/10

Epoch 00006: val_categorical_accuracy improved from 0.55236 to 0.59959, saving model to weights4.hdf5
Epoch 7/10

Epoch 00007: val_categorical_accuracy improved from 0.59959 to 0.62628, saving model to weights4.hdf5
Epoch 8/10

Epoch 00008: val_categorical_accuracy improved from 0.62628 to 0.64066, saving model to weights4.hdf5
Epoch 9/10

Epoch 00

Epoch 9/10

Epoch 00009: val_categorical_accuracy improved from 0.65298 to 0.66016, saving model to weights6.hdf5
Epoch 10/10

Epoch 00010: val_categorical_accuracy improved from 0.66016 to 0.66940, saving model to weights6.hdf5

	Avaliacao do melhor modelo and registro dos scores...


Fold 7:
	Definicao de valores de entrada e saida da rede...
	Treinamento da rede...
Train on 8765 samples, validate on 974 samples
Epoch 1/10

Epoch 00001: val_categorical_accuracy improved from -inf to 0.28645, saving model to weights7.hdf5
Epoch 2/10

Epoch 00002: val_categorical_accuracy improved from 0.28645 to 0.30903, saving model to weights7.hdf5
Epoch 3/10

Epoch 00003: val_categorical_accuracy improved from 0.30903 to 0.33778, saving model to weights7.hdf5
Epoch 4/10

Epoch 00004: val_categorical_accuracy improved from 0.33778 to 0.40041, saving model to weights7.hdf5
Epoch 5/10

Epoch 00005: val_categorical_accuracy improved from 0.40041 to 0.53183, saving model to weights7.hdf5
Epoch 6/10

Epo


Epoch 00006: val_categorical_accuracy improved from 0.60946 to 0.63104, saving model to weights9.hdf5
Epoch 7/10

Epoch 00007: val_categorical_accuracy improved from 0.63104 to 0.63926, saving model to weights9.hdf5
Epoch 8/10

Epoch 00008: val_categorical_accuracy improved from 0.63926 to 0.65262, saving model to weights9.hdf5
Epoch 9/10

Epoch 00009: val_categorical_accuracy improved from 0.65262 to 0.67112, saving model to weights9.hdf5
Epoch 10/10

Epoch 00010: val_categorical_accuracy improved from 0.67112 to 0.67318, saving model to weights9.hdf5

	Avaliacao do melhor modelo and registro dos scores...


# Exportação dos resultados

In [8]:
# Summarise the per-fold scores: mean and standard deviation of every score
# column, shown with the statistics as rows.
df_medias = pd.DataFrame({
    'mean': df_scores.mean(),
    'std': df_scores.std(),
})
df_medias.T

Unnamed: 0,Competência do TCU,Contrato Administrativo,Convênio,Desestatização,Direito Processual,Finanças Públicas,Gestão Administrativa,Licitação,Pessoal,Responsabilidade,accuracy,macro avg,weighted avg
mean,0.036714,0.232627,0.099641,0.290263,0.549983,0.38059,0.120238,0.758794,0.938752,0.576538,0.667523,0.398414,0.634152
std,0.050664,0.095432,0.055103,0.161434,0.034867,0.05274,0.093244,0.016677,0.006688,0.045975,0.013289,0.029673,0.013404


In [9]:
# Write the transposed summary (statistics as rows) to a Latin-1 encoded CSV.
df_medias.transpose().to_csv('scores_recorrente.csv', encoding='Latin1')