# Obtenção e organização dos dados

In [1]:
import pandas as pd

# Read the pipe-separated excerpts file and keep only the target label
# (DESCR_AREA) and the pre-filtered text (filtrado) columns.
df = pd.read_csv(
    '../dados/excertos_filtrados500.csv',
    sep='|',
).loc[:, ['DESCR_AREA', 'filtrado']]
# Preview the first rows.
df.head()

Unnamed: 0,DESCR_AREA,filtrado
0,Responsabilidade,voto cuidar auto tomada conta especial instaur...
1,Finanças Públicas,voto cuidar auto solicitação congresso naciona...
2,Responsabilidade,relatório tratar embargo declaração opor exemp...
3,Direito Processual,voto relação outro processo judiciais tratar r...
4,Pessoal,voto relativo ato envolver senhor caber rememo...


In [2]:
# Sanity check: (rows, columns) of the frame loaded above, which keeps two columns.
df.shape

(13285, 2)

In [3]:
from sklearn.preprocessing import LabelBinarizer

# Distinct, non-null area labels in sorted order, read straight from the column.
# This replaces the previous groupby(['DESCR_AREA']).groups.keys() round trip,
# whose key representation for a length-1 list grouper depends on the pandas
# version (scalar vs. tuple keys), while yielding the same set of labels.
areas = sorted(df['DESCR_AREA'].dropna().unique())

# Fit the binarizer on the label set, then one-hot encode every row of the corpus.
lbArea = LabelBinarizer()
y = lbArea.fit(areas).transform(df['DESCR_AREA'])

# Show the learned class order and the shape of the encoded target matrix.
lbArea.classes_, y.shape

(array(['Competência do TCU', 'Contrato Administrativo', 'Convênio',
        'Desestatização', 'Direito Processual', 'Finanças Públicas',
        'Gestão Administrativa', 'Licitação', 'Pessoal',
        'Responsabilidade'], dtype='<U23'), (13285, 10))

# Pré-processamento

In [4]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Progress message shown before the model is loaded.
print('\tCarregamento do modelo de acordaos...')
# Load the previously saved Word2Vec model from disk; its word vectors
# (`modelo.wv`) are read later when the embedding matrix is assembled.
modelo = Word2Vec.load('../vocabularios/modelo-acordaos.w2v')

	Carregamento do modelo de acordaos...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [7]:
import numpy as np
from keras.preprocessing.text import Tokenizer

# Sequence length used when padding texts, and width of each embedding row.
# NOTE(review): dim_vetor presumably matches the vector size of `modelo` - confirm.
limite_texto = 500
dim_vetor = 100

print('\tTokenizacao e montagem de sequencias...')
# Build the word -> integer index from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['filtrado'])
# One more row than there are indexed words (word_index is presumably 1-based,
# leaving row 0 unused by any word - verify against the Keras Tokenizer docs).
vocabulario = 1 + len(tokenizer.word_index)

sequences = tokenizer.texts_to_sequences(df['filtrado'])

print('\tMontagem da matriz de embeddings...')
# Start from zeros; rows of words missing from the Word2Vec model stay zero.
embedding_matrix = np.zeros((vocabulario, dim_vetor))
for termo, posicao in tokenizer.word_index.items():
    if termo not in modelo.wv:
        continue
    embedding_matrix[posicao] = modelo.wv[termo]

	Tokenizacao e montagem de sequencias...
	Montagem da matriz de embeddings...


# Treinamentos com Cross-validation

### Modelos a serem comparados:

* Teste 1 - Excertos filtrados (500) com rede recorrente
* Teste 2 - Excertos filtrados (500) com rede convolucional (camadas Conv1D seguidas de uma camada recorrente GRU)

In [8]:
# Columns of the per-fold score table: one per class (in binarizer order),
# followed by the aggregate rows produced by classification_report.
colunas_scores = [*lbArea.classes_, 'accuracy', 'macro avg', 'weighted avg']

# Human-readable names of the two architectures being compared, indexed by `alt`.
alternativas = [
    'rede recorrente sobre texto filtrado',
    'rede convolucional sobre texto filtrado',
]

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, GlobalMaxPooling1D, Flatten, GRU
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences

from keras import backend as K  # used only to reset backend state between folds


def _montar_modelo(alt):
    """Build and compile the network for alternative `alt`.

    `alt == 0` stacks two GRU layers on top of the embedding; any other value
    uses Conv1D -> MaxPooling1D -> Conv1D followed by a GRU. Both end in a
    softmax layer with one unit per column of `y`.

    Reads the notebook-level names `vocabulario`, `dim_vetor`, `limite_texto`,
    `embedding_matrix` and `y` defined in earlier cells.
    """
    model = Sequential()
    # The embedding starts from the pre-computed matrix and remains trainable.
    model.add(Embedding(vocabulario, dim_vetor, input_length=limite_texto, trainable=True, weights=[embedding_matrix]))

    if alt == 0:
        model.add(GRU(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
        model.add(GRU(64, dropout=0.2, recurrent_dropout=0.2))
    else:
        model.add(Conv1D(64, 7, activation='relu'))
        model.add(MaxPooling1D(5))
        model.add(Conv1D(32, 7, activation='relu'))
        model.add(GRU(256, dropout=0.2, recurrent_dropout=0.2))

    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['categorical_accuracy'])
    return model


# Tokenising, padding and label encoding do not depend on the fold, so they are
# done once for the whole corpus. Folds then slice these arrays by position,
# which is what KFold.split yields (the previous df.loc lookup was label-based
# and only worked because df carries a default 0..n-1 index).
x_todos = pad_sequences(tokenizer.texts_to_sequences(df['filtrado']), maxlen=limite_texto)
y_todos = lbArea.transform(df['DESCR_AREA'])
# Integer class ids in the same order as lbArea.classes_ / target_names.
indices_classes = list(range(len(lbArea.classes_)))

df_medias = pd.DataFrame()
for alt in range(0, 2):
    str_alt = str(alt)
    nomes_folds = []
    linhas_scores = []

    print('\n\nAlternativa', alt+1, '-', alternativas[alt])

    # Same seed for both alternatives, so they are compared on identical folds.
    kfold = KFold(n_splits=10, random_state=42, shuffle=True)
    for fold, (train_index, val_index) in enumerate(kfold.split(df)):
        str_fold = 'Fold ' + str(fold)
        print()
        print()
        print('Alternativa ' + alternativas[alt], '-', str_fold + ':')
        print('\tDefinicao de valores de entrada e saida da rede...')

        x_train, x_val = x_todos[train_index], x_todos[val_index]
        y_train, y_val = y_todos[train_index], y_todos[val_index]

        print('\tTreinamento da rede...')
        # Drop the graph/state left by the previous fold's model; otherwise the
        # 20 models built by this cell accumulate in the backend session.
        K.clear_session()
        model = _montar_modelo(alt)

        # Keep only the weights of the epoch with the best validation accuracy.
        # NOTE(review): the same validation fold selects the checkpoint and is
        # then used for scoring, so the reported scores may be optimistic.
        checkpoint_filename = 'weights' + str_alt + '-' + str(fold) + '.hdf5'
        checkpointer = ModelCheckpoint(filepath=checkpoint_filename, monitor='val_categorical_accuracy', verbose=1, save_best_only=True)
        model.fit(x_train, y_train, epochs=20, batch_size=32, validation_data=(x_val, y_val), verbose=1, shuffle=False, callbacks=[checkpointer])

        print('\n\tAvaliacao do melhor modelo e registro dos scores...')
        model.load_weights(checkpoint_filename)
        # argmax over the softmax output is what Sequential.predict_classes did;
        # that method is deprecated and absent from newer Keras releases.
        y_val_pred = model.predict(x_val, verbose=1).argmax(axis=-1)
        y_val_i = y_val.argmax(axis=1)
        # Passing `labels` keeps target_names aligned with the class ids even
        # when a fold happens to contain no example of some class.
        report = classification_report(y_val_i, y_val_pred, labels=indices_classes, target_names=lbArea.classes_, output_dict=True)

        # 'accuracy' is a plain number in the report; every other entry is a
        # dict from which the F1 score is taken.
        linhas_scores.append([
            report[col] if col == 'accuracy' else report[col]['f1-score']
            for col in colunas_scores
        ])
        nomes_folds.append(str_fold)

    # Assemble the per-fold table in one go (rows = folds, columns = scores).
    df_scores = pd.DataFrame(linhas_scores, index=nomes_folds, columns=colunas_scores)
    df_medias[alternativas[alt] + ' mean'] = df_scores.mean()
    df_medias[alternativas[alt] + ' std'] = df_scores.std()
df_medias.T



Alternativa 1 - rede recorrente sobre texto filtrado


Alternativa rede recorrente sobre texto filtrado - Fold 0:
	Definicao de valores de entrada e saida da rede...


W0321 16:14:39.879317 140033094072128 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0321 16:14:39.892505 140033094072128 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0321 16:14:39.894325 140033094072128 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0321 16:14:39.902577 140033094072128 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprec

	Treinamento da rede...


W0321 16:14:40.806172 140033094072128 deprecation.py:506] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0321 16:14:41.203860 140033094072128 deprecation_wrapper.py:119] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0321 16:14:41.302528 140033094072128 deprecation.py:323] From /home/leonardo/anaconda3/envs/gpu/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0,

Train on 11956 samples, validate on 1329 samples
Epoch 1/20

Epoch 00001: val_categorical_accuracy improved from -inf to 0.63055, saving model to weights0-0.hdf5
Epoch 2/20

Epoch 00002: val_categorical_accuracy improved from 0.63055 to 0.75847, saving model to weights0-0.hdf5
Epoch 3/20

Epoch 00003: val_categorical_accuracy improved from 0.75847 to 0.78254, saving model to weights0-0.hdf5
Epoch 4/20

Epoch 00004: val_categorical_accuracy improved from 0.78254 to 0.79458, saving model to weights0-0.hdf5
Epoch 5/20

Epoch 00005: val_categorical_accuracy improved from 0.79458 to 0.80888, saving model to weights0-0.hdf5
Epoch 6/20

Epoch 00006: val_categorical_accuracy improved from 0.80888 to 0.81339, saving model to weights0-0.hdf5
Epoch 7/20

Epoch 00007: val_categorical_accuracy improved from 0.81339 to 0.81490, saving model to weights0-0.hdf5
Epoch 8/20

Epoch 00008: val_categorical_accuracy did not improve from 0.81490
Epoch 9/20

Epoch 00009: val_categorical_accuracy did not impro


Epoch 00011: val_categorical_accuracy did not improve from 0.84951
Epoch 12/20

Epoch 00012: val_categorical_accuracy improved from 0.84951 to 0.85252, saving model to weights0-1.hdf5
Epoch 13/20

Epoch 00013: val_categorical_accuracy did not improve from 0.85252
Epoch 14/20

Epoch 00014: val_categorical_accuracy did not improve from 0.85252
Epoch 15/20

Epoch 00015: val_categorical_accuracy did not improve from 0.85252
Epoch 16/20

Epoch 00016: val_categorical_accuracy did not improve from 0.85252
Epoch 17/20

Epoch 00017: val_categorical_accuracy did not improve from 0.85252
Epoch 18/20

Epoch 00018: val_categorical_accuracy improved from 0.85252 to 0.85403, saving model to weights0-1.hdf5
Epoch 19/20

Epoch 00019: val_categorical_accuracy improved from 0.85403 to 0.85779, saving model to weights0-1.hdf5
Epoch 20/20

Epoch 00020: val_categorical_accuracy did not improve from 0.85779

	Avaliacao do melhor modelo e registro dos scores...


Alternativa rede recorrente sobre texto filtr

In [None]:
# Write the transposed summary (one row per "<alternative> mean" / "<alternative> std",
# one column per score) to a CSV file encoded as Latin-1.
df_medias.T.to_csv('scores_excertos_filtrados.csv', encoding = 'Latin1')