In [1]:
import pandas as pd

df = pd.read_csv('../dados/jurisprudencia_selecionada_enunciados.csv', sep = '|')
df.head()

Unnamed: 0,COD,NUM_ENUNCIADO,COD_AREA,DESCR_AREA,COD_TEMA,DESCR_TEMA,COD_SUBTEMA,DESCR_SUBTEMA,COD_DOC_TRAMITAVEL_ENUNCIADO,TEXTO_ENUNCIADO,ACORDAO,TIPO_PROCESSO
0,1400,1236,50,Responsabilidade,488,Solidariedade,261,Benefício previdenciário,54995437,Não comprovada a participação do beneficiário ...,Acórdão 297/2016 - PL,Tomada de Contas Especial
1,1700,1534,46,Finanças Públicas,981,Exportação,983,Petróleo,55025587,A operação ficta de exportação de plataformas ...,Acórdão 366/2016 - PL,Solicitação do Congresso Nacional
2,5700,5314,50,Responsabilidade,203,Multa,1021,Dosimetria,55455370,"No âmbito do TCU, a dosimetria da pena tem com...",Acórdão 944/2016 - PL,Acompanhamento
3,284,40,45,Direito Processual,162,Princípio da independência das instâncias,481,Decisão judicial,54773746,O princípio da independência das instâncias pe...,Acórdão 30/2016 - PL,Tomada de Contas Especial
4,298,54,49,Pessoal,141,Sistema S,142,Nepotismo,54773402,É vedado aos dirigentes das entidades do Siste...,Acórdão 55/2016 - PL,Representação


In [2]:
df.shape

(13312, 12)

In [3]:
areas = df.groupby(['DESCR_AREA']).groups.keys()
areas

dict_keys(['Competência do TCU', 'Contrato Administrativo', 'Convênio', 'Desestatização', 'Direito Processual', 'Finanças Públicas', 'Gestão Administrativa', 'Licitação', 'Pessoal', 'Responsabilidade'])

In [4]:
from sklearn.preprocessing import LabelBinarizer

lbArea = LabelBinarizer()
lbArea.fit([x for x in areas])
lbArea.classes_

array(['Competência do TCU', 'Contrato Administrativo', 'Convênio',
       'Desestatização', 'Direito Processual', 'Finanças Públicas',
       'Gestão Administrativa', 'Licitação', 'Pessoal',
       'Responsabilidade'], dtype='<U23')

In [5]:
y = lbArea.transform(df['DESCR_AREA'])
y.shape

(13312, 10)

In [6]:
from keras.preprocessing.text import Tokenizer
import numpy as np

vocabulario = 20000
limite_texto = 200
dim_vetor = 100

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['TEXTO_ENUNCIADO'])
vocabulario = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(df['TEXTO_ENUNCIADO'])

Using TensorFlow backend.


In [7]:
from keras.preprocessing.sequence import pad_sequences

x = pad_sequences(sequences, maxlen=limite_texto)

print('Shape of data tensor:', x.shape)

Shape of data tensor: (13312, 200)


In [8]:
colunas_scores = list(lbArea.classes_)
colunas_scores.extend(['accuracy', 'macro avg', 'weighted avg'])

In [30]:
alternativas = [None, 'sem pré-treino', 'NILC fixo', 'NILC variável', 'Acordãos fixo', 'Acordãos variável']

In [26]:
from keras.layers import Embedding
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

embeddings = [None]

print('Montando embedding sem pré-treino.')
embeddings.append(Embedding(vocabulario, dim_vetor, input_length=x.shape[1]))

print('Montando embeddings NILC:')
model_nilc = KeyedVectors.load_word2vec_format('../externos/model.txt')
embedding_matrix = np.zeros((vocabulario, dim_vetor))
ok = 0
for word, i in tokenizer.word_index.items():
    if word in model_nilc:
        embedding_matrix[i] = model_nilc[word]
        ok += 1
print('\tVocabulario:', i, ' - encontrados no modelo:', ok, '=', ok * 100. / i)
embeddings.append(Embedding(vocabulario, dim_vetor, weights=[embedding_matrix], input_length=limite_texto, trainable=False))
embeddings.append(Embedding(vocabulario, dim_vetor, weights=[embedding_matrix], input_length=limite_texto, trainable=True))

print('Montando embeddings Acordaos:')
model_acordaos = Word2Vec.load('../vocabularios/modelo-acordaos.w2v')
embedding_matrix = np.zeros((vocabulario, dim_vetor))
ok = 0
for word, i in tokenizer.word_index.items():
    if word in model_acordaos.wv:
        embedding_matrix[i] = model_acordaos.wv[word]
        ok += 1
print('\tVocabulario:', i, ' - encontrados no modelo:', ok, '=', ok * 100. / i)
embeddings.append(Embedding(vocabulario, dim_vetor, weights=[embedding_matrix], input_length=limite_texto, trainable=False))
embeddings.append(Embedding(vocabulario, dim_vetor, weights=[embedding_matrix], input_length=limite_texto, trainable=True))
embeddings

Montando embedding vazio
Montando embeddings NILC:


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


	Vocabulario: 15387  - encontrados no modelo: 13758 = 89.41314096315071
Montando embeddings Acordaos:
	Vocabulario: 15387  - encontrados no modelo: 14572 = 94.70332098524729


[None,
 <keras.layers.embeddings.Embedding at 0x7fa7b86f1ad0>,
 <keras.layers.embeddings.Embedding at 0x7fa4a289d890>,
 <keras.layers.embeddings.Embedding at 0x7fa4777b5a10>,
 <keras.layers.embeddings.Embedding at 0x7fa46cff1990>,
 <keras.layers.embeddings.Embedding at 0x7fa46cf75110>]

In [31]:
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense, GRU
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint

print('Realizando predições, calculando métricas F1 e médias entre folds:')
print()
df_medias = pd.DataFrame()
for alt in range(1, 6):
    str_alt = str(alt)
    fold = 0
    df_scores = pd.DataFrame()
    for _, val_index in KFold(n_splits=10, random_state=42, shuffle=True).split(df):
        str_fold = 'Fold ' + str(fold)
        print('Alternativa ' + alternativas[alt], '-', str_fold + ':')
        df_val = df.loc[val_index]
        sequences_val = tokenizer.texts_to_sequences(df_val['TEXTO_ENUNCIADO'])
        x_val = pad_sequences(sequences_val, maxlen=limite_texto)
        y_val = lbArea.transform(df_val['DESCR_AREA'])

        model = Sequential()
        model.add(embeddings[alt])
        model.add(GRU(256, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(y.shape[1], activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer=RMSprop(),  metrics=['categorical_accuracy'])

        checkpoint_filename = '../rascunho/weights' + str_alt + '-' + str(fold) + '.hdf5'
        model.load_weights(checkpoint_filename)
        y_val_pred = model.predict_classes(x_val, verbose=1)
        y_val_i = [list(x).index(1) for x in y_val]
        report = classification_report(y_val_i, y_val_pred, target_names=lbArea.classes_, output_dict = True)
        for col in colunas_scores:
            if col == 'accuracy':
                f = report[col]
            else:
                f = report[col]['f1-score']
            df_scores.loc[str_fold,col] = f
        fold += 1
    df_medias[alternativas[alt] + ' mean'] = df_scores.mean()
    df_medias[alternativas[alt] + ' std'] = df_scores.std()
df_medias.T

Realizando predições e calculando métricas F1:

Alternativa sem pré-treino - Fold 0:
Alternativa sem pré-treino - Fold 1:
Alternativa sem pré-treino - Fold 2:
Alternativa sem pré-treino - Fold 3:
Alternativa sem pré-treino - Fold 4:
Alternativa sem pré-treino - Fold 5:
Alternativa sem pré-treino - Fold 6:
Alternativa sem pré-treino - Fold 7:
Alternativa sem pré-treino - Fold 8:
Alternativa sem pré-treino - Fold 9:
Alternativa NILC fixo - Fold 0:
Alternativa NILC fixo - Fold 1:
Alternativa NILC fixo - Fold 2:
Alternativa NILC fixo - Fold 3:
Alternativa NILC fixo - Fold 4:
Alternativa NILC fixo - Fold 5:
Alternativa NILC fixo - Fold 6:
Alternativa NILC fixo - Fold 7:
Alternativa NILC fixo - Fold 8:
Alternativa NILC fixo - Fold 9:
Alternativa NILC variável - Fold 0:
Alternativa NILC variável - Fold 1:
Alternativa NILC variável - Fold 2:
Alternativa NILC variável - Fold 3:
Alternativa NILC variável - Fold 4:
Alternativa NILC variável - Fold 5:
Alternativa NILC variável - Fold 6:
Alternativ

Unnamed: 0,Competência do TCU,Contrato Administrativo,Convênio,Desestatização,Direito Processual,Finanças Públicas,Gestão Administrativa,Licitação,Pessoal,Responsabilidade,accuracy,macro avg,weighted avg
sem pré-treino mean,0.797666,0.747523,0.720248,0.52728,0.877759,0.708679,0.628035,0.897458,0.958174,0.85136,0.865084,0.771418,0.863183
sem pré-treino std,0.037814,0.035921,0.02763,0.116096,0.011972,0.09145,0.077369,0.019779,0.008048,0.02166,0.005609,0.02384,0.006723
NILC fixo mean,0.821883,0.773545,0.733745,0.714096,0.874737,0.719741,0.690521,0.905885,0.956254,0.860908,0.873799,0.805132,0.872855
NILC fixo std,0.030927,0.021191,0.039074,0.095857,0.019334,0.052992,0.100885,0.009339,0.008996,0.013273,0.007443,0.018528,0.00751
NILC variável mean,0.841308,0.827621,0.785071,0.754824,0.897497,0.778078,0.720142,0.92817,0.968839,0.882955,0.898063,0.83845,0.897843
NILC variável std,0.032815,0.018414,0.033271,0.124064,0.017092,0.040571,0.064021,0.010439,0.005044,0.01566,0.006115,0.018102,0.005836
Acordãos fixo mean,0.830313,0.80497,0.786595,0.766981,0.894726,0.750545,0.705421,0.922781,0.96692,0.88228,0.893255,0.831153,0.89261
Acordãos fixo std,0.03659,0.030481,0.041455,0.108615,0.017459,0.045065,0.067295,0.017965,0.008324,0.014056,0.008438,0.018611,0.00851
Acordãos variável mean,0.842329,0.835885,0.783561,0.776252,0.897609,0.789944,0.725477,0.927059,0.969482,0.887001,0.900242,0.84346,0.899613
Acordãos variável std,0.046628,0.025838,0.032194,0.112356,0.012126,0.050179,0.075378,0.011686,0.007539,0.015987,0.00745,0.02231,0.007619
