In [1]:
import pandas as pd

# Load the selected-jurisprudence statements ("enunciados"); the file is pipe-delimited.
caminho_enunciados = '../dados/jurisprudencia_selecionada_enunciados.csv'
df = pd.read_csv(caminho_enunciados, sep='|')
df.head()

Unnamed: 0,COD,NUM_ENUNCIADO,COD_AREA,DESCR_AREA,COD_TEMA,DESCR_TEMA,COD_SUBTEMA,DESCR_SUBTEMA,COD_DOC_TRAMITAVEL_ENUNCIADO,TEXTO_ENUNCIADO,ACORDAO,TIPO_PROCESSO
0,1400,1236,50,Responsabilidade,488,Solidariedade,261,Benefício previdenciário,54995437,Não comprovada a participação do beneficiário ...,Acórdão 297/2016 - PL,Tomada de Contas Especial
1,1700,1534,46,Finanças Públicas,981,Exportação,983,Petróleo,55025587,A operação ficta de exportação de plataformas ...,Acórdão 366/2016 - PL,Solicitação do Congresso Nacional
2,5700,5314,50,Responsabilidade,203,Multa,1021,Dosimetria,55455370,"No âmbito do TCU, a dosimetria da pena tem com...",Acórdão 944/2016 - PL,Acompanhamento
3,284,40,45,Direito Processual,162,Princípio da independência das instâncias,481,Decisão judicial,54773746,O princípio da independência das instâncias pe...,Acórdão 30/2016 - PL,Tomada de Contas Especial
4,298,54,49,Pessoal,141,Sistema S,142,Nepotismo,54773402,É vedado aos dirigentes das entidades do Siste...,Acórdão 55/2016 - PL,Representação


In [2]:
# (rows, columns) of the loaded table.
df.shape

(13312, 12)

In [3]:
# Distinct values of DESCR_AREA, i.e. the target classes of the classifier.
# Group by the column name itself rather than a one-element list: with a
# length-1 list key, newer pandas versions deprecate scalar group keys in
# favour of 1-tuples, which would no longer match the plain strings stored
# in the column when the label binarizer is fitted on `areas`.
areas = df.groupby('DESCR_AREA').groups.keys()
areas

dict_keys(['Competência do TCU', 'Contrato Administrativo', 'Convênio', 'Desestatização', 'Direito Processual', 'Finanças Públicas', 'Gestão Administrativa', 'Licitação', 'Pessoal', 'Responsabilidade'])

In [4]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encoder for the area labels, fitted on the distinct area names.
# `fit` returns the estimator itself, so construction and fitting can be chained.
lbArea = LabelBinarizer().fit(list(areas))
lbArea.classes_

array(['Competência do TCU', 'Contrato Administrativo', 'Convênio',
       'Desestatização', 'Direito Processual', 'Finanças Públicas',
       'Gestão Administrativa', 'Licitação', 'Pessoal',
       'Responsabilidade'], dtype='<U23')

In [5]:
# One-hot target matrix: one row per statement, one column per class in lbArea.classes_.
rotulos_area = df['DESCR_AREA']
y = lbArea.transform(rotulos_area)
y.shape

(13312, 10)

In [28]:
from keras.preprocessing.text import Tokenizer
import numpy as np

# Hyper-parameters shared by every model trained in this notebook.
vocabulario = 20000   # input_dim of the Embedding layers (number of distinct token ids)
limite_texto = 200    # every sequence is padded/truncated to this many tokens
dim_vetor = 100       # output dimension of the Embedding layers

# Cap the tokenizer at `vocabulario` words. Without `num_words` the tokenizer
# assigns an id to every distinct word in the corpus, and any id >= vocabulario
# would index outside the Embedding(vocabulario, ...) table used by the models.
# With the cap, texts_to_sequences only emits ids below `vocabulario`
# (rarer words are dropped).
tokenizer = Tokenizer(num_words=vocabulario)
tokenizer.fit_on_texts(df['TEXTO_ENUNCIADO'])
sequences = tokenizer.texts_to_sequences(df['TEXTO_ENUNCIADO'])

In [29]:
from keras.preprocessing.sequence import pad_sequences

# Turn the variable-length id lists into one fixed-width integer matrix
# (limite_texto columns per statement).
x = pad_sequences(sequences=sequences, maxlen=limite_texto)

tensor_shape = x.shape
print('Shape of data tensor:', tensor_shape)

Shape of data tensor: (13312, 200)


In [None]:
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint

# 10-fold cross-validation of a GRU classifier whose embedding layer is learned
# from scratch (no pre-trained vectors).
#
# The folds slice the padded matrix `x` and the one-hot labels `y` computed in
# the cells above instead of re-tokenising the texts on every fold: the
# tokenizer is fitted once on the whole corpus and padding is applied row by
# row, so the slices equal what per-fold tokenisation would produce. Positional
# slicing also matches what KFold yields (positions, not index labels).
scores = []  # best validation categorical accuracy of each fold
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, val_index) in enumerate(kfold.split(x)):
    print()
    print()
    print('=======================================================================================================')
    print('===             TREINAMENTO GRU (dropout .2) COM EMBEDDING SEM PRÉ-TREINO - FOLD', fold, '/ 10             ===')
    print('=======================================================================================================')
    print()
    x_train, x_val = x[train_index], x[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # A fresh model per fold so no weights leak between folds.
    model = Sequential()
    model.add(Embedding(vocabulario, dim_vetor, input_length=x.shape[1]))
    model.add(GRU(256, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',  metrics=['categorical_accuracy'])

    # Keep only the weights of the epoch with the best validation accuracy.
    checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', monitor='val_categorical_accuracy', verbose=1, save_best_only=True)
    model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_val, y_val), verbose=1, shuffle=False, callbacks=[checkpointer])

    print('Evaluating best model and registering score:')

    # Restore the best epoch before scoring; evaluate() returns [loss, categorical_accuracy].
    model.load_weights('/tmp/weights.hdf5')
    score = model.evaluate(x_val, y_val)
    print(model.metrics_names[1], '=', score[1])
    scores.append(score[1])

# Show the accuracy of every fold (`score` alone is only the last fold's [loss, accuracy]).
scores



===             TREINAMENTO GRU (dropout .2) COM EMBEDDING SEM PRÉ-TREINO - FOLD 0 / 10             ===

Train on 11980 samples, validate on 1332 samples
Epoch 1/5

Epoch 00001: val_categorical_accuracy improved from -inf to 0.48574, saving model to /tmp/weights.hdf5
Epoch 2/5

Epoch 00002: val_categorical_accuracy improved from 0.48574 to 0.59084, saving model to /tmp/weights.hdf5
Epoch 3/5

In [None]:
# Start the results table: one row per fold, one column per experiment.
# Fixes relative to the previous version:
#   * the class is `pd.DataFrame` (`pd.Dataframe` raises AttributeError);
#   * the per-fold accuracies live in `scores`; `score` only holds the last
#     fold's [loss, accuracy] pair;
#   * a flat list of column names avoids creating a MultiIndex, so the later
#     `df['<experiment>'] = ...` assignments add sibling columns.
# NOTE: from here on `df` is the results table, no longer the enunciados table;
# the original texts and labels must not be read from `df` after this cell.
df = pd.DataFrame(data=scores, columns=['sem pré-treino'])

In [None]:
from gensim.models import KeyedVectors

# Load the pre-trained word vectors (word2vec text format).
# NOTE: the training loops below rebind `model` to a Keras model, so everything
# they need from the vectors is materialised into `embedding_matrix` here.
model = KeyedVectors.load_word2vec_format('../externos/model.txt')

# The Embedding layers are declared with `dim_vetor` columns; fail early with a
# clear message if the file holds vectors of another size.
if model.vector_size != dim_vetor:
    raise ValueError(
        'Pre-trained vectors have dimension %d, but dim_vetor is %d'
        % (model.vector_size, dim_vetor)
    )

# Row i holds the pre-trained vector of the word the tokenizer maps to id i.
# Row 0 (padding), ids >= vocabulario and words missing from the pre-trained
# vocabulary keep an all-zero vector.
embedding_matrix = np.zeros((vocabulario, dim_vetor))
for palavra, indice in tokenizer.word_index.items():
    if indice < vocabulario and palavra in model:
        embedding_matrix[indice] = model[palavra]

In [None]:
from keras.layers import Embedding

# 10-fold cross-validation of the GRU classifier with the embedding layer
# initialised from `embedding_matrix` and frozen (trainable=False).
#
# The folds slice the padded matrix `x` and the one-hot labels `y` built near
# the top of the notebook. Reading the texts from `df` is not possible at this
# point because `df` has been rebound to the per-fold results table.
score = []  # best validation categorical accuracy of each fold
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, val_index) in enumerate(kfold.split(x)):
    print()
    print()
    print('=======================================================================================================')
    print('===            TREINAMENTO GRU (dropout .2) COM EMBEDDING TREINO NILC FIXO - FOLD', fold, '/ 10         ===')
    print('=======================================================================================================')
    print()
    x_train, x_val = x[train_index], x[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # A fresh model per fold so no weights leak between folds.
    model = Sequential()
    model.add(Embedding(vocabulario, dim_vetor, weights=[embedding_matrix], input_length=limite_texto, trainable=False))
    model.add(GRU(256, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',  metrics=['categorical_accuracy'])

    # Select the best epoch by validation accuracy, the same criterion used by
    # the experiment without pre-training (the default would be val_loss).
    checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', monitor='val_categorical_accuracy', verbose=1, save_best_only=True)
    model.fit(x_train, y_train, epochs=20, batch_size=32, validation_data=(x_val, y_val), verbose=1, shuffle=False, callbacks=[checkpointer])
    model.load_weights('/tmp/weights.hdf5')

    # evaluate() returns [loss, categorical_accuracy]; keep only the accuracy so
    # each fold contributes a single number to the results table.
    score.append(model.evaluate(x_val, y_val)[1])
score

In [None]:
# Add the values collected in `score` by the preceding loop as column 'NILC fixo' of `df`.
df['NILC fixo'] = score

In [None]:
from keras.layers import Embedding

# 10-fold cross-validation of the GRU classifier with the embedding layer
# initialised from `embedding_matrix` and fine-tuned during training
# (trainable=True).
#
# The folds slice the padded matrix `x` and the one-hot labels `y` built near
# the top of the notebook. Reading the texts from `df` is not possible at this
# point because `df` has been rebound to the per-fold results table.
score = []  # best validation categorical accuracy of each fold
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, val_index) in enumerate(kfold.split(x)):
    print()
    print()
    print('=======================================================================================================')
    print('===          TREINAMENTO GRU (dropout .2) COM EMBEDDING TREINO NILC VARIAVEL - FOLD', fold, '/ 10       ===')
    print('=======================================================================================================')
    print()
    x_train, x_val = x[train_index], x[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # A fresh model per fold so no weights leak between folds.
    model = Sequential()
    model.add(Embedding(vocabulario, dim_vetor, weights=[embedding_matrix], input_length=limite_texto, trainable=True))
    model.add(GRU(256, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',  metrics=['categorical_accuracy'])

    # Select the best epoch by validation accuracy, the same criterion used by
    # the experiment without pre-training (the default would be val_loss).
    checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', monitor='val_categorical_accuracy', verbose=1, save_best_only=True)
    model.fit(x_train, y_train, epochs=20, batch_size=32, validation_data=(x_val, y_val), verbose=1, shuffle=False, callbacks=[checkpointer])
    model.load_weights('/tmp/weights.hdf5')

    # evaluate() returns [loss, categorical_accuracy]; keep only the accuracy so
    # each fold contributes a single number to the results table.
    score.append(model.evaluate(x_val, y_val)[1])
score

In [None]:
# Add the values collected in `score` by the preceding loop as column 'NILC variavel' of `df`.
df['NILC variavel'] = score

In [None]:
# `Word2Vec` is not imported anywhere else in the notebook; import it next to its use.
from gensim.models import Word2Vec

# Load the word2vec model stored under ../vocabularios.
# NOTE: the training loops below rebind `model` to a Keras model, so everything
# they need from the vectors is materialised into `embedding_matrix` here.
model = Word2Vec.load('../vocabularios/modelo-acordaos2.w2v')
vetores_acordaos = model.wv  # the KeyedVectors part: word -> vector lookup

# The Embedding layers are declared with `dim_vetor` columns; fail early with a
# clear message if the stored vectors have another size.
if vetores_acordaos.vector_size != dim_vetor:
    raise ValueError(
        'Pre-trained vectors have dimension %d, but dim_vetor is %d'
        % (vetores_acordaos.vector_size, dim_vetor)
    )

# Rebuild `embedding_matrix` from THIS model; otherwise the following
# experiments would keep using whatever matrix was built before.
# Row i holds the vector of the word the tokenizer maps to id i; row 0
# (padding), ids >= vocabulario and out-of-vocabulary words stay all-zero.
embedding_matrix = np.zeros((vocabulario, dim_vetor))
for palavra, indice in tokenizer.word_index.items():
    if indice < vocabulario and palavra in vetores_acordaos:
        embedding_matrix[indice] = vetores_acordaos[palavra]

In [None]:
from keras.layers import Embedding

# 10-fold cross-validation of the GRU classifier with the embedding layer
# initialised from `embedding_matrix` and frozen (trainable=False).
#
# The folds slice the padded matrix `x` and the one-hot labels `y` built near
# the top of the notebook. Reading the texts from `df` is not possible at this
# point because `df` has been rebound to the per-fold results table.
score = []  # best validation categorical accuracy of each fold
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, val_index) in enumerate(kfold.split(x)):
    print()
    print()
    print('=======================================================================================================')
    print('===          TREINAMENTO GRU (dropout .2) COM EMBEDDING TREINO ACORDAOS FIXO - FOLD', fold, '/ 10       ===')
    print('=======================================================================================================')
    print()
    x_train, x_val = x[train_index], x[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # A fresh model per fold so no weights leak between folds.
    model = Sequential()
    model.add(Embedding(vocabulario, dim_vetor, weights=[embedding_matrix], input_length=limite_texto, trainable=False))
    model.add(GRU(256, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',  metrics=['categorical_accuracy'])

    # Select the best epoch by validation accuracy, the same criterion used by
    # the experiment without pre-training (the default would be val_loss).
    checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', monitor='val_categorical_accuracy', verbose=1, save_best_only=True)
    model.fit(x_train, y_train, epochs=20, batch_size=32, validation_data=(x_val, y_val), verbose=1, shuffle=False, callbacks=[checkpointer])
    model.load_weights('/tmp/weights.hdf5')

    # evaluate() returns [loss, categorical_accuracy]; keep only the accuracy so
    # each fold contributes a single number to the results table.
    score.append(model.evaluate(x_val, y_val)[1])
score

In [None]:
# Add the values collected in `score` by the preceding loop as column 'Acordaos fixo' of `df`.
df['Acordaos fixo'] = score

In [None]:
from keras.layers import Embedding

# 10-fold cross-validation of the GRU classifier with the embedding layer
# initialised from `embedding_matrix` and fine-tuned during training
# (trainable=True).
#
# The folds slice the padded matrix `x` and the one-hot labels `y` built near
# the top of the notebook. Reading the texts from `df` is not possible at this
# point because `df` has been rebound to the per-fold results table.
score = []  # best validation categorical accuracy of each fold
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, val_index) in enumerate(kfold.split(x)):
    print()
    print()
    print('=======================================================================================================')
    print('===     TREINAMENTO GRU (dropout .2) COM EMBEDDING TREINO ACORDAOS VARIAVEL - FOLD', fold, '/ 10     ===')
    print('=======================================================================================================')
    print()
    x_train, x_val = x[train_index], x[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # A fresh model per fold so no weights leak between folds.
    model = Sequential()
    model.add(Embedding(vocabulario, dim_vetor, weights=[embedding_matrix], input_length=limite_texto, trainable=True))
    model.add(GRU(256, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',  metrics=['categorical_accuracy'])

    # Select the best epoch by validation accuracy, the same criterion used by
    # the experiment without pre-training (the default would be val_loss).
    checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', monitor='val_categorical_accuracy', verbose=1, save_best_only=True)
    model.fit(x_train, y_train, epochs=20, batch_size=32, validation_data=(x_val, y_val), verbose=1, shuffle=False, callbacks=[checkpointer])
    model.load_weights('/tmp/weights.hdf5')

    # evaluate() returns [loss, categorical_accuracy]; keep only the accuracy so
    # each fold contributes a single number to the results table.
    score.append(model.evaluate(x_val, y_val)[1])
score

In [None]:
# Add the values collected in `score` by the preceding loop as column 'Acordao variavel' of `df`.
df['Acordao variavel'] = score

In [None]:
# Display `df` with the columns added by the cells above.
df

In [None]:
# Per-column summary statistics of `df`.
df.describe()