# Obtenção e organização dos dados

In [1]:
import pandas as pd

# Load the excerpt dataset and keep only the label column and the text column.
# NOTE(review): the output saved with this cell is a ParserError ("Expected 1 fields in
# line 349, saw 6"), so ';' may not match the file's real layout - confirm the
# delimiter/quoting of the CSV before relying on this cell.
caminho_excertos = '../dados/jurisprudencia_selecionada_excertos.CSV'
colunas_usadas = ['DESCR_AREA', 'TEXTO_EXCERTO']
df = pd.read_csv(caminho_excertos, sep=';')[colunas_usadas]
df.head()

ParserError: Error tokenizing data. C error: Expected 1 fields in line 349, saw 6


In [None]:
# Display (number of rows, number of columns) of the loaded frame as a quick sanity check.
df.shape

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Binarize the area labels: fit the encoder on the distinct labels, then encode every row.
# The labels are read straight from the column instead of going through
# groupby([...]).groups.keys(): the key type produced for a one-element list grouper
# depends on the pandas version, while the column values are always the raw labels.
# Missing values are dropped, as the groupby-based version did implicitly.
areas = sorted(df['DESCR_AREA'].dropna().unique())
lbArea = LabelBinarizer()
lbArea.fit(areas)
y = lbArea.transform(df['DESCR_AREA'])
# Show the learned classes and the shape of the encoded target matrix.
lbArea.classes_, y.shape

# Pré-processamento

In [None]:
# Names of the scores recorded per fold: one entry per class label, followed by the
# aggregate entries looked up in the classification report.
colunas_scores = [*lbArea.classes_, 'accuracy', 'macro avg', 'weighted avg']

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Load the pre-trained word2vec model from disk.
caminho_modelo = '../vocabularios/modelo-acordaos-50.w2v'
print('\tCarregamento do modelo de acordaos...')
modelo = Word2Vec.load(caminho_modelo)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Per-fold scores are accumulated in this frame by the cross-validation loop.
df_scores = pd.DataFrame()

# Maximum sequence length used when padding/truncating the tokenized texts.
limite_texto = 2000
# The embedding width is read from the loaded model instead of being hard-coded.
# The previous literal (100) disagrees with the "-50" in the model file name, and any
# mismatch makes the row assignment into embedding_matrix below fail with a shape error.
dim_vetor = modelo.wv.vector_size

print('\tTokenizacao e montagem de sequencias...')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['TEXTO_EXCERTO'])
# word_index is 1-based (index 0 is left for padding), hence the + 1.
vocabulario = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(df['TEXTO_EXCERTO'])

print('\tMontagem da matriz de embeddings...')
# One row per tokenizer index; words missing from the word2vec vocabulary keep a zero row.
embedding_matrix = np.zeros((vocabulario, dim_vetor))
for word, i in tokenizer.word_index.items():
    if word in modelo.wv:
        embedding_matrix[i] = modelo.wv[word]

# Treinamentos com Cross-validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, GlobalMaxPooling1D, Flatten, GRU
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint

# 10-fold cross-validation. For every fold: train a fresh Conv1D + GRU classifier on top of
# the pre-trained embeddings, restore the checkpoint with the best validation accuracy and
# store the per-class F1 scores plus the aggregate scores in df_scores (one row per fold).
# Integer ids of all classes, in the column order produced by lbArea.
class_ids = np.arange(len(lbArea.classes_))
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, val_index) in enumerate(kfold.split(df)):
    str_fold = 'Fold ' + str(fold)
    print()
    print()
    print(str_fold + ':')
    print('\tDefinicao de valores de entrada e saida da rede...')

    # KFold yields positional indices, so rows are selected by position (.iloc);
    # label-based .loc only gives the same rows while df keeps a default RangeIndex.
    df_train = df.iloc[train_index]
    df_val = df.iloc[val_index]

    sequences_train = tokenizer.texts_to_sequences(df_train['TEXTO_EXCERTO'])
    sequences_val = tokenizer.texts_to_sequences(df_val['TEXTO_EXCERTO'])

    x_train = pad_sequences(sequences_train, maxlen=limite_texto)
    x_val = pad_sequences(sequences_val, maxlen=limite_texto)

    y_train = lbArea.transform(df_train['DESCR_AREA'])
    y_val = lbArea.transform(df_val['DESCR_AREA'])

    print('\tTreinamento da rede...')
    model = Sequential()
    model.add(Embedding(vocabulario, dim_vetor, input_length=limite_texto, trainable=True, weights=[embedding_matrix]))
    model.add(Conv1D(32, 7, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Conv1D(32, 7, activation='relu'))
    model.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))
    # Output width follows the encoded targets of this fold (one unit per target column).
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['categorical_accuracy'])

    # Keep only the epoch with the best validation accuracy for this fold.
    checkpoint_filename = 'weights' + str(fold) + '.hdf5'
    checkpointer = ModelCheckpoint(filepath=checkpoint_filename, monitor='val_categorical_accuracy', verbose=1, save_best_only=True)
    model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val), verbose=1, shuffle=False, callbacks=[checkpointer])

    print('\n\tAvaliacao do melhor modelo and registro dos scores...')
    model.load_weights(checkpoint_filename)
    # Sequential.predict_classes is not available in recent Keras releases; taking the
    # argmax of the predicted probabilities yields the same class ids.
    y_val_pred = np.argmax(model.predict(x_val, verbose=1), axis=-1)
    # Convert each one-hot target row back to its class id.
    y_val_i = [list(x).index(1) for x in y_val]
    # `labels` pins the report to the full class list, so a fold whose validation split
    # lacks some class does not break the pairing with target_names.
    report = classification_report(y_val_i, y_val_pred, labels=class_ids, target_names=lbArea.classes_, output_dict=True)
    for col in colunas_scores:
        if col == 'accuracy':
            # When `labels` is passed, some scikit-learn versions emit 'micro avg' instead
            # of 'accuracy'; with every class listed its F1 equals the accuracy.
            f = report['accuracy'] if 'accuracy' in report else report['micro avg']['f1-score']
        else:
            f = report[col]['f1-score']
        df_scores.loc[str_fold, col] = f

# Exportação dos resultados

In [None]:
# Summarise the cross-validation: mean and standard deviation of every score column
# across folds, displayed with the two statistics as rows.
df_medias = pd.DataFrame({'mean': df_scores.mean(), 'std': df_scores.std()})
df_medias.T

In [None]:
# Write the transposed summary (rows: mean / std) to a CSV file, encoded as Latin-1.
df_medias.T.to_csv('scores_excerto_original.csv', encoding = 'Latin1')