# Criação e treinamento do modelo de classificação

## Imports e definição de funções

In [1]:
import os
import pickle
import sklearn
import random
import nltk
import numpy as np
import tflearn
import tensorflow as tf
from sklearn.model_selection import train_test_split
from collections import Counter
# Helper that pickles a list into a binary file
def save_list_to_file(list_to_save,file_name):
    """Serialize ``list_to_save`` with pickle into the file ``file_name``.

    The file is opened in ``'wb'`` mode, so an existing file with the same
    name is overwritten. Returns ``None``.
    """
    handle = open(file_name, 'wb')
    try:
        pickle.dump(list_to_save, handle)
    finally:
        # Always release the file handle, even if pickling fails
        handle.close()
# Helper that restores a pickled list from a binary file
def load_list_from_file(file_name):
    """Return the object unpickled from the file ``file_name``.

    The file is opened in ``'rb'`` mode. Unpickling can execute arbitrary
    code, so this should only be used on files from a trusted source.
    """
    handle = open(file_name, 'rb')
    try:
        loaded = pickle.load(handle)
    finally:
        # Always release the file handle, even if unpickling fails
        handle.close()
    return loaded
print ('Importações realizadas com sucesso')

Importações realizadas com sucesso


In [2]:
class EarlyStoppingCallback(tflearn.callbacks.Callback):
    """Stop training once a minimum epoch and a minimum accuracy are both reached.

    Training is interrupted by raising ``StopIteration`` from ``on_epoch_end``;
    the code calling ``model.fit`` is expected to catch that exception.

    Note that, despite the ``val_`` prefix of the parameters, the accuracy
    that is checked is ``training_state.acc_value`` (the accuracy reported
    during training), not a validation accuracy.

    Args:
        val_epoch_thresh: epoch number from which stopping is allowed.
        val_acc_thresh: accuracy that must be reached for training to stop.
    """
    def __init__(self, val_epoch_thresh, val_acc_thresh):
        super().__init__()
        self.val_epoch_thresh = val_epoch_thresh
        self.val_acc_thresh = val_acc_thresh
    def on_epoch_end(self, training_state):
        """Print the epoch accuracy and raise ``StopIteration`` when both thresholds are met."""
        accuracy = training_state.acc_value
        print("Epoch ", training_state.epoch, " with Accuracy ", accuracy)
        # acc_value may be None when no metric was computed (presumably when
        # fit() runs without show_metric=True); comparing None with a float
        # would raise TypeError, so simply keep training in that case.
        if accuracy is None:
            return
        if training_state.epoch >= self.val_epoch_thresh and accuracy >= self.val_acc_thresh:
            raise StopIteration
    def on_train_end(self, training_state):
        """Print the accuracy recorded when training finishes."""
        print("Successfully left training! Final model accuracy:", training_state.acc_value)

## Parâmetros de funcionamento

In [13]:
# Run configuration. Several of these values are also embedded in the
# TensorBoard log directory and in the name of the saved model.
# Dataset tag (only used to build output paths/names)
DATASET = 'ds4'
# Number of most frequent n-grams kept as the vocabulary
LIMITE = 20000
# Passed to model.fit as n_epoch (upper bound of training epochs)
STEPS = 1000
# Key selecting which network architecture is built ('net1' ... 'net6')
REDE = 'net1'
# Passed to model.fit as batch_size
BATCH = 10
# Passed to EarlyStoppingCallback as val_epoch_thresh
MIN_EPOCHS = 25
# Presumably the accuracy threshold for early stopping -- verify that the
# training cell actually passes this value to EarlyStoppingCallback
MIN_ACCURACY = 0.60
print ('Parâmetros setados')

Parâmetros setados


## Carga dos arquivos frutos do pré-processamento

In [9]:
# Load the three artifacts written by the preprocessing step
categories, all_ngrams, docs = (
    load_list_from_file(pickle_name)
    for pickle_name in ('categories.pickle', 'all_ngrams.pickle', 'docs.pickle')
)
print ('\nCarregou ',len(categories),' categorias: ',categories)
print ('\nCarregou ',len(docs),' Docs (primeiro): ',docs[:1])
# Count how often each n-gram occurs, keep only the LIMITE most frequent
# ones (each appears once) and store them in sorted order
freq = Counter(all_ngrams)
all_ngrams = sorted({ngram for ngram, _occurrences in freq.most_common(LIMITE)})
print ('\nLimitando para apenas ',len(all_ngrams),' n-grams mais frequentes.')
print ('\nCarregou ',len(all_ngrams),' N-Grams (primeiros): ',all_ngrams[:200])


Carregou  11  categorias:  ['atx_precricao', 'atx_decadencia', 'atx_jurisprudencia', 'atx_duplavisita', 'atx_criteriojuridico', 'atx_atenuacao', 'atx_principios', 'atx_retroatividade', 'atx_nulidade', 'atx_intimprevia', 'atx_denuncia_espontanea']

Carregou  1577  Docs (primeiro):  [([('empres', 'acim'), ('acim', 'entregou'), ('entregou', 'espontanea'), ('espontanea', 'atraso'), ('atraso', 'gfip'), ('gfip', 'competência'), ('competência', '2010'), ('2010', '2010'), ('2010', 'receit'), ('receit', 'brasil'), ('brasil', 'multou'), ('multou', 'mesm'), ('mesm', 'const'), ('const', 'auto'), ('auto', 'infração'), ('infração', 'pedindo'), ('pedindo', 'impugnação'), ('impugnação', 'pagamento'), ('pagamento', 'desconto'), ('desconto', 'mesm'), ('mesm', 'trint'), ('trint', 'direito'), ('direito', 'prelimin'), ('prelimin', 'viemo'), ('viemo', 'atravé'), ('atravé', 'dest'), ('dest', 'solicit'), ('solicit', 'impugnação'), ('impugnação', 'auto'), ('auto', 'infração'), ('infração', 'declaração'), ('de

## Criação do bag of n-grams para cada documento

In [5]:
# Build one [bag of n-grams, label vector, text] row per document
training_test = []
count = 0
print('Iniciando montagem de',len(docs),'documentos')
for count, doc in enumerate(docs, start=1):
    if count % 100 == 0:
        print('   processando documento',count)
    # Each doc is indexed as (n-grams, categories, text).
    # A set makes the membership test below O(1) per vocabulary entry instead
    # of scanning the whole document for each of them; this requires the
    # n-grams to be hashable (all_ngrams already went through a Counter).
    ngrs = set(doc[0])
    cats = doc[1]
    txt = doc[2]
    # Binary bag of n-grams: 1 when the vocabulary entry occurs in the document
    bow = [1 if w in ngrs else 0 for w in all_ngrams]
    # Multi-hot label vector, aligned with the order of `categories`
    out = [1 if c in cats else 0 for c in categories]
    training_test.append([bow, out, txt])
print('Criou ',len(training_test),' dados para treinamento e teste')
# Keep a real (shallow) copy: a plain alias would be reordered together with
# training_test by any later in-place shuffle
training_test_backup = list(training_test)

Iniciando montagem de 1577 documentos
   processando documento 100
   processando documento 200
   processando documento 300
   processando documento 400
   processando documento 500
   processando documento 600
   processando documento 700
   processando documento 800
   processando documento 900
   processando documento 1000
   processando documento 1100
   processando documento 1200
   processando documento 1300
   processando documento 1400
   processando documento 1500
Criou  1577  dados para treinamento e teste


## Separação dos dados de treinamento e teste

In [6]:
# Shuffle the rows and split them into training and test sets.
# A fixed seed keeps the shuffle (and therefore the split) reproducible.
SPLIT_SEED = 42
# Shuffle a copy so the source rows are not reordered in place; this also
# keeps the cell safe to re-run after training_test has become an ndarray.
shuffled_rows = list(training_test)
random.Random(SPLIT_SEED).shuffle(shuffled_rows)
# Rows hold [list, list, str] of different lengths, so the array must be built
# explicitly with dtype=object (recent NumPy rejects ragged input otherwise).
training_test = np.array(shuffled_rows, dtype=object)
# X holds the bags of n-grams, y the label vectors and z the document texts
X = list(training_test[:, 0])
y = list(training_test[:, 1])
z = list(training_test[:, 2])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=SPLIT_SEED)
print(' Train data: X_train(',len(X_train),'), y_train(',len(y_train),')')
print(' Test data: X_test(',len(X_test),'), y_test(',len(y_test),')')
print('Exemplo de X_train: ',X_train[0][:100],'...')
print('Exemplo de y_train: ',y_train[0])

 Train data: X_train( 1056 ), y_train( 1056 )
 Test data: X_test( 521 ), y_test( 521 )
Exemplo de X_train:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ...
Exemplo de y_train:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]


## Criação de uma Rede Neural DNN para ser treinada

In [23]:
# Reset the default TensorFlow graph before building the network
tf.reset_default_graph()

# Hidden-layer sizes of the fully connected architectures, keyed by REDE
DENSE_HIDDEN_UNITS = {
    'net1': [8, 8],
    'net2': [16, 16],
    'net3': [8, 8, 8],
    'net4': [100, 50, 25],
}
# Architectures built as embedding -> LSTM -> dropout (net5 and net6 are identical)
LSTM_NETS = ('net5', 'net6')

# Input width is the bag-of-n-grams length; output width is the number of labels
n_inputs = len(X_train[0])
n_outputs = len(y_train[0])

if REDE in DENSE_HIDDEN_UNITS:
    net = tflearn.input_data(shape=[None, n_inputs])
    for units in DENSE_HIDDEN_UNITS[REDE]:
        net = tflearn.fully_connected(net, units)
    net = tflearn.fully_connected(net, n_outputs, activation='sigmoid')
    # NOTE(review): the dense variants call regression() with the library
    # defaults while net5/net6 pass loss='binary_crossentropy' explicitly --
    # confirm that the default loss suits multi-label sigmoid outputs.
    net = tflearn.regression(net)
elif REDE in LSTM_NETS:
    net = tflearn.input_data(shape=[None, n_inputs])
    net = tflearn.embedding(net, input_dim=n_inputs, output_dim=n_outputs)
    net = tflearn.lstm(net, 8)
    net = tflearn.dropout(net, 0.8)
    net = tflearn.fully_connected(net, n_outputs, activation='sigmoid')
    net = tflearn.regression(net, optimizer='adam',loss='binary_crossentropy')
else:
    # Fail early with a clear message instead of handing net=None to tflearn.DNN
    raise ValueError('Unknown REDE %r; expected one of %s'
                     % (REDE, sorted(list(DENSE_HIDDEN_UNITS) + list(LSTM_NETS))))
# Wrap the network in a model and configure TensorBoard logging
print ('Criada a',REDE)
# NOTE(review): the log directory is an absolute, user-specific path.
model = tflearn.DNN(net, tensorboard_verbose=3, tensorboard_dir='/home/03662232677/NLP/RFB/tflearn_logs_'+str(DATASET)+'/'+str(DATASET)+'_n'+str(LIMITE)+'_st'+str(STEPS)+'_'+str(REDE)+'_b'+str(BATCH))

Criada a net1


## Treinamento da Rede Neural com os dados de treino

In [32]:
# Stop early once MIN_EPOCHS epochs have run and accuracy reaches MIN_ACCURACY
# (both come from the parameters cell instead of being hard-coded here)
early_stopping_cb = EarlyStoppingCallback(val_epoch_thresh=MIN_EPOCHS,val_acc_thresh=MIN_ACCURACY)
# Training: the callback signals early stopping by raising StopIteration,
# which is caught here so the notebook keeps running
try:
    model.fit(X_train, y_train, n_epoch = STEPS, batch_size = BATCH, show_metric = True, callbacks=early_stopping_cb)
except StopIteration:
    print("Caught callback exception. Returning control to user program.")

Training Step: 22789  | total loss: [1m[32m6.98164[0m[0m | time: 1.195s
| Adam | epoch: 215 | loss: 6.98164 - acc: 0.6468 -- iter: 1050/1056
Training Step: 22790  | total loss: [1m[32m6.74398[0m[0m | time: 1.206s
| Adam | epoch: 215 | loss: 6.74398 - acc: 0.6621 -- iter: 1056/1056
--
Epoch  215  with Accuracy  0.6621267199516296
Successfully left training! Final model accuracy: 0.6621267199516296
Caught callback exception. Returning control to user program.


## Calculando a acurácia com dados de teste

In [35]:
# Evaluate the model on the test set.
# A document counts as correct only when every rounded output matches its
# label vector exactly.
total = 0
acertos = 0
# Loop names are deliberately not `x`/`y`, so the global label list `y`
# created by the split cell is not overwritten.
for features, expected in zip(X_test, y_test):
    predicted = np.rint(model.predict([features]))[0]
    if predicted.astype(int).tolist() == expected:
        acertos += 1
    total += 1
print("Acurácia para %s dados de teste: %s" % (total, str(acertos/total)))

Acurácia para 521 dados de teste: 0.5911708253358925


## Calculando a acurácia com dados de treino

In [36]:
# Evaluate the model on the training set.
# A document counts as correct only when every rounded output matches its
# label vector exactly.
total_train = 0
acertos_train = 0
# Loop names are deliberately not `x`/`y`, so the global label list `y`
# created by the split cell is not overwritten.
for features, expected in zip(X_train, y_train):
    predicted = np.rint(model.predict([features]))[0]
    if predicted.astype(int).tolist() == expected:
        acertos_train += 1
    total_train += 1
print("Acurácia para %s dados de treino: %s" % (total_train, str(acertos_train/total_train)))

Acurácia para 1056 dados de treino: 0.8115530303030303


## Calculando a acurácia com todos os dados

In [27]:
# Exact-match accuracy over all documents, combining the counters produced by
# the test and training evaluation cells (both must have been run first)
total_all = total + total_train
acertos_all = acertos + acertos_train
# The message now says train + test: the figure is not training-only
print("Acurácia para %s dados (treino + teste): %s" % (total_all, str(acertos_all/total_all)))

Acurácia para 1577 dados de treino: 0.7577679137603044


## Realizando um teste de um documento

In [None]:
# Index of the test document to inspect
N = 1
# Rounded model output for that document, followed by its expected labels
sample_scores = model.predict([X_test[N]])
print(np.rint(sample_scores)[0])
print(y_test[N])

## Salvando um modelo treinado

In [None]:
# Save the trained model together with every artifact needed to reuse it
from shutil import copyfile

# The run parameters are encoded in the model name
model_name = 'model_'+str(DATASET)+'_n'+str(LIMITE)+'_st'+str(STEPS)+'_'+str(REDE)+'_b'+str(BATCH)
# NOTE(review): absolute, user-specific output location.
directory = '/home/03662232677/NLP/RFB/'+model_name
# makedirs(exist_ok=True) also creates missing parents and avoids the
# check-then-create race of os.path.exists + os.mkdir
os.makedirs(directory, exist_ok=True)
model.save(os.path.join(directory, model_name+'.tflearn'))
# Preprocessing outputs are copied unchanged from the working directory
for pickle_name in ('categories.pickle', 'docs.pickle'):
    copyfile(pickle_name, os.path.join(directory, pickle_name))
# Objects built in this notebook are pickled under <variable name>.pickle
objects_to_save = {
    'all_ngrams': all_ngrams,
    'training_test_backup': training_test_backup,
    'training_test': training_test,
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for object_name, object_value in objects_to_save.items():
    save_list_to_file(object_value, os.path.join(directory, object_name+'.pickle'))

In [None]:
# fim