In [22]:
import sys
sys.path.append('../src')
from logger import log, debug
from cleaner import clean_corpus_basic, clean_corpus_standford
from reader import read_files
from lstm_utils import get_tokenizer, get_best_tokens_dummy

from sklearn.utils import shuffle

from keras.layers import Dense, LSTM
from keras.models import Model, Sequential

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd



In [23]:
# -------------- Hyperparameters -----------------

# Tokenizer / model input settings
vector_size = 8       # passed to get_tokenizer; also the first dimension of the LSTM input_shape
each_q = 128          # second argument of get_best_tokens_dummy

# Training settings
batch_size = 2000     # used for both model.fit and model.evaluate
epoch = 256           # number of epochs passed to model.fit
train_to_test = 0.8   # fraction of the corpus that goes to the training split

# Word-count bounds for the corpus length filter (that filter is currently commented out).
# NOTE: `min_lenght` keeps its original spelling because other cells refer to it by that name.
min_lenght = 4
max_length = 8

In [24]:
# Read the "negative" and "positive" files from the dataset folder and run the basic cleaner
corpus = clean_corpus_basic(read_files("../dataset/", ["negative", "positive"]))

debug("[El corpus tiene " + str(len(corpus)) + " rows]")

# Length filter (disabled): while these two lines stay commented out, the
# "after filtering" count logged below equals the count logged above.
#corpus["length"] = corpus["content"].map(lambda x: len(x.split(" ")))
#corpus = corpus[(corpus["length"] >= min_lenght) & (corpus["length"] <= max_length)]

debug("[Luego de filtrar el corpus tiene " + str(len(corpus)) + " rows]")

# Shuffle the corpus with a fixed seed so that the train/test split taken from
# the row order (and therefore the reported metrics) is the same on every
# Restart & Run All. The index is rebuilt without mutating the frame in place.
shuffle_seed = 42
corpus = shuffle(corpus, random_state=shuffle_seed).reset_index(drop=True)



[Leyendo archivos en panda...]
[Archivos Leidos...]
[Usando cleaner basico]
[Limpiando el corpus...]
[Usando 8 threads ...]
[El corpus tiene 20000 rows]
[Luego de filtrar el corpus tiene 20000 rows]


In [28]:
# Select the best tokens
# (get_best_tokens_dummy comes from lstm_utils; its selection rule is not visible in this notebook)
best_tokens = get_best_tokens_dummy(corpus, each_q)
# Element count of best_tokens (presumably the number of selected tokens — confirm);
# reused below as the second dimension of the LSTM input_shape.
max_features = best_tokens.size

# Build the tokenizer callable from the selected tokens and the configured vector size
tokenizer = get_tokenizer(best_tokens, vector_size)

# Vectorize
debug("[Vectorizando corpus... ]")
# Stores the tokenizer output in a new 'vector' column of the existing frame
# (in-place column assignment: re-running this cell overwrites the column).
corpus['vector'] = tokenizer(corpus['content'])

# Preview the first rows, including the new 'vector' column
corpus.head()

[Vectorizando corpus... ]


Unnamed: 0,content,sentiment,raw,rate,vector
0,MAL MAL CALID NO PENS ERAN ASI,"[0, 1]",Malo Es muy mala calidad. No pensé que eran asi,negative,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,MAL VIEN CABL NO VOLVERI COMPR HABI C0MPRAD0 O...,"[0, 1]",malo Viene sin cable no la volveria a compran ...,negative,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,EXCELENT BUEN CUMPL ESPECT,"[1, 0]","Excelente Muy buena, cumple con las espectativas",positive,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,PORQU RELOJ VENDEDOR MAL ECHO ESTE FALL PERMAN...,"[0, 1]","Una porquería, el reloj y el vendedor. Es muy ...",negative,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,INRESPONS HIC PREGUNT CUAL ERA MATERIAL TAP AF...,"[0, 1]",Inresponsable Hice la pregunta de cual era el ...,negative,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [30]:



# Persist the selected tokens to "best_tokens.bin" through the object's to_pickle method.
# NOTE(review): presumably reloaded elsewhere to rebuild the tokenizer for the saved model — confirm.
best_tokens.to_pickle("best_tokens.bin")



In [31]:
# Split the corpus into train and test partitions

# Stack the per-row values of each column into a single array
x_total = np.stack(corpus["vector"].values)
y_total = np.stack(corpus['sentiment'].values)

# The leading `train_to_test` fraction of the rows is the training split, the
# remainder is the test split; the same cut point is applied to inputs and labels.
split_at = int(train_to_test * len(x_total))
x_train, x_test = x_total[:split_at], x_total[split_at:]
y_train, y_test = y_total[:split_at], y_total[split_at:]

log(f"[Son {len(x_train)} train sequences]")
log(f"[Son {len(x_test)} test sequences]")



[Son 16000 train sequences]
[Son 4000 test sequences]


In [32]:
# Compute the LSTM layer sizes that will be tried

# One candidate per scaling factor: train samples / (factor * (2 + max_features)).
# The constant 2 presumably matches the two output units of the model — confirm.
# Alternatives left by the author: factors range(2, 10 + 1), and an initial
# candidate int((2/3) * (2 + max_features)).
scaling_factors = [4]
lstm_neurons = [
    int(len(x_train) / (factor * (2 + max_features)))
    for factor in scaling_factors
]

# Remove duplicated sizes while keeping the original order
lstm_neurons = list(dict.fromkeys(lstm_neurons))

log(f"[Las neuronas van a ser: {lstm_neurons}]")

[Las neuronas van a ser: [21]]


In [33]:
# Train and evaluate one model per candidate LSTM size.
# Each entry appended to lstm_results is (neurons, score, acc, history).

lstm_results = []
for neurons in lstm_neurons:
    log("-----------------------")
    log("[Usando " + str(neurons) + " neuronas]")

    # Build the model: one LSTM layer followed by a 2-unit softmax classifier
    log("[Buildeando modelo... ]")

    model = Sequential()
    model.add(LSTM(neurons, dropout=0.2, input_shape=(vector_size, max_features)))
    model.add(Dense(2, activation='softmax'))
    # A softmax output trained against two-column labels (the 'sentiment' values
    # shown above are [1, 0] / [0, 1]) is paired with categorical cross-entropy.
    # For exactly two classes this yields the same loss value as the previous
    # 'binary_crossentropy', but it stays correct if more classes are added and
    # makes Keras resolve 'accuracy' to the categorical variant.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    log("[Modelo buildeado]")

    # Fit
    # NOTE: the test split is also passed as validation data, so the figures
    # reported by evaluate() below are on data already monitored during training.
    log("[Fiteando modelo... ]")
    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epoch,
                        validation_data=(x_test, y_test),
                        verbose=0)

    # Evaluate on the test split; evaluate() returns [loss, accuracy] for this compile setup
    log("[Testeando modelo... ]")
    score, acc = model.evaluate(x_test, y_test,
                                batch_size=batch_size,
                                verbose=0)

    log("[   Score: " + str(score))
    log("[   Accuaracy: " + str(acc))

    lstm_results.append((neurons, score, acc, history))

# Runs once after the loop, so only the model trained in the last iteration is saved.
model.save('model.h5')

-----------------------
[Usando 21 neuronas]
[Buildeando modelo... ]
[Modelo buildeado]
[Fiteando modelo... ]
[Testeando modelo... ]
[   Score: 0.15865857154130936
[   Accuaracy: 0.949999988079071
