# Preprocesamiento de los datos
#### http://ai.stanford.edu/~amaas/data/sentiment/

In [1]:
#Eliminacion todos los caracteres especiales o simbolos.
import os
import pandas as pd
import re
from tensorflow import keras
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')

def get_data_txt(file_path):
    _file = open(file_path,'r')
    data = _file.read()
    symbols = re.compile(r'[!"#$%&\()*+,-./:;<=>?@\[\]\\^_`{|}~]')
    clean_data = symbols.sub('', data)
    return clean_data

[nltk_data] Downloading package punkt to /home/jsanch90/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#Clase para graficar los datos de entrenamiento y validacion en una sola grafica en TensorBoard
import os
import tensorflow as tf
from keras.callbacks import TensorBoard

class TrainValTensorBoard(TensorBoard):
    def __init__(self, log_dir='./logs', **kwargs):
        # Make the original `TensorBoard` log to a subdirectory 'training'
        training_log_dir = os.path.join(log_dir, 'training')
        super(TrainValTensorBoard, self).__init__(training_log_dir, **kwargs)

        # Log the validation metrics to a separate subdirectory
        self.val_log_dir = os.path.join(log_dir, 'validation')

    def set_model(self, model):
        # Setup writer for validation metrics
        self.val_writer = tf.summary.FileWriter(self.val_log_dir)
        super(TrainValTensorBoard, self).set_model(model)

    def on_epoch_end(self, epoch, logs=None):
        # Pop the validation logs and handle them separately with
        # `self.val_writer`. Also rename the keys so that they can
        # be plotted on the same figure with the training metrics
        logs = logs or {}
        val_logs = {k.replace('val_', ''): v for k, v in logs.items() if k.startswith('val_')}
        for name, value in val_logs.items():
            summary = tf.Summary()
            summary_value = summary.value.add()
            summary_value.simple_value = value.item()
            summary_value.tag = name
            self.val_writer.add_summary(summary, epoch)
        self.val_writer.flush()

        # Pass the remaining logs to `TensorBoard.on_epoch_end`
        logs = {k: v for k, v in logs.items() if not k.startswith('val_')}
        super(TrainValTensorBoard, self).on_epoch_end(epoch, logs)

    def on_train_end(self, logs=None):
        super(TrainValTensorBoard, self).on_train_end(logs)
        self.val_writer.close()

Using TensorFlow backend.


In [None]:
#funcion para leer un directorio con varios archivos archivos de comentarios y retornar 2 listas, una con los datos y otra con las etiquetas
def dir_to_lists(files_path,label):
    files = os.listdir(files_path)
    data = []
    for _file in files:
        data.append(get_data_txt(files_path+'/'+_file))
    labels = [label]*len(data)
    
    return (data,labels)

In [None]:
#funcion para guardar todos los datos de los archivos txt separados en un unico CSV
def data_to_csv(comments,labels,name):
    data = {'comments': comments, 'labels':labels}
    df = pd.DataFrame.from_dict(data)
    df.to_csv(name,sep=',', index=False)
    
    return df

neg_train = dir_to_lists('/home/josh/MEGA/U_S_VII/Ingenieria_del_conocimiento/Proyecto/dataset/aclImdb/train/neg',0)
pos_train = dir_to_lists('/home/josh/MEGA/U_S_VII/Ingenieria_del_conocimiento/Proyecto/dataset/aclImdb/train/pos',1)
neg_test = dir_to_lists('/home/josh/MEGA/U_S_VII/Ingenieria_del_conocimiento/Proyecto/dataset/aclImdb/test/neg',0)
pos_test = dir_to_lists('/home/josh/MEGA/U_S_VII/Ingenieria_del_conocimiento/Proyecto/dataset/aclImdb/train/pos',1)


data = pos_train[0]+neg_train[0]+pos_test[0]+neg_test[0]
labels = pos_train[1]+neg_train[1]+pos_test[1]+neg_test[1]

df = data_to_csv(data,labels,'./data_complete.csv')

In [3]:
#se crea la funcion para crear el arreglo con los indices de las palabras que estan en el diccionario
word_index = keras.datasets.imdb.get_word_index()

def get_indices_from_review(review):
    regex = re.compile(r'[!"#$%&\()*+,-./:;<=>?@\[\]\\^_`{|}~]')
    s = regex.sub('', review)
    # 2 is "unknown"
    sequence = map(lambda word: word_index.get(word, 2) + 3, s.lower().split())
    sequence = map(lambda index: 2 if index >= 30000 else index, sequence)
    # 1 is "start of sequence"
    return [1] + list(sequence)

In [4]:
# el vector que retorna la funcion anterior lo pasamos a un vector de 0 y 1, cuyo tamaño es el numero de palabras que tenemos en el diccionario
def vectorize_sequences(sequences, dim):
    vec = np.zeros(shape=(len(sequences), dim), dtype=np.float32)
    for i, seq in enumerate(sequences):
        vec[i, seq] = 1
    return vec
#vectorize_sequences([[1,5,2],[7,1,2],[9,5,2]],dim=10)

In [5]:
# defnicion del modelo, se utilizaron 30000 palabras del diccionario, en este modelo no se utilizo dropout
model_no_drop = keras.Sequential([
  keras.layers.Dense(units=18, activation='relu', input_shape=(30000,)),
  keras.layers.Dense(units=16, activation='relu'),
  keras.layers.Dense(units=1, activation='sigmoid')
], name='comments_review')
model_no_drop.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 18)                540018    
_________________________________________________________________
dense_1 (Dense)              (None, 16)                304       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 540,339
Trainable params: 540,339
Non-trainable params: 0
_________________________________________________________________


In [6]:
# eliminacion de palabras de parada de un comentario
stop_words = ['a','about','above','after','again','all','an','and','any','as','at','be','because','been','before','being','below','between','both','but','by','down','during','each',
 'few','for','from','further','he','her','here','hers','herself','him','himself','his','how','if','in','into','it',"it's",'its','itself','just','ll','m','ma','me','my','myself',
 'now','o','of','on','once','only','or','other','our','ours','ourselves','out','over','own','re','s','same','she',"she's",'so','such','t','than','that',"that'll",'the','their','theirs',
 'them','themselves','then','there','these','they','this','those','through','to','too','under','until','up','ve','what','when','where','which','while','who','whom','why','will',
 'with','won','y','you',"you'd","you'll","you're","you've",'your','yours','yourself','yourselves']

def clean_data(str):
    wordsFiltered = []
    words = nltk.word_tokenize(str)
    for w in words:
        if w not in stop_words and len(w)!=1:
            wordsFiltered.append(w)
    return " ".join(wordsFiltered)

data_stop = pd.read_csv('./data_complete.csv')

data_stop['comments_clean'] = data_stop['comments'].apply(clean_data)

x_stop = data_stop['comments_clean']
y_stop = data_stop['labels']



x_index_stop = []
for i in x_stop:
    x_index_stop.append(get_indices_from_review(i))


In [7]:
# particionamiento de los datos que ni tienen palabras de parada
x_stop_ = vectorize_sequences(x_index_stop,dim=30000)
y_stop_ = np.expand_dims(np.asarray(y_stop, dtype=np.float32), axis=-1)
x_train_stop,x_val_test_stop,y_train_stop, y_val_test_stop = train_test_split(x_stop_,y_stop_,test_size=0.3,shuffle=True)
x_val_stop,x_test_stop,y_val_stop, y_test_stop = train_test_split(x_val_test_stop,y_val_test_stop,test_size=0.3,shuffle=True)

In [None]:
data = pd.read_csv('./data_complete.csv')
x = data['comments']
y = data['labels']


x_index = []
for i in x:
    x_index.append(get_indices_from_review(i))


In [None]:
# particinamiento de los datos con palabras de parada
x_ = vectorize_sequences(x_index,dim=30000)
y_ = np.expand_dims(np.asarray(y, dtype=np.float32), axis=-1)
x_train,x_val_test,y_train, y_val_test = train_test_split(x_,y_,test_size=0.3,shuffle=True)
x_val,x_test,y_val, y_test = train_test_split(x_val_test,y_val_test,test_size=0.3,shuffle=True)

In [None]:
# entrenamiento del modelo sin dropout
model_no_drop.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model_no_drop.fit(x_train, y_train,
          epochs=20, batch_size=32,
          validation_data=(x_val, y_val),
          callbacks=[keras.callbacks.TensorBoard(),TrainValTensorBoard(write_graph=False,log_dir='logs_comments_review_2')])

In [None]:
#Evaluacion del modelo con el conjunto de pruebas
test_loss, test_acc = model_no_drop.evaluate(x=x_test, y=y_test)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

In [None]:
# prueba del modelo con un comentario desconocido por el modelo
review = """Endgame consists almost entirely of the downtime scenes that were always secretly everyone's favorite parts of these movies anyway."""
review_vec = get_indices_from_review(review)
vec = vectorize_sequences([review_vec], dim=30000)
print(vec)
res = np.squeeze(model_no_drop.predict(vec))
print('test: {:.4f}%'.format(res * 100))

In [8]:
# definicion del modelo con dropout para evitar el sobreajuste
model = keras.Sequential([
  keras.layers.Dense(units=18, activation='relu', input_shape=(30000,)),
  keras.layers.Dropout(0.7),
  keras.layers.Dense(units=16, activation='relu'),
  keras.layers.Dropout(0.7),
  keras.layers.Dense(units=1, activation='sigmoid')
], name='with_dropout')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 18)                540018    
_________________________________________________________________
dropout (Dropout)            (None, 18)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                304       
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 540,339
Trainable params: 540,339
Non-trainable params: 0
_________________________________________________________________


In [9]:
# entrenamiento del modelo con dropout
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x_train_stop, y_train_stop,
          epochs=20, batch_size=512,
          validation_data=(x_val_stop, y_val_stop),
          callbacks=[keras.callbacks.TensorBoard(log_dir='logs_dropout'),TrainValTensorBoard(write_graph=False,log_dir='logs_dropout_2')])

Train on 35000 samples, validate on 10500 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f81b446ca58>

In [10]:
# evaluacion del modelo con el conjunto de pruebas
test_loss, test_acc = model.evaluate(x=x_test_stop, y=y_test_stop)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

Test loss: 0.274165578211
Test accuracy: 0.932000000106


In [18]:
# prueba del modelo con un comentario desconocido por el modelo
review = """Booksmart is, well, a smart comedy; it's rich with diversity and representation, new perspectives, up-and-coming actors getting their chance to break through into the mainstream, and toes the line between crude and moving"""
review_vec = get_indices_from_review(review)
vec = vectorize_sequences([review_vec], dim=30000)
print(vec)
res = np.squeeze(model.predict(vec))
print('test: {:.4f}%'.format(res * 100))

[[ 0.  1.  0. ...,  0.  0.  0.]]
test: 99.8395%


In [19]:
model.save('./comments_review.h5')