# Preprocesamiento de los datos
#### http://ai.stanford.edu/~amaas/data/sentiment/

In [18]:
import os
import pandas as pd
import re
import keras
import numpy as np

def get_data_txt(file_path):
    _file = open(file_path,'r')
    data = _file.read()
    symbols = re.compile(r'[!"#$%&\()*+,-./:;<=>?@\[\]\\^_`{|}~]')
    clean_data = symbols.sub('', data)
    return clean_data

In [19]:
def dir_to_lists(files_path,label):
    files = os.listdir(files_path)
    data = []
    for _file in files:
        data.append(get_data_txt(files_path+'/'+_file))
    labels = [label]*len(data)
    
    return (data,labels)

In [21]:
def data_to_csv(comments,labels,name):
    data = {'comments': comments, 'labels':labels}
    df = pd.DataFrame.from_dict(data)
    df.to_csv(name,sep=',', index=False)
    
    return df

neg = dir_to_lists('/home/josh/MEGA/U_S_VII/Ingenieria_del_conocimiento/Proyecto/dataset/aclImdb/train/neg',0)
pos = dir_to_lists('/home/josh/MEGA/U_S_VII/Ingenieria_del_conocimiento/Proyecto/dataset/aclImdb/train/pos',1)

data = pos[0]+neg[0]
labels = pos[1]+neg[1]

df = data_to_csv(data,labels,'./train2.csv')

In [65]:
word_index = keras.datasets.imdb.get_word_index()

def get_indices_from_review(review):
    regex = re.compile(r'[!"#$%&\()*+,-./:;<=>?@\[\]\\^_`{|}~]')
    s = regex.sub('', review)
    # 2 is "unknown"
    sequence = map(lambda word: word_index.get(word, 2) + 3, s.lower().split())
    sequence = map(lambda index: 2 if index >= 30000 else index, sequence)
    # 1 is "start of sequence"
    return [1] + list(sequence)

In [66]:
def vectorize_sequences(sequences, dim):
    vec = np.zeros(shape=(len(sequences), dim), dtype=np.float32)
    for i, seq in enumerate(sequences):
        vec[i, seq] = 1
    return vec
vectorize_sequences([[1,5,2],[7,1,2],[9,5,2]],dim=10)

array([[0., 1., 1., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 1., 0., 0., 0., 1.]], dtype=float32)

In [67]:
model = keras.Sequential([
  keras.layers.Dense(units=18, activation='relu', input_shape=(30000,)),
  keras.layers.Dense(units=16, activation='relu'),
  keras.layers.Dense(units=1, activation='sigmoid')
], name='comments_review')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 18)                540018    
_________________________________________________________________
dense_17 (Dense)             (None, 16)                304       
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 17        
Total params: 540,339
Trainable params: 540,339
Non-trainable params: 0
_________________________________________________________________


In [68]:
data = pd.read_csv('./train2.csv')
x = data['comments']
y = data['labels']

x_index = []
for i in x:
    x_index.append(get_indices_from_review(i))


In [70]:
x_train = vectorize_sequences(x_index,dim=30000)
y_train = np.expand_dims(np.asarray(y, dtype=np.float32), axis=-1)

array([[1.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [73]:
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          epochs=20, batch_size=32,
          callbacks=[keras.callbacks.TensorBoard(log_dir='logs_comments_review')])



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f240b489f28>

In [102]:
review = """the special effects was not incredible"""
review_vec = get_indices_from_review(review)
vec = vectorize_sequences([review_vec], dim=30000)
print(vec)
res = np.squeeze(model.predict(vec))
print('test: {:.4f}%'.format(res * 100))

[[0. 1. 0. ... 0. 0. 0.]]
test: 76.7952%


In [104]:
model = keras.Sequential([
  keras.layers.Dense(units=18, activation='relu', input_shape=(30000,)),
  keras.layers.Dropout(0.7),
  keras.layers.Dense(units=16, activation='relu'),
  keras.layers.Dropout(0.7),
  keras.layers.Dense(units=1, activation='sigmoid')
], name='with_dropout')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 18)                540018    
_________________________________________________________________
dropout_1 (Dropout)          (None, 18)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 16)                304       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 17        
Total params: 540,339
Trainable params: 540,339
Non-trainable params: 0
_________________________________________________________________


In [106]:
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          epochs=20, batch_size=512,
          callbacks=[keras.callbacks.TensorBoard(log_dir='logs_dropout')])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f262439e908>

In [112]:
review = """I do not like cartoons. When I was a child, I preferred reading and drawing to watching television, cartoons included–which makes the fact that I am a movie reviewer somewhat ironic. Anyways, cartoons always felt boring to me: unrealistic colorful characters talking and acting like real people looked too far-fetched to relax and enjoy. Be it the hysterical madness of old Looney Tunes, Disney’s cheesy fairy tales, or Pixar’s 3D family movies, to me it is all the same. I guess you can picture me as an arrogant, heartless cartoon hater. Which I am not, but whatever."""
review_vec = get_indices_from_review(review)
vec = vectorize_sequences([review_vec], dim=30000)
print(vec)
res = np.squeeze(model.predict(vec))
print('test: {:.4f}%'.format(res * 100))

[[0. 1. 1. ... 0. 0. 0.]]
test: 99.8589%


In [79]:
import keras 
num_words = 5000
(train_data, train_labels), (test_data, test_labels) = keras.datasets.imdb.load_data(num_words=num_words)

In [25]:
print(train_labels[0])
print(train_data[0])

1
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]


In [27]:
word_index = keras.datasets.imdb.get_word_index()

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
