In [None]:
"""
.. module:: SentimentTwAir

SentimentTwAir
**************

:Description: SentimentTwAir


:Version: 

:Created on: 07/09/2017 9:05 

"""

import os
import sys
import pandas
from sklearn.metrics import confusion_matrix, classification_report
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Embedding
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN
from tensorflow.keras.optimizers import RMSprop, SGD, Adam, Adamax
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils import np_utils
from collections import Counter
import argparse
import time
import sklearn

def tweet_to_words(raw_tweet):
    """
    Normalise a raw tweet into a lower-case, space-separated string of words.

    Every character that is not an ASCII letter or ``@`` is replaced by a
    space, the text is lower-cased and split on whitespace, and any token
    that begins with ``@`` (a mention) is dropped.

    :param raw_tweet: the original tweet text
    :return: the remaining tokens joined by single spaces
    """
    cleaned = re.sub("[^a-zA-Z@]", " ", raw_tweet).lower()
    kept_tokens = []
    for token in cleaned.split():
        # Tokens starting with '@' are mentions and carry no sentiment
        if re.match("^[@]", token):
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


# Run configuration. The parser is always handed an empty argument list below,
# so inside the notebook only the defaults take effect.
parser = argparse.ArgumentParser()
parser.add_argument('--verbose', help="Verbose output (enables Keras verbose output)", action='store_true', default=True)
# `--verbose` is a store_true flag whose default is already True, so on its own
# it can never switch verbosity off; `--quiet` writes False to the same
# destination and provides that missing switch.
parser.add_argument('--quiet', dest='verbose', help="Disable Keras verbose output", action='store_false')
args = parser.parse_args(args=[])

# Keras-style verbosity level: 1 = progress output, 0 = silent
verbose = 1 if args.verbose else 0
# Passed as `implementation=` to the recurrent layers when the model is built
impl = 2

print("Starting:", time.ctime())

In [None]:
# NOTE: this whole cell is disabled. Everything below is wrapped in a single
# triple-quoted string, so none of the statements execute.
# The text is the preprocessing pipeline: read the reviews file, clean the text
# with tweet_to_words, map the `numwords` most frequent words to integer ids
# starting at 1, left-pad every sequence with zeros, split 80/10/10 into
# train/validation/test and store the six arrays with np.save.
# Presumably this is what produced the .npy files that the next cell loads;
# remove the surrounding quotes to regenerate them (TODO confirm).
'''############################################
# Data

#    Tweet = pandas.read_csv("Airlines.csv")
#    Tweet = pandas.read_csv("Presidential.csv")
Tweet = pandas.read_csv("amazon_alexa.tsv", sep='\t')
#Tweet = pandas.read_csv("kindle_reviews.csv")

# Shuffle el dataset
Tweet = sklearn.utils.shuffle(Tweet)

# El numero de rows que queremos que tenga el dataset
#df_num_of_rows = int(Tweet.shape[0] / 320)
#df_num_of_rows = df_num_of_rows + df_num_of_rows

#Tweet = Tweet.iloc[:df_num_of_rows,:]

# Pre-process the tweet and store in a separate column
#Tweet['clean_tweet'] = Tweet['reviewText'].apply(lambda x: tweet_to_words(str(x)))
Tweet['clean_tweet'] = Tweet['verified_reviews'].apply(lambda x: tweet_to_words(str(x)))
# Convert sentiment to binary
#Tweet['sentiment'] = Tweet['overall'].apply(lambda x: x - 1)
Tweet['sentiment'] = Tweet['feedback'].apply(lambda x: x)

# Join all the words in review to build a corpus
all_text = ' '.join(Tweet['clean_tweet'])
words = all_text.split()

# Convert words to integers
counts = Counter(words)

numwords = 200  # Limit the number of words to use
vocab = sorted(counts, key=counts.get, reverse=True)[:numwords]
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

tweet_ints = []
for each in Tweet['clean_tweet']:
    tweet_ints.append([vocab_to_int[word] for word in each.split() if word in vocab_to_int])

# Create a list of labels
labels = np.array(Tweet['sentiment'])

# Find the number of tweets with zero length after the data pre-processing
tweet_len = Counter([len(x) for x in tweet_ints])
print("Zero-length reviews: {}".format(tweet_len[0]))
print("Maximum tweet length: {}".format(max(tweet_len)))

# Remove those tweets with zero length and its corresponding label
tweet_idx = [idx for idx, tweet in enumerate(tweet_ints) if len(tweet) > 0]
labels = labels[tweet_idx]
Tweet = Tweet.iloc[tweet_idx]
tweet_ints = [tweet for tweet in tweet_ints if len(tweet) > 0]

seq_len = max(tweet_len)
features = np.zeros((len(tweet_ints), seq_len), dtype=int)
for i, row in enumerate(tweet_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]

split_frac = 0.8
split_idx = int(len(features) * 0.8)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

test_idx = int(len(val_x) * 0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
        "\nValidation set: \t{}".format(val_x.shape),
        "\nTest set: \t\t{}".format(test_x.shape))

print("Train set: \t\t{}".format(train_y.shape),
        "\nValidation set: \t{}".format(val_y.shape),
        "\nTest set: \t\t{}".format(test_y.shape))

# Save dataset preprocessed
np.save("train_x", train_x)
np.save("train_y", train_y)
np.save("val_x", val_x)
np.save("val_y", val_y)
np.save("test_x", test_x)
np.save("test_y", test_y)'''

In [None]:
# Read the six preprocessed arrays (features and labels for each split) from
# .npy files in the current working directory.
train_x, train_y = np.load("train_x.npy"), np.load("train_y.npy")
val_x, val_y = np.load("val_x.npy"), np.load("val_y.npy")
test_x, test_y = np.load("test_x.npy"), np.load("test_y.npy")

print("\t\t\tFeature Shapes:")
# The same three-line shape report is printed twice: first for the feature
# arrays, then for the label arrays.
for train_part, val_part, test_part in ((train_x, val_x, test_x),
                                        (train_y, val_y, test_y)):
    print("Train set: \t\t{}".format(train_part.shape),
          "\nValidation set: \t{}".format(val_part.shape),
          "\nTest set: \t\t{}".format(test_part.shape))

In [None]:
# These values must also be changed in the preprocessing cell above if they are
# modified here.
numwords = 200  # Limit the number of words to use
seq_len = train_x.shape[1]
NUM_OF_CLASSES = 2

############################################
# Model
drop = 0.25
nlayers = 1  # >= 1
RNN = SimpleRNN

neurons = 64
embedding = 20

if nlayers < 1:
    raise ValueError("nlayers must be >= 1, got {}".format(nlayers))

model = Sequential()
# numwords + 1 rows: the disabled preprocessing cell numbers words from 1 and
# pads with 0, so index 0 needs its own embedding row.
model.add(Embedding(numwords + 1, embedding, input_length=seq_len))

# `implementation` is an LSTM/GRU option. SimpleRNN has no such parameter
# (tf.keras drops it with a deprecation warning), so it is only forwarded when
# the selected layer type is not SimpleRNN.
rnn_kwargs = {'recurrent_dropout': drop}
if RNN is not SimpleRNN:
    rnn_kwargs['implementation'] = impl

# Every recurrent layer except the last returns the full sequence so the next
# recurrent layer receives one input per time step; the last one returns only
# its final output, which feeds the classifier.
for layer_idx in range(nlayers):
    model.add(RNN(neurons, return_sequences=layer_idx < nlayers - 1, **rnn_kwargs))

model.add(Dense(NUM_OF_CLASSES))
model.add(Activation('softmax'))

############################################
# Training

learning_rate = 0.000001
#optimizer = SGD(lr=learning_rate, momentum=0.95)
optimizer = Adam(lr=learning_rate)
#optimizer = Adamax(lr=learning_rate)
#optimizer = RMSprop(lr=learning_rate)
# The metric is requested as 'acc' so that it is logged as `acc` / `val_acc`,
# the name the EarlyStopping callback below monitors.
# NOTE(review): with metrics=['accuracy'] newer tf.keras releases log
# `val_accuracy` instead, which leaves EarlyStopping without its metric so it
# never triggers -- confirm against the installed version.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

#print(model.summary())

#model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=tf.contrib.tpu.TPUDistributionStrategy(
#    tf.contrib.cluster_resolver.TPUClusterResolver(tpu='grpc://10.0.101.2:8470'))
#)

epochs = 500
batch_size = 32

# One-hot encode the integer labels to match the softmax output and the
# categorical cross-entropy loss. tf.keras.utils is used (rather than the
# standalone `keras` package) so the whole model relies on a single Keras
# implementation.
train_y_c = tf.keras.utils.to_categorical(train_y, NUM_OF_CLASSES)
val_y_c = tf.keras.utils.to_categorical(val_y, NUM_OF_CLASSES)

# Callbacks
callbacks_list = [
    # A fresh log directory per run, named after the current timestamp
    TensorBoard(log_dir="logs/{}".format(time.time())),
    # Stop once validation accuracy has not improved for 10 consecutive epochs
    EarlyStopping(monitor="val_acc", patience=10),
]

model.fit(train_x, train_y_c,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(val_x, val_y_c),
            verbose=verbose, callbacks=callbacks_list)

############################################
# Results

test_y_c = tf.keras.utils.to_categorical(test_y, NUM_OF_CLASSES)
# evaluate() returns the loss followed by the compiled metrics
score, acc = model.evaluate(test_x, test_y_c,
                                batch_size=batch_size,
                                verbose=verbose)
print()
print('Test ACC=', acc)

# The report below is disabled (kept inside a string literal).
# NOTE(review): `predict_classes` may not exist in recent tf.keras releases;
# np.argmax(model.predict(test_x), axis=1) is the usual replacement if this is
# re-enabled -- confirm against the installed version.
'''test_pred = model.predict_classes(test_x, verbose=verbose)

print()
print('Confusion Matrix')
print('-'*20)
print(confusion_matrix(test_y, test_pred))
print()
print('Classification Report')
print('-'*40)
print(classification_report(test_y, test_pred))
print()
print("Ending:", time.ctime())'''

In [None]:
# Blank separator line followed by the finishing timestamp
print("\nEnding:", time.ctime())

# Show the training labels that were loaded
print(train_y)

In [None]:
# Print every training label, one per line
for label in train_y:
    print(label)