In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk, re, time
from nltk.corpus import stopwords
from bs4 import BeautifulSoup 
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

nltk.download('stopwords')

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /home/ideis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv( "data/testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv( "data/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )

print("Train: %d, Test: %d, Unlabeled train: %d \n"
      % (train["review"].size, test["review"].size, unlabeled_train["review"].size ))

Train: 25000, Test: 25000, Unlabeled train: 50000 



In [3]:
def review_to_wordlist(text, remove_stopwords=False):
    '''Clean the text, with the option to remove stopwords'''
    text = BeautifulSoup(text).get_text()
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = re.sub(r" +", " ", text)
    words = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    
    return(words)

In [4]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = [review_to_wordlist(s) for s in raw_sentences if len(raw_sentences) > 0]
    
    return sentences

sentences = []

for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)
    
for review in test["review"]:
    sentences += review_to_sentences(review, tokenizer)

In [7]:
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

model.init_sims(replace=True)

model_name = "word2vec_300features_40minwords_10context"
model.save(model_name)

Training model...


In [8]:
print(model.most_similar("awful"))
print()
print(model.most_similar("awesome"))

[('terrible', 0.7701743841171265), ('atrocious', 0.746411144733429), ('abysmal', 0.7401413917541504), ('dreadful', 0.7226380109786987), ('horrible', 0.7210754156112671), ('appalling', 0.6851282119750977), ('horrid', 0.6848453283309937), ('horrendous', 0.6756206154823303), ('lousy', 0.6487287282943726), ('bad', 0.5936079025268555)]

[('amazing', 0.7523432374000549), ('incredible', 0.6692808866500854), ('fantastic', 0.6562114953994751), ('excellent', 0.6531256437301636), ('outstanding', 0.6119468212127686), ('exceptional', 0.6035537719726562), ('great', 0.5909483432769775), ('cool', 0.5837295651435852), ('terrific', 0.5589351654052734), ('astounding', 0.5587907433509827)]


In [10]:
def make_features(words, model, num_features):
   
    features = np.zeros((num_features,),dtype="float32")
    
    nwords = 0.
    vocab = set(model.wv.vocab)

    for word in words:
        if word in vocab: 
            nwords = nwords + 1.
            features = np.add(features,model[word])

    features = np.divide(features,nwords)
    return features


def get_review_features(reviews, model, num_features):

    counter = 0
    review_features = np.zeros((len(reviews),num_features),dtype="float32")

    for review in reviews:
        review_features[counter] = make_features(review, model, num_features)
        counter = counter + 1
        
    return review_features


clean_train_reviews = [review_to_wordlist(review, remove_stopwords=True) for review in train['review']]
X_train = get_review_features(clean_train_reviews, model, num_features)


clean_test_reviews = [review_to_wordlist(review, remove_stopwords=True) for review in test['review']]
X_test = get_review_features(clean_test_reviews, model, num_features)

In [12]:
from keras.utils import to_categorical

def extract_sentiment(s):
    s = re.sub(r"[^0-9]", " ", s)
    n = int(s.split()[1])
    if n <= 5:
        return 0
    else:
        return 1

test['sentiment'] = test.apply(lambda x: extract_sentiment(x['id']), axis=1)

y_train = to_categorical(train['sentiment'], num_classes=2)
y_test = to_categorical(test['sentiment'], num_classes=2)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(25000, 300)
(25000, 2)
(25000, 300)
(25000, 2)


In [125]:
# Hyperparameters
learning_rate = 0.05
epochs = 50
display_epoch = False

# Network Parameters
n_hidden_1 = 128
n_hidden_2 = 64
n_hidden_3 = 32
num_input = X_train.shape[1]
num_classes = y_train.shape[1]

X = tf.placeholder(tf.float32)
Y = tf.placeholder(tf.float32)

In [126]:
# Store layers weight & bias
weights = {
    'w1': tf.Variable(tf.random_normal([num_input, n_hidden_1])),
    'w2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'w3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_hidden_3, num_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'b3': tf.Variable(tf.random_normal([n_hidden_3])),
    'out': tf.Variable(tf.random_normal([num_classes]))
}

In [127]:
def neural_net(X):
    h1 = tf.add(tf.matmul(X, weights['w1']), biases['b1'])
    a1 = tf.nn.relu(h1, name='a1')
#     a1 = tf.nn.dropout(a1, 0.2) 

    h2 = tf.add(tf.matmul(a1, weights['w2']), biases['b2'])
    a2 = tf.nn.relu(h2, name='a2')
#     a2 = tf.nn.dropout(a2, 0.2) 

    h3 = tf.add(tf.matmul(a2, weights['w3']), biases['b3'])
    a3 = tf.nn.relu(h3, name='a3')
    
    out_layer = tf.matmul(a3, weights['out']) + biases['out']
    return out_layer

In [128]:
# Construct model
logits = neural_net(X)

# Define loss and optimizer
loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluate model
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

In [129]:
with tf.Session() as sess:

    sess.run(init)
    
    total_accuracy = 0
    
    for step in range(1, epochs+1):
        # Run optimization(backprop)
        sess.run(train_op, feed_dict={X: X_train, Y: y_train})
        if display_epoch:
            # Calculate loss and accuracy
            loss, acc = sess.run([loss_op, accuracy], feed_dict={X: X_train,
                                                                 Y: y_train})
            print("Step " + str(step) + ", Loss= " + \
                  "{:.4f}".format(loss) + ", Accuracy= " + \
                  "{:.3f}".format(acc))

    print("Optimization Finished!")
    print()
    print("Testing Accuracy:", \
        sess.run(accuracy, feed_dict={X: X_test,
                                      Y: y_test}))

Optimization Finished!

Testing Accuracy: 0.85012


In [130]:
# Split a dataset into k folds
def cross_validation_split(train, y, folds):
    fold_size = int(train.shape[0] / folds)

    train_val_folds = list()
    train_y_folds = list()
    
    test_val_folds = list()
    test_y_folds = list()
    
    
    for k in range(1, folds+1):
        train_val_fold = np.concatenate((train[:fold_size*(k-1)], train[fold_size*k:]), axis=0)
        train_y_fold = np.concatenate((y[:fold_size*(k-1)], y[fold_size*k:]), axis=0)
        train_val_folds.append(train_val_fold)
        train_y_folds.append(train_y_fold)
        
        test_val_fold = train[((k-1)*fold_size):(k*fold_size)]
        test_y_fold = y[((k-1)*fold_size):(k*fold_size)]
        test_val_folds.append(test_val_fold)
        test_y_folds.append(test_y_fold)
        
    return (train_val_folds, train_y_folds, test_val_folds, test_y_folds)

folds = 5
train_val_folds, train_y_folds, test_val_folds, test_y_folds = cross_validation_split(X_train, y_train, folds)

In [131]:
with tf.Session() as sess:

    sess.run(init)
    
    total_accuracy = 0
    
    for k in range(folds):
        for step in range(1, epochs+1):
            sess.run(train_op, feed_dict={X: train_val_folds[k], Y: train_y_folds[k]})
        loss, acc = sess.run([loss_op, accuracy], feed_dict={X: test_val_folds[k],Y: test_y_folds[k]})
        total_accuracy += acc
        print("Fold " + str(k) + ", Loss= " + \
              "{:.4f}".format(loss) + ", Accuracy= " + \
              "{:.3f}".format(acc))

    print("Mean Accuracy after " + str(folds) +" folds: " + "{:.3f}".format(total_accuracy/folds))
    print("Testing Accuracy:", sess.run(accuracy, feed_dict={X: X_test, Y: y_test}))

Fold 0, Loss= 0.8905, Accuracy= 0.834
Fold 1, Loss= 0.7587, Accuracy= 0.818
Fold 2, Loss= 0.4992, Accuracy= 0.840
Fold 3, Loss= 0.5436, Accuracy= 0.802
Fold 4, Loss= 0.3430, Accuracy= 0.874
Mean Accuracy after 5 folds: 0.834
Testing Accuracy: 0.8694
