In [2]:
import numpy as np
import pandas as pd
import datetime
from matplotlib import pyplot as plt
%matplotlib inline 
from gensim.models import word2vec

Using TensorFlow backend.


In [3]:

# Load the word2vec model that was previously saved to ./w2v_model
w2v_model = word2vec.Word2Vec.load('./w2v_model')

In [4]:
# Read the line-delimited JSON reviews, keep a reproducible 30% sample,
# and retain only the review text and its star rating.
data = (
    pd.read_json('./chinese_restaurants_review.json', lines=True)
    .sample(frac=0.3, random_state=1024)
    [['text', 'stars']]
)

In [5]:
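# Tokenize each review, drop English stop words and strip punctuation.
# NOTE: `ps` (PorterStemmer) is instantiated below but never applied, so no stemming happens.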
import nltk
import string
import time
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
ps = PorterStemmer()
# Filled lazily on the first call so the stop-word set is built once,
# not once per review.
_STOP_WORDS_CACHE = {}
_PUNCTUATION = frozenset(string.punctuation)


def _english_stop_words():
    """Return the English stop-word set, building it on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def token(x):
    """Split a review into lower-cased tokens without stop words or punctuation.

    Parameters
    ----------
    x : text of one review.

    Returns
    -------
    list
        The tokens in their original order. Stop words are removed first,
        then every character found in ``string.punctuation`` is stripped
        from each remaining token; tokens that become empty are dropped.
    """
    words = nltk.word_tokenize(x.lower())

    # remove stop words
    stop_words = _english_stop_words()
    kept = [word for word in words if word not in stop_words]

    # Strip punctuation character by character. The previous
    # ``encode('utf-8').translate(None, string.punctuation)`` form only works
    # on Python 2; filtering characters behaves the same for the ASCII
    # punctuation set and also runs on Python 3.
    stripped = [''.join(ch for ch in word if ch not in _PUNCTUATION) for word in kept]

    # after removing punctuation, some tokens may have become ''
    return [word for word in stripped if word != '']

# Tokenize every review text and report how long it took
begin = time.time()
data['token'] = data['text'].apply(token)
print ('Total time spent:', time.time() - begin)

('Total time spent:', 86.55466985702515)


In [6]:
# Vocabulary of the loaded word2vec model, stored as a set so that
# getEmbedding can test word membership quickly
word_set = set(w2v_model.wv.index2word)

In [7]:
# Length of each word vector.
# NOTE(review): not referenced again in the visible cells, which use
# feature_size or a literal 300 instead -- confirm before relying on it.
num_features = 300

In [8]:
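# Transform each in-vocabulary word of a review into its feature vector (300,) from w2v_model.
# Only the first 40 such words are kept; a review with fewer than 40 is padded with np.zeros(300).
# So in embed_token, every review is a flat list of exactly 40 * 300 values
# (or an empty list when none of its words are in the model vocabulary).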



def getEmbedding(token, max_words=40, feature_size=300):
    """Turn a token list into one flat, fixed-length list of embedding values.

    Parameters
    ----------
    token : iterable of words belonging to one review.
    max_words : int, optional
        Maximum number of in-vocabulary words to embed; later words are
        ignored. Defaults to 40, the value previously hard-coded here.
    feature_size : int, optional
        Length of one word vector, used to size the zero padding.
        Defaults to 300, the value previously hard-coded here.

    Returns
    -------
    list
        ``max_words * feature_size`` values: the vectors of the first
        ``max_words`` words found in ``word_set``, concatenated in order and
        zero-padded at the end. An empty list is returned when no word of the
        review is in ``word_set``, so callers can detect and drop such reviews.
    """
    embedding_list = []
    n_found = 0

    for word in token:
        if n_found >= max_words:
            break
        if word in word_set:
            # go through .wv, consistent with how word_set is built
            embedding_list.extend(w2v_model.wv[word])
            n_found += 1

    # do nothing if the review has no words known to the w2v model;
    # otherwise pad the missing words with zeros in a single step
    if n_found > 0:
        embedding_list.extend(np.zeros((max_words - n_found) * feature_size))

    return embedding_list
# Embed every tokenized review and report how long it took
begin = time.time()
data['embed_token'] = data['token'].apply(getEmbedding)
print ('Total time spent:', time.time() - begin)

('Total time spent:', 223.88262915611267)


In [9]:
# Keep only the reviews whose embedding is non-empty, i.e. those with at
# least one word known to the w2v model

data = data.loc[data['embed_token'].map(len).ne(0)]



In [10]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [11]:
begin = time.time()
# Stack the per-review embedding lists into a single numpy array (one row per review)
embed_token = np.array(data['embed_token'].tolist())
print ('Total time spent:', time.time() - begin)

('Total time spent:', 424.09057903289795)


In [12]:

# Hold out 30% of the reviews for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    embed_token,
    data['stars'],
    test_size=0.3,
    random_state=1024,
)

In [13]:
# Hyper-parameters shared by the graph definition and the training loop

# -- input geometry --
text_size = 40        # number of word vectors per review
feature_size = 300    # length of each word vector

# -- network --
hidden_size = 100     # units in each LSTM cell
n_class = 5           # number of output classes

# -- training --
batch_size = 500      # reviews per training step
n_epoch = 10          # passes over the training set

In [14]:
# Graph inputs; the leading None dimension leaves the batch size open.
# x: one (text_size, feature_size) matrix of word vectors per review
# y: one one-hot encoded label row per review, n_class wide (was a literal 5)
x = tf.placeholder(tf.float64, [None, text_size, feature_size], name = 'x')
y = tf.placeholder(tf.float64, [None, n_class], name = 'y')

In [15]:
# Two stacked LSTM layers of hidden_size units each
lstm_cell_1, lstm_cell_2 = [tf.contrib.rnn.LSTMCell(hidden_size) for _ in range(2)]
multi_lstm_cells = tf.contrib.rnn.MultiRNNCell(
    [lstm_cell_1, lstm_cell_2],
    state_is_tuple=True,
)
# Run the stack over x; the per-step outputs are discarded and only the final state is kept
_, final_state = tf.nn.dynamic_rnn(cell=multi_lstm_cells, inputs=x, dtype=tf.float64)

In [16]:
def final_layer(input_, output_size):
    """Apply a dense layer ``input_ @ weight + bias`` with fresh variables.

    The weight has shape ``[last dimension of input_, output_size]`` and the
    bias has shape ``[output_size]``; both are float64 variables initialised
    from ``tf.random_normal``. Every call creates a new pair of variables.
    """
    in_features = input_.get_shape().as_list()[-1]

    weight = tf.Variable(tf.random_normal([in_features, output_size], dtype=tf.float64))
    bias = tf.Variable(tf.random_normal([output_size], dtype=tf.float64))

    return tf.matmul(input_, weight) + bias


In [17]:
def _stars_to_one_hot(stars, depth):
    """One-hot encode 1-based star ratings into a (len(stars), depth) float array.

    A rating outside 1..depth produces an all-zero row, which is what
    ``tf.one_hot`` does for out-of-range indices.
    """
    idx = np.asarray(stars).astype(np.int64) - 1
    encoded = np.zeros((len(idx), depth), dtype=np.float64)
    in_range = (idx >= 0) & (idx < depth)
    encoded[np.nonzero(in_range)[0], idx[in_range]] = 1.0
    return encoded


def RNN_model(x, y, n_epoch):
    """Train the LSTM classifier and print loss and test accuracy per epoch.

    A dense layer is attached to the last entry of the last element of
    ``final_state``. It is trained with Adam on the summed softmax
    cross-entropy, and the accuracy on ``X_test`` / ``y_test`` is printed
    after every epoch.

    Parameters
    ----------
    x : placeholder fed with batches shaped (-1, text_size, feature_size).
    y : placeholder fed with one-hot labels shaped (-1, n_class).
    n_epoch : int
        Number of passes over ``X_train``.

    Returns
    -------
    None. Results are only printed.

    Notes
    -----
    All graph nodes are created once, before training starts. Labels are
    one-hot encoded with numpy rather than ``tf.one_hot``, so the graph no
    longer grows with every batch and no extra ``sess.run`` is needed per
    step. The session is closed when training finishes.
    """
    sentiment = final_layer(final_state[-1][-1], n_class)
    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=sentiment, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)

    # evaluation ops, built once instead of once per epoch
    correct = tf.equal(tf.argmax(sentiment, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

    # the test inputs do not change between epochs, so prepare them once
    test = X_test.reshape((-1, text_size, feature_size))
    one_hot_y_test = _stars_to_one_hot(y_test, n_class)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(n_epoch):
            epoch_loss = 0

            # cut into batches; the last one may be shorter than batch_size
            for start in range(0, len(X_train), batch_size):
                end = start + batch_size
                batch_X = X_train[start: end].reshape((-1, text_size, feature_size))

                # the original y holds single numeric labels (stars), so one-hot encode them
                batch_y = _stars_to_one_hot(y_train[start: end], n_class)

                _, c = sess.run([optimizer, cost], feed_dict={x: batch_X, y: batch_y})
                epoch_loss += c

            print  ('Epoch:', epoch, 'loss:', epoch_loss)
            print ('Accuracy', sess.run(accuracy, feed_dict={x: test, y: one_hot_y_test}))

In [18]:
# Train and evaluate the model, then report the wall-clock time of the whole run
begin = time.time()
RNN_model(x, y, n_epoch=n_epoch)
print ('Total time spent:', time.time() - begin)

('Epoch:', 0, 'loss:', 40073.990313059454)
('Accuracy', 0.51188368)
('Epoch:', 1, 'loss:', 33765.784477474517)
('Accuracy', 0.52699536)
('Epoch:', 2, 'loss:', 31930.142855675156)
('Accuracy', 0.5256474)
('Epoch:', 3, 'loss:', 30203.716844114289)
('Accuracy', 0.52174532)
('Epoch:', 4, 'loss:', 28244.021040716252)
('Accuracy', 0.5159986)
('Epoch:', 5, 'loss:', 26049.562982238203)
('Accuracy', 0.51486343)
('Epoch:', 6, 'loss:', 23737.24830912353)
('Accuracy', 0.50457609)
('Epoch:', 7, 'loss:', 22511.903482129273)
('Accuracy', 0.49400496)
('Epoch:', 8, 'loss:', 21920.191973164387)
('Accuracy', 0.50180918)
('Epoch:', 9, 'loss:', 19934.70865172286)
('Accuracy', 0.48612982)
('Total time spent:', 10150.259814977646)
