In [26]:
'''
train cnn mode for sentiment classification on yelp data set
author: hao peng
'''
import pandas as pd
import numpy as np
from Word2VecUtility import Word2VecUtility
from gensim.models import word2vec
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
# from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

data = pd.read_csv('review_sub_399850.tsv', header=0, delimiter="\t", quoting=3, encoding='utf-8')
model = word2vec.Word2Vec.load("300features_40minwords_10context")
print model.syn0.shape
print model["chinese"]
print model.doesnt_match("man woman child kitchen".split())
print model.doesnt_match("coffee tea juice restaurant".split())
print model.most_similar("delicious")
print model.most_similar("chinese")

# data embedding parameters
max_length = 100
# max_words = 5000
# max_words = model.syn0.shape[0]
num_features = 300

# model training parameters
batch_size = 32
# embedding_dims = 100
nb_filter = 250
filter_length = 3
hidden_dims = 250
nb_epoch = 2

# index trick parameters
index_from = 3
# padding = 0
start = 1
oov = 2

words_set = set(model.index2word)
word2index = { word : (i + index_from) for i,word in enumerate(words_set) }
index2word = { i : word for word, i in word2index.items() }
index2word[0] = '0'
index2word[1] = '1'
index2word[2] = '2'
# 'Word2Vec' object does not support item assignment
padding_model = {}
padding_model['0'] = np.random.standard_normal(num_features)
padding_model['1'] = np.random.standard_normal(num_features)
padding_model['2'] = np.random.standard_normal(num_features)


reviews_words = []
for review in data["text"]:
    review_words = Word2VecUtility.review_to_wordlist(review, remove_stopwords = True)
    # each word index increased with 3.
    review_words = [start] + [word2index[w] if (w in words_set) else oov for w in review_words]
#   review_words = [oov if (ix > (max_words + index_from)) else ix for ix in review_words]
    reviews_words.append(review_words)

# padding with 0, each review has max_length now.
reviews_words = sequence.pad_sequences(reviews_words, maxlen = max_length, padding='post', truncating='post')
print reviews_words.shape


(12597, 300)
[-0.09085934 -0.04050202 -0.07604051 -0.02878256 -0.03832901  0.04651292
 -0.08233552 -0.06436986 -0.006646    0.01952864 -0.10288478 -0.03497215
 -0.04002167 -0.0277764  -0.03175622  0.0141541   0.06412307  0.0514068
 -0.04425988 -0.01241343 -0.00785599 -0.0206115   0.03097875 -0.01636746
  0.12936752 -0.04187576 -0.04594978  0.0632828  -0.0185187  -0.03435634
  0.02050968 -0.00153008  0.04422459  0.08578489  0.0569248  -0.13749051
  0.07906641 -0.08986761 -0.06780145  0.03066873 -0.07235949  0.00491482
 -0.05130845 -0.03616726  0.02364809  0.00438806  0.03820136 -0.02138964
  0.01468734  0.0239164   0.06650317 -0.01117458  0.08711758  0.02350685
  0.00737275 -0.03050523  0.01972778 -0.00599776  0.00697179  0.03140137
  0.01172278 -0.00411805 -0.09804209 -0.06642748 -0.01673794  0.04739327
 -0.00381328 -0.10510307 -0.06244999 -0.03497938 -0.02515736 -0.05637315
 -0.03300777 -0.02991769 -0.00337767  0.01365327  0.03197937 -0.01513318
 -0.00577635 -0.00223164 -0.04746583  0

In [47]:
data_matrix = np.empty((reviews_words.shape[0], max_length, num_features))
print data_matrix.shape

(399850, 100, 300)


In [27]:
print reviews_words[:20, :12]

[[    1  2903  1184  4192  4272  8238  9911   619  2446  4470  6337  7137]
 [    1   159  8305  2446  8305  5323  6822  1591  6674  3217  1153  6124]
 [    1   438 10258  7152  4718  4679  4645  3813  2342 11460  4345  6058]
 [    1  4099  7972  9232 10792 10659  8279 10439 10850  7486  7972   755]
 [    1   509  4281  4501  6639   853   159   904  4281  4501  2446  2764]
 [    1 10456  8828  4035  5285   448 11832  4924  6689  5375  1066  5132]
 [    1  1066   438 10654  5091  3316  3956     2  4501  9584  8907  6337]
 [    1  2980   965  2342  9790  5604  5695  8763  1381  4603  9319  7693]
 [    1  4129  6424   159 10772  4807  2425  4501     2  5025  7805  9915]
 [    1  1623  4170  9815  3969  8831  5577  4158 10650 11240  4977  9254]
 [    1  9077  5323  1184   651  8336  2903  1255  6545  5323  6124  8694]
 [    1 11496  1042  5323  2901  2758   395 11005  7170  7362  3354  7192]
 [    1   280  9378  8828     2  1091  6500 10154  3771    12 11067  9312]
 [    1  8452  4967     0

In [43]:
# print ([index2word[ix] for ix in reviews_words[0]])
print np.array([model[index2word[ix]] if (index2word[ix] in model) else padding_model[index2word[ix]] for ix in reviews_words[0]])

[[-0.39257071  1.36350982  1.10130961 ..., -0.32868679 -0.10299574
  -0.98516773]
 [-0.07528156  0.0281019  -0.12271535 ...,  0.02202644 -0.05857712
   0.07863296]
 [-0.02730094  0.01866139 -0.09900852 ...,  0.04338717  0.1285717
   0.02086206]
 ..., 
 [ 1.44873275  1.5635236   3.13083889 ...,  0.04700205  0.65883317
  -1.29727728]
 [ 1.44873275  1.5635236   3.13083889 ...,  0.04700205  0.65883317
  -1.29727728]
 [ 1.44873275  1.5635236   3.13083889 ...,  0.04700205  0.65883317
  -1.29727728]]


In [None]:
data_matrix = np.empty((reviews_words.shape[0], max_length, num_features))
for i in xrange(0, reviews_words.shape[0]):
    data_matrix[i,:,:] = np.array([model[index2word[ix]] if (index2word[ix] in model) else padding_model[index2word[ix]] for ix in reviews_words[0]])

# del(reviews_words)

labels = data["stars"]
print labels[:10], labels.shape
labels[labels <= 3] = 0
labels[labels > 3] = 1
print labels[:10]
print (labels == 0).sum()

index = np.arange(data_matrix.shape[0])
train_index, valid_index = train_test_split(index, train_size = 0.8, random_state = 520)
del(index, data)

train_data = data_matrix[train_index]
valid_data = data_matrix[valid_index]
train_labels = labels[train_index]
valid_labels = labels[valid_index]

print train_data.shape
print valid_data.shape

In [None]:
print train_data.shape
print valid_data.shape

In [None]:
print "start training model..."

# model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
# model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
model.add(Dropout(0.25))

# we add a Convolution1D, which will learn nb_filter
# word group filters of size filter_length:

# filter_length is like filter size, subsample_length is like step in 2D CNN.
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1))
# we use standard max pooling (halving the output of the previous layer):
model.add(MaxPooling1D(pool_length=2))

# We flatten the output of the conv layer,
# so that we can add a vanilla dense layer:
model.add(Flatten())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.25))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              class_mode='binary')
model.fit(train_data, train_labels, batch_size=batch_size,
          nb_epoch=nb_epoch, show_accuracy=True,
          validation_data=(valid_data, valid_labels))
