In [1]:
import numpy as np
import codecs
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Input
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.preprocessing import sequence
from keras.layers import Dropout
from keras.models import Model
from keras.models import load_model
from keras.callbacks import ModelCheckpoint
from sklearn import svm
import pickle

Using TensorFlow backend.


In [2]:
# Base directory: every data, weights and prediction path in this notebook is
# this prefix plus a relative part.
path = '/content/drive/My Drive/Sentiment Analysis/'
def load_embeddings(embedding_path, glove_len):
  """Read a GloVe-style text embedding file into a matrix and a word lookup.

  Each non-blank line is expected to look like ``<token> <v1> <v2> ...`` with
  single-space separation between the token and the vector.

  Args:
    embedding_path: Path of the UTF-8 text file to read.
    glove_len: Number of components per vector; sizes the all-zero row 0.

  Returns:
    A ``(matrix, word_idx)`` tuple. ``matrix`` has row 0 all zeros, rows 1..N
    in file order, and a final extra row of random values between -0.05 and
    0.05. ``word_idx`` maps each lower-cased token to its row number. The
    tokens ``(`` and ``)``, when present, are re-keyed as ``-LRB-`` and
    ``-RRB-``.

  Raises:
    ValueError: from ``np.stack`` when a vector's length differs from
      ``glove_len``, or from the unpack when a line has no space at all.
  """
  weight_vectors = [np.zeros((glove_len, ))]
  word_idx = {}
  with codecs.open(embedding_path, encoding='utf-8') as f:
    for line in f:
      # A blank (e.g. trailing) line carries no token; skip it instead of
      # failing on the two-way unpack below.
      if not line.strip():
        continue
      word, vec = line.split(u' ', 1)
      word_idx[word.lower()] = len(weight_vectors)
      weight_vectors.append(np.array(vec.split(), dtype=np.float32))
  # Re-key the bracket tokens, but only when the file actually contains them,
  # so smaller or filtered embedding files do not raise KeyError.
  for bracket, alias in ((u'(', u'-LRB-'), (u')', u'-RRB-')):
    if bracket in word_idx:
      word_idx[alias] = word_idx.pop(bracket)
  # Extra trailing row of small random values; no word in word_idx points at it.
  weight_vectors.append(np.random.uniform(
      -0.05, 0.05, weight_vectors[0].shape).astype(np.float32))
  return np.stack(weight_vectors), word_idx

# Build the embedding matrix and the word -> row lookup from the GloVe text
# file, passing 100 as the vector length.
emb_matrix, word_idx = load_embeddings(path+'Data/glove_6B_100d.txt', 100)
# Sequence length of the model input; passed as the Input shape when the
# model is built below.
max_len = 56

In [3]:
# Load the pre-split input (X_*) and label (Y_*) arrays saved as .npy files.
X_train = np.load(path+'Data/X_train.npy')
X_test = np.load(path+'Data/X_test.npy')
X_dev = np.load(path+'Data/X_dev.npy')
Y_train = np.load(path+'Data/Y_train.npy')
Y_test = np.load(path+'Data/Y_test.npy')
Y_dev = np.load(path+'Data/Y_dev.npy')
# Append the test split to the training split, for inputs and labels alike.
# NOTE(review): a later cell scores predictions on X_test, i.e. on rows that
# are part of the training arrays from here on -- confirm this is intended.
# NOTE(review): X_dev / Y_dev are loaded but not referenced again in the
# cells visible here.
X_train = np.concatenate((X_train, X_test), axis=0)
Y_train = np.concatenate((Y_train, Y_test), axis=0)
print(Y_train.shape)

(179247, 10)


In [4]:
# Sanity check: shape of the merged training inputs.
print(X_train.shape)

(179247, 56)


In [5]:
def pretrained_embedding_layer(emb_matrix):
    """Return a non-trainable Keras Embedding layer loaded with `emb_matrix`.

    The layer is sized from the matrix (rows = number of entries, columns =
    embedding width), built so that its weight tensor exists, and then has
    that tensor overwritten with the given matrix.
    """
    n_rows = emb_matrix.shape[0]
    n_cols = emb_matrix.shape[1]
    frozen = Embedding(n_rows, n_cols, trainable=False)
    # The weights have to be created before they can be replaced.
    frozen.build((None,))
    frozen.set_weights([emb_matrix])
    return frozen

def make_model(input_shape, emb_matrix):
    """Assemble the classifier: embeddings -> two stacked BiLSTMs -> dense head.

    Args:
        input_shape: Shape tuple of one input sample of int32 indices.
        emb_matrix: Matrix handed to `pretrained_embedding_layer`.

    Returns:
        An uncompiled Keras `Model` with a 10-way softmax output.
    """
    phrase_indices = Input(shape=input_shape, dtype='int32')
    embedded = pretrained_embedding_layer(emb_matrix)(phrase_indices)
    # First recurrent layer keeps the full sequence so the second can consume it.
    sequence_features = Bidirectional(LSTM(128, return_sequences=True))(embedded)
    phrase_features = Bidirectional(LSTM(128))(sequence_features)
    hidden = Dense(512, activation='relu')(phrase_features)
    class_probs = Dense(10, activation='softmax')(hidden)
    return Model(inputs=phrase_indices, outputs=class_probs)

# Weights file to restore into the network.
weight_path = path+'Data/model11.h5'
# Build the architecture for sequences of length max_len.
model = make_model((max_len,), emb_matrix)
# Load the saved weights into the freshly built layers.
model.load_weights(weight_path)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [6]:
# Run the network over the training inputs and store its per-class outputs
# on disk; these outputs are the features the SVMs below are fitted on.
p_train = model.predict(X_train)
np.save(path+'Predictions/model11train.npy', p_train)

In [7]:
# Sanity check: one row of 10 class scores per training row.
print(p_train.shape)

(179247, 10)


In [8]:
# Collapse the 10-column label rows to a single integer class id per row,
# which is the label form the SVM cells below are fitted on.
# Guarded on ndim so that re-running this cell on the already-collapsed 1-D
# array is a no-op instead of an "axis 1 is out of bounds" error.
if Y_train.ndim > 1:
  Y_train = np.argmax(Y_train, axis=1)
print(Y_train.shape)

(179247,)


In [None]:
# Fit a default-parameter SVM on the network's class scores (fit() returns the
# estimator itself), then pickle the fitted classifier.
clf = svm.SVC().fit(p_train, Y_train)
with open(path+'Predictions/svm9train.pickle', 'wb') as f:
  pickle.dump(clf, f)

In [None]:
# SVM class predictions for the same rows it was fitted on, saved to disk.
# NOTE(review): the file name says "model9" while the features come from the
# network restored from model11.h5 -- confirm the naming is intended.
pred = clf.predict(p_train)
np.save(path+'Predictions/model9svmtrain.npy', pred)

In [9]:
# Binarise at class 5: True for class ids 6-9, False for class ids 0-5.
Y_trainbin = np.greater(Y_train, 5)
# predbin = (pred>5)
# Same threshold applied to the network's most likely class per row.
p_trainbin = np.greater(p_train.argmax(axis=1), 5)

In [10]:
# Binary accuracy of the network on the training rows: the mean of a boolean
# match array is the fraction of matches.
print(np.mean(p_trainbin == Y_trainbin))

0.8278520700485922


In [None]:
# Network outputs for the test inputs, saved to disk, then classified by the
# multi-class SVM fitted in an earlier cell.
# NOTE(review): the file name says "model6" while the weights restored above
# are model11.h5 -- confirm the naming is intended.
p_test = model.predict(X_test)
np.save(path+'Predictions/model6test.npy', p_test)
pred_test = clf.predict(p_test)

In [None]:
# Collapse the 10-column test label rows to integer class ids. Guarded on ndim
# so re-running the cell on the already-collapsed array does not raise.
if Y_test.ndim > 1:
  Y_test = np.argmax(Y_test, axis=1)
# Binarise labels and SVM predictions at class 5 (True for class ids above 5).
Y_testbin = (Y_test>5)
pred_testbin = (pred_test>5)

In [None]:
# Sanity check: number of binarised SVM predictions for the test split.
print(pred_testbin.shape)

(59661,)


In [None]:
# Binary accuracy of the binarised multi-class SVM predictions on the test split.
print(np.mean(pred_testbin == Y_testbin))

0.7892593151304873


In [None]:
# Persist the multi-class SVM predictions for the training and test inputs.
np.save(path+'Predictions/pred6train.npy', pred)
np.save(path+'Predictions/pred6test.npy', pred_test)

In [None]:
# Second SVM, fitted directly on the binarised labels instead of the 10 class
# ids (fit() returns the estimator itself), then pickled.
clfbin = svm.SVC().fit(p_train, Y_trainbin)
with open(path+'Predictions/svm6trainbin.pickle', 'wb') as f:
  pickle.dump(clfbin, f)

In [None]:
# Binary SVM predictions for the rows it was fitted on.
pred_binclass = clfbin.predict(p_train)

In [None]:
# Save the binary SVM's predictions for the training rows.
np.save(path+'Predictions/pred6trainbin.npy', pred_binclass)

In [None]:
# Accuracy of the binary SVM on the training rows.
print(np.mean(pred_binclass == Y_trainbin))

0.8035472379709999


In [None]:
# Exact-match (10-class) accuracy of the multi-class SVM on the training rows.
print(np.mean(pred == Y_train))

0.43533523991102635


In [None]:
# Apply the binary SVM to the network outputs for the test split and save
# the result.
pred_testbinclass = clfbin.predict(p_test)
np.save(path+'Predictions/pred6testbin.npy', pred_testbinclass)

In [None]:
# Accuracy of the binary SVM on the test split.
print(np.mean(pred_testbinclass == Y_testbin))

0.7906840314443271


In [None]:
# Fraction of training rows whose SVM class id is within `tolerance` of the
# true class id; with tolerance 0 this is plain exact-match accuracy.
# Vectorised instead of a per-row Python loop, and it no longer rebinds the
# builtin `sum` for every cell that runs afterwards.
tolerance = 0
print(np.mean(np.abs(pred - Y_train) <= tolerance))

0.43533523991102635


In [None]:
# `cp_callback` was referenced here without ever being defined in this
# notebook, so the cell failed with NameError on a fresh kernel. Create the
# checkpoint callback explicitly. The fit call has no validation data, so the
# training loss is the quantity to monitor.
cp_callback = ModelCheckpoint(path+'Data/model9_checkpoint.h5', monitor='loss',
                              save_best_only=True, save_weights_only=True)
# The model is compiled with categorical_crossentropy over 10 softmax outputs,
# so it needs (N, 10) targets. An earlier cell collapses Y_train to 1-D class
# ids; expand those back to one row per class before training.
Y_train_onehot = np.eye(10, dtype=np.float32)[Y_train] if Y_train.ndim == 1 else Y_train
model.fit(X_train, Y_train_onehot, epochs = 20, batch_size = 3000, shuffle=True, callbacks=[cp_callback])
model.save_weights("/content/drive/My Drive/Sentiment Analysis/Data/model9.h5")