In [None]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [1]:
import pandas as pd
import json
import numpy as np
import codecs
from nltk.tokenize import RegexpTokenizer
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Input
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.preprocessing import sequence
from keras.layers import Dropout
from keras.models import Model
from keras.models import load_model
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
def read_data():
    
    df_data_sentence = pd.read_table('/content/drive/My Drive/Sentiment Analysis/Data/dictionary.txt')
    df_data_sentence_processed = df_data_sentence['Phrase|Index'].str.split('|', expand=True)
    df_data_sentence_processed = df_data_sentence_processed.rename(columns={0: 'Phrase', 1: 'phrase_ids'})
    
    df_data_sentiment = pd.read_table('/content/drive/My Drive/Sentiment Analysis/Data/sentiment_labels.txt')
    df_data_sentiment_processed = df_data_sentiment['phrase ids|sentiment values'].str.split('|', expand=True)
    df_data_sentiment_processed = df_data_sentiment_processed.rename(columns={0: 'phrase_ids', 1: 'sentiment_values'})

    df_processed_all = df_data_sentence_processed.merge(df_data_sentiment_processed, how='inner', on='phrase_ids')
    return df_processed_all

def training_data_split(all_data, spitPercent):

    msk = np.random.rand(len(all_data)) < spitPercent
    train_only = all_data[msk]
    test_and_dev = all_data[~msk]


    msk_test = np.random.rand(len(test_and_dev)) <0.5
    test_only = test_and_dev[msk_test]
    dev_only = test_and_dev[~msk_test]

    dev_only.to_csv('dev.csv')
    test_only.to_csv('test.csv')
    train_only.to_csv('train.csv')

    return train_only, test_only, dev_only

def extract_constants(df):
    maxLen = -1
    for phrase in df['Phrase']:
        splitLine = phrase.split()
        maxLen = max(maxLen, len(splitLine))
    return len(df['Phrase']), maxLen


def load_embeddings(embedding_path, glove_len):
  """Loads embedings, returns weight matrix and dict from words to indices."""
  print('loading word embeddings from %s' % embedding_path)
  weight_vectors = [np.zeros((glove_len, ))]
  word_idx = {}
  with codecs.open(embedding_path, encoding='utf-8') as f:
    for line in f:
      word, vec = line.split(u' ', 1)
      word_idx[word.lower()] = len(weight_vectors)
      weight_vectors.append(np.array(vec.split(), dtype=np.float32))
  word_idx[u'-LRB-'] = word_idx.pop(u'(')
  word_idx[u'-RRB-'] = word_idx.pop(u')')
  weight_vectors.append(np.random.uniform(
      -0.05, 0.05, weight_vectors[0].shape).astype(np.float32))
  return np.asarray(np.stack(weight_vectors)), word_idx


def get_X(data, word_idx, max_seq_len):


    maxSeqLength = max_seq_len
    no_rows = len(data)
    ids = np.zeros((no_rows, maxSeqLength), dtype='int32')

    word_idx_lwr =  {k.lower(): v for k, v in word_idx.items()}
    idx = 0

    for index, row in data.iterrows():


        sentence = (row['Phrase'])
        #print (sentence)
        tokenizer = RegexpTokenizer(r'\w+')
        sentence_words = tokenizer.tokenize(sentence)
        i = 0
        for word in sentence_words:
            word_lwr = word.lower()
            try:
                ids[idx][i] =  word_idx_lwr[word_lwr]

            except Exception as e:
                ids[idx][i] = len(word_idx_lwr)+1
                # continue
            i = i + 1
        idx = idx + 1

    return ids

def get_Y(data):

    labels = data['sentiment_values']

    lables_float = labels.astype(float)

    cats = ['0','1','2','3','4','5','6','7','8','9']
    labels_mult = round((lables_float * 10)).astype(int)
    dummies = pd.get_dummies(labels_mult, prefix='', prefix_sep='')
    dummies = dummies.T.reindex(cats).T.fillna(0)
    labels_matrix = dummies.to_numpy()

    return labels_matrix

def pretrained_embedding_layer(emb_matrix):
    print(emb_matrix.shape)
    embedding_layer = Embedding(emb_matrix.shape[0],emb_matrix.shape[1], trainable=False)
    embedding_layer.build((None,))

    embedding_layer.set_weights([emb_matrix])
    return embedding_layer

def get_example_X(sentence, word_idx, max_seq_len):
    maxSeqLength = max_seq_len #Maximum length of sentence
    ids = np.zeros((1, maxSeqLength), dtype='int32')
    word_idx_lwr =  {k.lower(): v for k, v in word_idx.items()}

    tokenizer = RegexpTokenizer(r'\w+')
    sentence_words = tokenizer.tokenize(sentence)
    i = 0
    for word in sentence_words:
        word_lwr = word.lower()
        try:
            ids[0][i] =  word_idx_lwr[word_lwr]

        except Exception as e:
            ids[0][i] = len(word_idx_lwr)+1
            # continue
        i = i + 1

    return ids
def evaluate(sentence, model):
    X_example = get_example_X(sentence, word_idx, max_len)
    score = model.predict(X_example, batch_size=1, verbose=0)
    top_3_index = np.argsort(score)[0][-3:]
    top_3_scores = score[0][top_3_index]
    top_3_weights = top_3_scores/np.sum(top_3_scores)
    single_score_dot = np.round(np.dot(top_3_index, top_3_weights)/10, decimals = 2)
    print(single_score_dot)
    if(bool(round(single_score_dot))):
        print('Positive!')
    else:
        print('Negative!')
def getbool(Y):
  ans = []
  for y in Y:
    top_3_index = np.argsort(y)[-3:]
    top_3_scores = y[top_3_index]
    top_3_weights = top_3_scores/np.sum(top_3_scores)
    single_score_dot = np.round(np.dot(top_3_index, top_3_weights)/10, decimals = 2)
    if(bool(round(single_score_dot))):
        ans.append(1)
    else:
        ans.append(0)
  return np.asarray(ans)
np.random.seed(0)

In [3]:
print('Reading data...')
df = read_data()
df_train = pd.read_csv('/content/drive/My Drive/Sentiment Analysis/Data/train.csv', header = 0)
df_test = pd.read_csv('/content/drive/My Drive/Sentiment Analysis/Data/test.csv', header = 0)
df_dev = pd.read_csv('/content/drive/My Drive/Sentiment Analysis/Data/dev.csv', header = 0)
# print(df_train, df_test, df_dev)
m, max_len = extract_constants(df)
# print(m, max_len)
emb_matrix, word_idx = load_embeddings('/content/drive/My Drive/Sentiment Analysis/Data/glove_6B_100d.txt', 100)
X_train = get_X(df_train, word_idx, max_len)
X_test = get_X(df_test, word_idx, max_len)
X_dev = get_X(df_dev, word_idx, max_len)
# print(X_train)
# print(X_test)
# print(X_dev)
Y_train = get_Y(df_train)
Y_test = get_Y(df_test)
Y_dev = get_Y(df_dev)

# print(emb_matrix[400001])
# print(df_train, X_train, Y_train)

Reading data...
loading word embeddings from /content/drive/My Drive/Sentiment Analysis/Data/glove_6B_100d.txt


In [None]:
print(max_len)

56


In [5]:
path = '/content/drive/My Drive/Sentiment Analysis/Data/'
np.save(path+'X_train.npy', X_train)
np.save(path+'X_test.npy', X_test)
np.save(path+'X_dev.npy', X_dev)
np.save(path+'Y_train.npy', Y_train)
np.save(path+'Y_test.npy', Y_test)
np.save(path+'Y_dev.npy', Y_dev)

In [None]:
emb_layer = pretrained_embedding_layer(emb_matrix)
print(emb_matrix, emb_layer.get_weights()[0])

(400002, 100)
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.038194   -0.24487001  0.72812003 ... -0.1459      0.82779998
   0.27061999]
 [-0.10767     0.11053     0.59811997 ... -0.83155     0.45293
   0.082577  ]
 ...
 [-0.10461    -0.50470001 -0.49331    ...  0.42526999 -0.51249999
  -0.17054   ]
 [ 0.28365001 -0.62629998 -0.44351    ...  0.43678001 -0.82607001
  -0.15701   ]
 [ 0.00488135  0.02151894  0.01027634 ... -0.04798925  0.032894
  -0.04953045]] [[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.038194   -0.24487     0.72812    ... -0.1459      0.8278
   0.27062   ]
 [-0.10767     0.11053     0.59812    ... -0.83155     0.45293
   0.082577  ]
 ...
 [-0.10461    -0.5047     -0.49331    ...  0.42527    -0.5125
  -0.17054   ]
 [ 0.28365    -0.6263     -0.44351    ...  0.43678    -0.82607
  -0.15701   ]
 [ 0.00488135  0.02151894  0.01027634 ... -0.04798925  0.032894
  -0.04953045]]


In [None]:
def make_model(input_shape, emb_matrix):
    
    phrase_indices = Input(shape=input_shape, dtype = 'int32')
    
    emb_layer = pretrained_embedding_layer(emb_matrix)
    
    embeddings = emb_layer(phrase_indices)   
    
    X = Bidirectional(LSTM(128, return_sequences=True))(embeddings)
    # X = Dropout(0.50)(X)
    X = Bidirectional(LSTM(128))(X)
    X = Dense(1024, activation='relu')(X)
    # X = Dropout(0.50)(X)
    X = Dense(10, activation='softmax')(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs=phrase_indices, outputs=X)
    
    ### END CODE HERE ###
    # model 8
    return model

In [None]:
model = make_model((max_len,), emb_matrix)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

(400002, 100)
Model: "model_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_25 (InputLayer)        (None, 56)                0         
_________________________________________________________________
embedding_26 (Embedding)     (None, 56, 100)           40000200  
_________________________________________________________________
bidirectional_46 (Bidirectio (None, 56, 256)           234496    
_________________________________________________________________
bidirectional_47 (Bidirectio (None, 256)               394240    
_________________________________________________________________
dense_49 (Dense)             (None, 1024)              263168    
_________________________________________________________________
dense_50 (Dense)             (None, 10)                10250     
Total params: 40,902,354
Trainable params: 902,154
Non-trainable params: 40,000,200
__________________________

In [None]:
checkpoint_path = "/content/drive/My Drive/Sentiment Analysis/Data/cpnew8.ckpt"
cp_callback = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_weights_only=False)
model.fit(X_train, Y_train, epochs = 9, batch_size = 3000, shuffle=True, callbacks=[cp_callback])
model.save_weights("/content/drive/My Drive/Sentiment Analysis/Data/model8.h5")

Epoch 1/9

Epoch 00001: saving model to /content/drive/My Drive/Sentiment Analysis/Data/cpnew8.ckpt
Epoch 2/9

Epoch 00002: saving model to /content/drive/My Drive/Sentiment Analysis/Data/cpnew8.ckpt
Epoch 3/9

KeyboardInterrupt: ignored

In [None]:
weight_path = '/content/drive/My Drive/Sentiment Analysis/Data/cpnew8.ckpt'
loaded_model = make_model((max_len,), emb_matrix)
loaded_model.load_weights(weight_path)
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# loss_dev, acc_dev = loaded_model.evaluate(X_dev, Y_dev)
# print(loss_dev, acc_dev)

(400002, 100)


In [None]:
# loss_train, acc_train = loaded_model.evaluate(X_train, Y_train)
# print(loss_train, acc_train)

1.434077559624677 0.43081966042518616


In [None]:
# loss_test, acc_test = loaded_model.evaluate(X_test, Y_test)
# print(loss_test, acc_test)

1.474054392238473 0.41995498538017273


In [None]:
# Low accuracy on classification on 10 classes but high on binary positive/negative classification.
# No need for regularization as accuracy on test set is equal to that of the train.
# Model can add more LSTM layers to better fit the training set.
sentence = 'nice'
evaluate(sentence, loaded_model)

0.67
Positive!


In [None]:
# p1 = loaded_model.predict(X_train)
# np.save('/content/drive/My Drive/Sentiment Analysis/Data/p1.npy', p1)
# p2 = loaded_model.predict(X_test)
# np.save('/content/drive/My Drive/Sentiment Analysis/Data/p2.npy', p2)
p3 = loaded_model.predict(X_dev)
np.save('/content/drive/My Drive/Sentiment Analysis/Data/p3.npy', p3)

In [None]:
# p1 = np.load('/content/drive/My Drive/Sentiment Analysis/Data/p1.npy')
# p1 = getbool(p1)
# Yp1 = getbool(Y_train)
# acc_train_bool = np.sum((p1 == Yp1))/p1.shape[0]
# p2 = np.load('/content/drive/My Drive/Sentiment Analysis/Data/p2.npy')
# p2 = getbool(p2)
# Yp2 = getbool(Y_test)
# acc_test_bool = np.sum((p2 == Yp2))/p2.shape[0]
p3 = np.load('/content/drive/My Drive/Sentiment Analysis/Data/p3.npy')
p3 = getbool(p3)
Yp3 = getbool(Y_dev)
acc_dev_bool = np.sum((p3 == Yp3))/p3.shape[0]
# print(acc_train_bool, acc_test_bool, acc_dev_bool)
print(acc_dev_bool)



0.686338251229474
