In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the data and
# weight files referenced later in the notebook can be read from there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import codecs
import math
import nltk
from nltk.tokenize import RegexpTokenizer
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Input
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.models import Model
from keras.models import load_model
nltk.download('punkt')
from nltk import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Base directory on the mounted Drive; the embedding, weight, input and output
# paths used below are all built by appending to this prefix.
path = '/content/drive/My Drive/Sentiment Analysis/'

# Load embeddings
def load_embeddings(embedding_path, glove_len):
  """Read a whitespace-separated embedding file into a matrix and a vocabulary.

  Each line of the file is expected to look like ``word v1 v2 ... vN``.

  Args:
    embedding_path: path of the UTF-8 encoded embedding text file.
    glove_len: length of the all-zero vector placed in row 0 (and of the
      random vector appended as the last row).

  Returns:
    A tuple ``(matrix, vocab)``:
      * ``matrix`` - 2-D array; row 0 is zeros, rows 1..N hold the vectors in
        file order, and the final row is drawn uniformly from [-0.05, 0.05).
      * ``vocab`` - dict mapping the lower-cased word to its row number.

  Raises:
    KeyError: if the file contains no ``(`` or no ``)`` entry.
    ValueError: if a line contains no space separator.
  """
  # Row 0 is reserved as an all-zero vector.
  rows = [np.zeros((glove_len, ))]
  vocab = {}

  with codecs.open(embedding_path, encoding='utf-8') as handle:
    # One row is appended per line, so the running line number (from 1)
    # is exactly the row the word's vector will occupy.
    for row_number, record in enumerate(handle, start=1):
      token, values = record.split(u' ', 1)
      vocab[token.lower()] = row_number
      rows.append(np.array(values.split(), dtype=np.float32))

  # Re-key the parenthesis entries under bracket-style token names
  # (presumably to match how the training corpus spells them - confirm).
  for bracket_name, symbol in ((u'-LRB-', u'('), (u'-RRB-', u')')):
    vocab[bracket_name] = vocab.pop(symbol)

  # Trailing row: small random vector, not referenced by any vocab entry.
  extra_row = np.random.uniform(-0.05, 0.05, rows[0].shape).astype(np.float32)
  rows.append(extra_row)

  return np.stack(rows), vocab

# Load the embedding matrix and word -> row vocabulary; 100 is the vector
# length passed to load_embeddings for the zero and random rows.
emb_matrix, word_idx = load_embeddings(path+'Data/glove_6B_100d.txt', 100)
# Number of token ids per model input; also the chunk size used by process().
max_len = 56

In [None]:
# convert list of words to their corresponding indices to feed to the network
def get_example_X(sentence_words, word_idx, max_seq_len):
    """Convert a list of words into one fixed-length row of vocabulary indices.

    Args:
        sentence_words: iterable of word strings.
        word_idx: dict mapping word -> integer index. Keys and words are
            compared case-insensitively.
        max_seq_len: number of columns in the returned array.

    Returns:
        An int32 array of shape ``(1, max_seq_len)``. Position ``i`` holds the
        index of the i-th word; words missing from the vocabulary get
        ``len(lower-cased vocabulary) + 1``. Unused trailing positions stay 0.
        Words beyond ``max_seq_len`` are ignored (previously they raised
        ``IndexError``).
    """
    # Lower-case the keys so lookups are case-insensitive on both sides.
    # NOTE(review): this copy is rebuilt on every call; consider hoisting it
    # to the caller if profiling shows it matters for large vocabularies.
    lowered_vocab = {key.lower(): idx for key, idx in word_idx.items()}

    # Index used for out-of-vocabulary words. With the vocabulary produced by
    # load_embeddings (distinct lower-cased keys) this is the trailing random
    # row of the embedding matrix.
    unknown_idx = len(lowered_vocab) + 1

    ids = np.zeros((1, max_seq_len), dtype='int32')
    # zip against range() bounds the fill to the row width, so over-long input
    # is truncated instead of writing past the end of the array.
    for pos, word in zip(range(max_seq_len), sentence_words):
        ids[0][pos] = lowered_vocab.get(word.lower(), unknown_idx)
    return ids

def pretrained_embedding_layer(emb_matrix):
    """Create a non-trainable Keras Embedding layer initialised from a matrix.

    Args:
        emb_matrix: 2-D array; its first dimension is used as the layer's
            input size and its second as the output vector size.

    Returns:
        The Embedding layer, already built and with ``emb_matrix`` set as its
        weights.
    """
    vocab_rows = emb_matrix.shape[0]
    vector_dim = emb_matrix.shape[1]

    frozen_embedding = Embedding(vocab_rows, vector_dim, trainable=False)
    # The layer must be built before weights can be assigned to it.
    frozen_embedding.build((None,))
    frozen_embedding.set_weights([emb_matrix])
    return frozen_embedding

# Pre processing of a sentence
def process(sentence):
  """Tokenise a sentence and split it into overlapping chunks of words.

  The sentence is tokenised with the regular expression ``\\w+`` (so
  punctuation is discarded). The tokens are then cut into windows of at most
  ``max_len`` words (module-level setting), each window starting half a
  window after the previous one, until the last window reaches the end of
  the sentence.

  The previous implementation clamped every window's end to ``max_len``
  instead of the sentence length and under-counted the windows, so every word
  past position ``max_len`` was silently dropped and duplicate/empty chunks
  were produced for long sentences.

  Args:
    sentence: the raw sentence text.

  Returns:
    A list of chunks, each a non-empty list of word strings no longer than
    ``max_len``. A sentence with no word tokens yields ``[]``; a sentence of
    at most ``max_len`` tokens yields a single chunk with all its tokens.
  """
  tokenizer = RegexpTokenizer(r'\w+')
  sentence_words = tokenizer.tokenize(sentence)
  if not sentence_words:
    return []

  # Consecutive windows overlap by half a window; guard against a zero stride
  # (max_len == 1) which would otherwise never advance.
  stride = max(1, max_len // 2)

  chunks = []
  start = 0
  while True:
    chunks.append(sentence_words[start:start + max_len])
    # Stop as soon as the current window reaches the last token.
    if start + max_len >= len(sentence_words):
      break
    start += stride
  return chunks

# Model Architecture
def make_model(input_shape, emb_matrix):
    """Assemble the sentiment classifier network.

    The graph is: int32 index input -> frozen pretrained embedding ->
    bidirectional LSTM (128 units, full sequence) -> bidirectional LSTM
    (128 units, final state) -> Dense(512, relu) -> Dense(10, softmax).

    Args:
        input_shape: shape tuple of one input example (without batch axis).
        emb_matrix: embedding matrix handed to pretrained_embedding_layer.

    Returns:
        An uncompiled Keras ``Model`` mapping index sequences to a 10-way
        softmax output.
    """
    token_ids = Input(shape=input_shape, dtype = 'int32')

    # Layers are created and applied in the same order as before so that
    # weights saved for this architecture still line up when loaded.
    hidden = pretrained_embedding_layer(emb_matrix)(token_ids)
    hidden = Bidirectional(LSTM(128, return_sequences=True))(hidden)
    hidden = Bidirectional(LSTM(128))(hidden)
    hidden = Dense(512, activation='relu')(hidden)
    class_probs = Dense(10, activation='softmax')(hidden)

    return Model(inputs=token_ids, outputs=class_probs)

# Rebuild the network for inputs of max_len token ids, then restore its
# weights from the saved HDF5 file on Drive.
weight_path = path+'Data/model6.h5'
model = make_model((max_len,), emb_matrix)
model.load_weights(weight_path)
# Compile with the loss/optimizer/metric settings below; the rest of this
# notebook only calls model.predict.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Get sentiment scores
def evaluate(sentences, model):
    """Compute a single sentiment score for a sentence given as word chunks.

    For every chunk the model's prediction is reduced to one number: the three
    highest-probability class indices are averaged, weighted by their
    renormalised probabilities, divided by 10 and rounded to two decimals.
    The chunk scores are then averaged, weighted by chunk length.

    Uses the module-level ``word_idx`` and ``max_len`` to encode each chunk.

    Args:
        sentences: list of chunks, each a list of word strings.
        model: object whose ``predict`` returns one row of class
            probabilities per input row.

    Returns:
        The length-weighted score rounded to two decimals, or ``-1`` when
        ``sentences`` is an empty list.
    """
    # Nothing to score: report the sentinel value.
    if sentences == []:
      return -1

    weighted_sum = 0
    word_count = 0
    for chunk in sentences:
      chunk_size = len(chunk)
      indices = get_example_X(chunk, word_idx, max_len)
      probs = model.predict(indices, batch_size=1, verbose=0)

      # argsort is ascending, so the last three entries are the top classes.
      best_classes = np.argsort(probs)[0][-3:]
      best_probs = probs[0][best_classes]
      # Renormalise so the three retained probabilities sum to one.
      mixing = best_probs/np.sum(best_probs)
      chunk_score = np.round(np.dot(best_classes, mixing)/10, decimals = 2)

      weighted_sum += chunk_score*chunk_size
      word_count += chunk_size

    return round(weighted_sum/word_count, 2)

In [None]:
# Function which takes the input and output file paths as arguments and outputs the sentiment score of each sentence
def output(input_file, output_file):
  """Score every sentence of a text file and write the results to a file.

  Each line of ``input_file`` is split into sentences with ``sent_tokenize``;
  each sentence is chunked by ``process`` and scored by ``evaluate`` using the
  module-level ``model``. For every sentence that yields a score, two lines
  are written to ``output_file``: the sentence text, then its score.

  Changes from the previous version:
    * both files are managed by ``with`` so they are closed even when scoring
      raises part-way through;
    * the sentence that was scored is written next to its score, instead of
      the whole input line (which repeated multi-sentence lines once per
      sentence and inserted a blank line from the line's own newline);
    * the input is streamed line by line rather than read fully into memory.

  Args:
    input_file: path of the text file to read.
    output_file: path of the file to (over)write with the results.
  """
  with open(input_file, 'r') as f_input, open(output_file, 'w') as f_output:
    for line in f_input:
      # Split the line into individual sentences
      for sentence in sent_tokenize(line):
        # Get score of each sentence
        score = evaluate(process(sentence), model)
        # evaluate returns -1 for a sentence with no word tokens; skip those
        if score != -1:
          f_output.write(sentence+'\n'+str(score)+'\n')

In [None]:
# Example: score every sentence in Data/testfile.txt and write the
# sentence/score pairs to Data/Output/output.txt (both under `path`).
output(path+'Data/testfile.txt', path+'Data/Output/output.txt')