# Sentiment analysis on IMDB movie reviews using GloVe embeddings and deep LSTM network

This is a draft that only demonstrates how to convert a single IMDB movie review into a vectorized representation using 50-dimensional GloVe word embeddings.

# Imports

In [14]:
import numpy as np

# Load the 50-dimensional word embedding matrix (GloVe)

In [18]:
def read_glove_vecs(glove_file):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-empty line is split on whitespace; the first field is taken as
    the word and the remaining fields are parsed as its vector components.
    If a word occurs on several lines, the last occurrence wins.

    Args:
        glove_file: Path of the embedding text file. It is opened as UTF-8.

    Returns:
        A tuple ``(words_to_index, index_to_words, word_to_vec_map)`` where
        ``words_to_index`` maps each word to a 1-based index assigned in
        sorted word order, ``index_to_words`` is the inverse mapping, and
        ``word_to_vec_map`` maps each word to a 1-D ``float64`` NumPy array.

    Raises:
        ValueError: If a vector component cannot be converted to a float.
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding="utf8") as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank / whitespace-only lines (e.g. a trailing newline at
                # end of file) carry no entry; indexing fields[0] on them
                # would raise IndexError.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # The file is already closed here; only the in-memory vocabulary is needed.
    # Indices start at 1 (presumably to keep 0 free, e.g. for padding -- TODO confirm).
    words_to_index = {
        w: i for i, w in enumerate(sorted(word_to_vec_map), start=1)
    }
    index_to_words = {i: w for w, i in words_to_index.items()}
    return words_to_index, index_to_words, word_to_vec_map

In [19]:
# Location of the GloVe embedding text file.
# NOTE(review): this is an absolute, machine-specific path -- it must be edited
# to point at a local copy of the file before the notebook can run elsewhere.
glove_path = "C:/Users/ianti_000/Desktop/imdb_sentiment_analysis/glove.6B/glove.6B.50d.txt"

In [20]:
# Parse the embedding file into the three lookups returned by read_glove_vecs:
# word -> index, index -> word, and word -> vector.
words_to_index, index_to_words, word_to_vec_map = read_glove_vecs(glove_path)

# Word embedding example

In [52]:
# Look up the embedding of a single word and print it.
word = "scientist"
try:
    print(word,'\n', word_to_vec_map[word])
except KeyError:
    # Only a missing vocabulary entry means "not found". Any other error
    # (e.g. word_to_vec_map not yet defined in a fresh kernel) should surface
    # instead of being reported as a missing word.
    print(word, " not found")

scientist 
 [-0.1574     0.60965    0.33007    0.099683   0.48562    0.047824  -0.73879
 -0.53599    0.32418    0.12761    0.59657    0.40019    0.04229    0.67915
  0.17035   -0.0092601  0.5116     0.87818   -0.69902    0.89466   -0.12511
  0.82933    0.12066   -0.27192    1.0609    -2.2313    -0.52398   -0.88116
 -1.0067     0.65938    0.92674   -1.8196    -0.71182   -1.4838    -0.2425
 -0.070873  -0.66168    0.93268    1.1923     0.10765    0.39583    0.60167
  0.21796    0.021389   1.2813    -0.0055883  0.25846    0.45233   -0.11514
  0.54043  ]


# Convert training example into vectorized version using word embeddings

In [84]:
# Read one of the examples
# NOTE(review): absolute, machine-specific path -- adjust to the local dataset location.
negative_example = "C:/Users/ianti_000/Desktop/imdb_sentiment_analysis/aclImdb_v1/aclImdb/train/neg/0_3.txt"
# The context manager guarantees the handle is closed. The encoding is given
# explicitly (as for the GloVe file) so decoding does not depend on the
# platform's default code page.
with open(negative_example, 'r', encoding="utf8") as file:
    review_text = file.read()

# Clean the example, make lowercase, and split into array of words.
# Apostrophes are removed; commas and periods are split off as their own
# tokens. Any other punctuation stays attached to the neighbouring word.
words = review_text.replace("'","").replace(","," ,").replace("."," .").lower().split()

print("There are ", len(words), " words in this example")
words

There are  121  words in this example


['story',
 'of',
 'a',
 'man',
 'who',
 'has',
 'unnatural',
 'feelings',
 'for',
 'a',
 'pig',
 '.',
 'starts',
 'out',
 'with',
 'a',
 'opening',
 'scene',
 'that',
 'is',
 'a',
 'terrific',
 'example',
 'of',
 'absurd',
 'comedy',
 '.',
 'a',
 'formal',
 'orchestra',
 'audience',
 'is',
 'turned',
 'into',
 'an',
 'insane',
 ',',
 'violent',
 'mob',
 'by',
 'the',
 'crazy',
 'chantings',
 'of',
 'its',
 'singers',
 '.',
 'unfortunately',
 'it',
 'stays',
 'absurd',
 'the',
 'whole',
 'time',
 'with',
 'no',
 'general',
 'narrative',
 'eventually',
 'making',
 'it',
 'just',
 'too',
 'off',
 'putting',
 '.',
 'even',
 'those',
 'from',
 'the',
 'era',
 'should',
 'be',
 'turned',
 'off',
 '.',
 'the',
 'cryptic',
 'dialogue',
 'would',
 'make',
 'shakespeare',
 'seem',
 'easy',
 'to',
 'a',
 'third',
 'grader',
 '.',
 'on',
 'a',
 'technical',
 'level',
 'its',
 'better',
 'than',
 'you',
 'might',
 'think',
 'with',
 'some',
 'good',
 'cinematography',
 'by',
 'future',
 'great',
 '

In [75]:
# Replace every token of the review by its embedding vector. Tokens that are
# missing from the vocabulary are reported and skipped, so the result can have
# fewer rows than there are tokens in `words`.
glove_representation = []

for word in words:
    try:
        glove_representation.append(word_to_vec_map[word])
    except KeyError:
        # Only an out-of-vocabulary token is expected here; other errors
        # should propagate rather than be reported as "not found".
        print(word, " not found")
# Stack the per-word vectors into a single array (one row per found word).
glove_representation = np.array(glove_representation)

chantings  not found


In [76]:
# Rows: tokens that were found in word_to_vec_map; columns: length of each
# embedding vector (presumably 50 for the GloVe file loaded above).
glove_representation.shape

(120, 50)

In [80]:
# Display the stacked word vectors (NumPy abbreviates large arrays in its repr).
glove_representation

array([[ 0.48251 ,  0.87746 , -0.23455 , ..., -0.4112  ,  0.23625 ,
         0.26451 ],
       [ 0.70853 ,  0.57088 , -0.4716  , ..., -0.22562 , -0.093918,
        -0.80375 ],
       [ 0.21705 ,  0.46515 , -0.46757 , ..., -0.043782,  0.41013 ,  0.1796  ],
       ..., 
       [ 0.55561 ,  0.1704  ,  0.13692 , ..., -0.32978 ,  0.24825 ,
        -0.38275 ],
       [-0.24154 , -0.30059 ,  0.1622  , ..., -1.0468  , -0.52729 ,
        -0.60561 ],
       [ 0.15164 ,  0.30177 , -0.16763 , ..., -0.35652 ,  0.016413,
         0.10216 ]])