In [195]:
import numpy as np
import collections
from matplotlib import pyplot as plt
import tensorflow as tf
import pandas as pd
import os
import nltk
from keras.preprocessing import sequence
%matplotlib inline

## Read data

In [196]:
DATA_DIR = "./data"

train_data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
train_data.head()

Unnamed: 0,sentence,label
0,I hate Harry Potter.,0
1,The first action theme to be played as the fir...,1
2,"Always knows what I want, not guy crazy, hates...",0
3,"Is it just me, or does Harry Potter suck?...",0
4,friday hung out with kelsie and we went and sa...,0


In [197]:
num_recs = len(train_data) # number of total sentences
print(num_recs)

5668


In [198]:
maxlen = 0 # maximum sentence length
word_freqs = collections.Counter() # word frequency

for index, row in train_data.iterrows():
    words = nltk.word_tokenize(row['sentence'].lower())
    if (len(words) > maxlen):
        maxlen = len(words)
    for word in words:
        word_freqs[word] += 1
print(len(word_freqs))
print(maxlen)

# word_freqs: (word, freqs)

2094
42


In [199]:
maxlen = 50

## Read word2vec matrix 

In [200]:
import numpy as np
import os

DATA_DIR = "./data"

wordsList = np.load(os.path.join(DATA_DIR, 'wordsList.npy'))
print('Loaded the word list!')
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load(os.path.join(DATA_DIR, 'wordVectors.npy'))
print ('Loaded the word vectors!')

Loaded the word list!
Loaded the word vectors!


In [201]:
print(wordsList[:5])
print(len(wordsList))

['0', ',', '.', 'of', 'to']
400000


In [202]:
print(wordVectors[:3])
print(wordVectors.shape)

[[ 0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00]
 [ 1.3441e-02  2.3682e-01 -1.6899e-01  4.0951e-01  6.3812e-01  4.7709e-01
  -4.2852e-01 -5.5641e-01 -3.6400e-01 -2.3938e-01  1.3001e-01 -6.3734e-02
  -3.9575e-01 -4.8162e-01  2.3291e-01  9.0201e-02 -1.3324e-01  7.8639e-02
  -4.1634e-01 -1.5428e-01  1.0068e-01  4.8891e-01  3.1226e-01 -1.2520e-01
  -3.7512e-02 -1.5179e+00  1.2612e-01 -2.4420e-02 -4.2961e-02 -2.8351e-01
   3.5416e+

In [203]:
baseballIndex = wordsList.index('baseball')
wordVectors[baseballIndex]

array([-1.9327  ,  1.0421  , -0.78515 ,  0.91033 ,  0.22711 , -0.62158 ,
       -1.6493  ,  0.07686 , -0.5868  ,  0.058831,  0.35628 ,  0.68916 ,
       -0.50598 ,  0.70473 ,  1.2664  , -0.40031 , -0.020687,  0.80863 ,
       -0.90566 , -0.074054, -0.87675 , -0.6291  , -0.12685 ,  0.11524 ,
       -0.55685 , -1.6826  , -0.26291 ,  0.22632 ,  0.713   , -1.0828  ,
        2.1231  ,  0.49869 ,  0.066711, -0.48226 , -0.17897 ,  0.47699 ,
        0.16384 ,  0.16537 , -0.11506 , -0.15962 , -0.94926 , -0.42833 ,
       -0.59457 ,  1.3566  , -0.27506 ,  0.19918 , -0.36008 ,  0.55667 ,
       -0.70315 ,  0.17157 ], dtype=float32)

In [204]:
import tensorflow as tf
maxSeqLength = 10 #Maximum length of sentence
numDimensions = 300 #Dimensions for each word vector
firstSentence = np.zeros((maxSeqLength), dtype='int32')
firstSentence[0] = wordsList.index("i")
firstSentence[1] = wordsList.index("thought")
firstSentence[2] = wordsList.index("the")
firstSentence[3] = wordsList.index("movie")
firstSentence[4] = wordsList.index("was")
firstSentence[5] = wordsList.index("incredible")
firstSentence[6] = wordsList.index("and")
firstSentence[7] = wordsList.index("inspiring")
#firstSentence[8] and firstSentence[9] are going to be 0
print(firstSentence.shape)
print(firstSentence) #Shows the row index for each word

(10,)
[    41    804 201534   1005     15   7446      5  13767      0      0]


In [205]:
with tf.Session() as sess:
    print(tf.nn.embedding_lookup(wordVectors,firstSentence).eval().shape)

(10, 50)


In [206]:
train_ids = np.zeros((num_recs, maxlen), dtype='int32')
train_labels = np.zeros((num_recs, ))
for linecounter, row in train_data.iterrows():
    words = nltk.word_tokenize(row['sentence'].lower())
    indexcounter = 0
    for word in words:    
        try:
            train_ids[linecounter][indexcounter] = wordsList.index(word)
        except ValueError:
            train_ids[linecounter][indexcounter] = 39999 # vector of UNK
        indexcounter += 1
        if (indexcounter) >= maxlen:
            break
    train_labels[linecounter] = int(row['label'])

In [207]:
print(train_ids[:5])

[[    41   5281   3215   7654      2      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0]
 [201534     58    608   2984      4     30    334     19 201534   9239
     388    138     15 201534   4548    985   3173   3071   2984     42
      15      7    219  24521      4 201534  22745      1  36803      5
  194969   8208     60   3505      2      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0]
 [   690   2198    102     41    303      1     36   1856   5578      1
   19555   3215  39999      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0     

In [208]:
print(train_labels[:5])

[0. 1. 0. 0. 0.]
