In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense, Activation
from tensorflow.keras import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras import optimizers
import nltk

First, we need to load the pretrained word embeddings. We will use GloVe. 

In [2]:
#We will use this dictionary to store every vector corresponding to every word.
embeddings_dict = {}
n_words_model = 30

In [3]:
#In the text file, all the values are separated by a space. 
#The first value is the word, and the next 300 values are the vectors components
with open("glove.6B.300d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [4]:
#Let's test our word embedding
print(embeddings_dict["."])

[-1.2559e-01  1.3630e-02  1.0306e-01 -1.0123e-01  9.8128e-02  1.3627e-01
 -1.0721e-01  2.3697e-01  3.2870e-01 -1.6785e+00  2.2393e-01  1.2409e-01
 -8.6708e-02  3.3010e-01  3.4375e-01 -8.7582e-04 -2.9658e-01  2.4417e-01
 -1.1592e-01 -3.5742e-02 -1.0830e-02  2.0776e-01  2.9285e-01 -7.3491e-02
 -1.8598e-01 -2.0090e-01 -9.5366e-02  6.3732e-03 -1.3620e-01  9.2028e-02
 -3.9957e-02  1.9027e-01 -1.0456e-01  2.7670e-03 -7.1742e-01 -1.2915e-01
 -1.3451e-03  2.7002e-01 -5.3023e-02  2.2148e-01  1.3881e-01 -1.5051e-01
 -1.9150e-01  1.6402e-01  9.7484e-02  5.6841e-02  3.9789e-01  4.0725e-01
  1.4802e-01  2.1569e-01 -1.0671e-01 -1.0232e-01  2.4810e-02 -2.2100e-01
 -1.0720e-02  1.4234e-01 -2.8242e-01  1.9254e-01  8.6720e-02 -3.8970e-01
  1.1321e-01  1.3779e-03  6.4009e-03 -1.6206e-01 -8.2153e-02 -5.5397e-01
  3.6789e-01 -4.0159e-03  2.0710e-01 -3.7157e-01  2.5135e-01 -1.9544e-01
 -4.7059e-02  1.7155e-01 -2.4036e-01 -4.6086e-02  1.9429e-01 -1.8939e-01
 -7.1974e-03  6.9481e-02  5.9175e-02 -1.7585e-01  1

In [5]:
#Define a word embedding for unknown word with random values
embeddings_dict['UNK'] = np.random.randn(1,300)*0.001
#embeddings_dict['UNK'] = embeddings_dict['UNK']/(embeddings_dict['UNK']**2)

print(embeddings_dict['UNK'])

[[-5.90358085e-06 -1.19096007e-03 -1.49708314e-03  6.39434550e-05
   3.04065780e-04  6.07081989e-04  5.52570342e-05 -1.06642593e-04
  -9.29549562e-05  7.59098021e-04 -1.61068822e-03  6.14387690e-04
   5.36723573e-05 -1.30443975e-03 -3.71704130e-03  1.02124550e-03
   2.22533149e-04  7.55519701e-05 -1.81694651e-04 -7.04529735e-04
   7.12508029e-06  8.49930481e-04 -1.59072001e-04  1.26696970e-03
  -3.89776249e-04 -1.46072554e-03 -2.71664648e-03  7.91806018e-04
   3.32597377e-04  2.52611574e-04  7.92106451e-04 -1.67142961e-03
  -9.94131422e-04 -2.05603628e-04 -1.79913989e-03  4.66575134e-04
  -4.39789617e-05 -2.35717970e-04  5.35240166e-04 -1.86112347e-03
   1.59427703e-03  2.09764197e-03  1.77891201e-03  7.66572549e-04
   1.19911454e-03  3.44824744e-04  1.62335578e-03 -1.63695676e-03
   8.71035814e-04 -1.47054823e-03 -8.91756397e-04  1.99355673e-04
  -1.85216027e-04 -2.27524520e-04  2.23924952e-03 -9.07293329e-04
  -1.07500311e-03 -7.82842649e-04  1.07751712e-03 -3.47122656e-04
   6.19850

In [6]:
filepath_dict = {'yelp':   'sentiment_analysis/yelp_labelled.txt',
                 'amazon': 'sentiment_analysis/amazon_cells_labelled.txt',
                 'imdb':   'sentiment_analysis/imdb_labelled.txt'}

In [7]:
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [8]:
df_yelp = df_list[0]
df_amazon = df_list[1]
df_imdb = df_list[2]

In [9]:
df_amazon

Unnamed: 0,sentence,label,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon
...,...,...,...
995,The screen does get smudged easily because it ...,0,amazon
996,What a piece of junk.. I lose more calls on th...,0,amazon
997,Item Does Not Match Picture.,0,amazon
998,The only thing that disappoint me is the infra...,0,amazon


In [10]:
#Let's create a function to vectorize our data:

def text_to_matrix(text,n_words, n_features):
    # Input
    ## text : is a string with the paragraph or text that needs to be vectorized
    # Output
    ## vector: is a matrix of dimensions (150,300), 150 words by 300 features.
    
    #First we need to separate our text by spaces
    vector_list = []
    tokenizer = nltk.tokenize.word_tokenize
    text = tokenizer(text)
    
    for word in text:
        if(word!=''):
            try: 
                vector_list.append(embeddings_dict[word.lower()].reshape(1,n_features))
            except:
                vector_list.append(embeddings_dict['UNK'].reshape(1,n_features))
        if(len(vector_list)==n_words):
            break 
    zero_padding = np.zeros((1,n_features))
    for i in range(n_words-len(vector_list)):
        vector_list.append(zero_padding)
        
    vector = np.asarray(vector_list)
    return vector


    

In [11]:
#Test of the text_to_matrix function 
a = text_to_matrix("hi, my name is Josue. Why aren't you talking to me.",n_words_model,300)
a = np.reshape(a,(a.shape[0],300))

In [12]:
print(a.shape)
print(a)

(30, 300)
[[ 0.40838    -0.18427999 -0.17573    ... -0.52645999  0.81630999
   0.74274999]
 [-0.25538999 -0.25723001  0.13169    ... -0.23289999 -0.12226
   0.35499001]
 [-0.22746    -0.13658001 -0.38997    ... -0.18444    -0.38227999
   0.55346   ]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [13]:
#Create a function to vectorize all the dataset
def vectorize_dataset(df, n_words, n_features):
    y = df['label'].to_numpy()
    X = df['sentence'].to_numpy()
    X = np.reshape(X, (X.shape[0], 1))
    #f = lambda x: text_to_matrix(str(x),150,300)
    #func = np.vectorize(f)
    #x = func(X)
    return X, y

In [14]:
X, Y= vectorize_dataset(df_amazon, n_words_model,300)

In [15]:

list_examples = []
for text in X:
    list_examples.append(text_to_matrix(str(text),n_words_model,300))
X = np.asarray(list_examples)

In [16]:
X=X.reshape(1000,n_words_model,300)
X.shape

(1000, 30, 300)

In [17]:
Y= Y.reshape(1000,1)
Y.shape

(1000, 1)

In [38]:
#Now we create our model
def lstm_model(LSTM_hidden_units):
    
    input_layer = Input(shape=(n_words_model,300), dtype= "float32")
    
    X = LSTM(units = LSTM_hidden_units, return_sequences= True)(input_layer)
    X = Dropout(0.3)(X)
    X = LSTM(units = LSTM_hidden_units, return_sequences = False)(X)
    X = Dropout(0.3)(X)
    
  
    X = Dense(1)(X)
    out = Activation('sigmoid')(X)
   
    model = Model(inputs= input_layer, outputs=out)
    
    return model
    
    

In [39]:
model = lstm_model(64)

In [40]:
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 30, 300)]         0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 30, 64)            93440     
_________________________________________________________________
dropout_4 (Dropout)          (None, 30, 64)            0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0   

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

In [42]:
opt = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['binary_accuracy'])

In [43]:
model.fit(X_train, y_train, epochs = 120, batch_size = 64, shuffle=False)

Train on 900 samples
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120


<tensorflow.python.keras.callbacks.History at 0x2a0acb9cfd0>

In [1]:
model.evaluate(X_test,y_test, verbose=1)

NameError: name 'model' is not defined

In [48]:
x_test1=  text_to_matrix("I really enjoyed my time with her",n_words_model,300)
x_test1 = np.reshape(x_test1,(1,x_test1.shape[0],300))
if(model.predict(x_test1)>0.5):
    print("Positive review")
else:
    print("Negative review")

Positive review
