# LSTM

## Global Imports
Keras - deep learning library

In [16]:
import pandas as pd
import re
import csv
import datetime
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dropout

## Importing dataset
Imports a certain number of lines from the dataset and adds the sentiment column, according to the number of stars given in the review.

In [7]:
filename = '/home/ec2-user/SageMaker/y2.csv'
# Importing the first 30000 rows from the CSV.
# nrows stops parsing after 30000 rows instead of loading the whole file and
# slicing afterwards; malformed lines are still skipped rather than raising.
data = pd.read_csv(filename, error_bad_lines=False, nrows=30000)
# Removing columns that are not used for sentiment classification
data = data.drop(['user_id','business_id','date','funny','cool','useful'],axis=1)
# Adding the sentiment column: more than 3 stars -> 'pos', exactly 3 -> 'neutral', anything else -> 'neg'
data['sentiment'] = ['pos' if (x > 3) else 'neutral' if (x == 3) else 'neg' for x in data['stars']]
# Show the first rows as a quick sanity check of the resulting frame
data.head()

Unnamed: 0,review_id,stars,text,sentiment
0,vkVSCC7xljjrAI4UGfnKEQ,5,Super simple place but amazing nonetheless. It...,pos
1,n6QzIUObkYshz4dz2QRJTw,5,Small unassuming place that changes their menu...,pos
2,MV3CcKScW05u5LVfF6ok0g,5,Lester's is located in a beautiful neighborhoo...,pos
3,IXvOzsEMYtiJI0CARmj77Q,4,Love coming here. Yes the place always needs t...,pos
4,L_9BTb55X0GDtThi6GlZ6w,4,Had their chocolate almond croissant and it wa...,pos


## Pre-processing
Remove special characters and tokenize the text, breaking each sentence into tokens without punctuation or extra spacing.
The tokenizer is a class from Keras that prepares text for deep learning.

**Arguments:**
- **num_words:** the maximum number of words to keep, based on word frequency. Only the most common num_words words will be kept.
- **lower:** boolean. Whether to convert the texts to lowercase.
- **split:** str. Separator for word splitting.

**Outputs:**
- **word_counts:** A dictionary of words and their counts.
- **word_docs:** A dictionary of words and how many documents each appeared in.
- **word_index:** A dictionary of words and their uniquely assigned integers.
- **document_count:** An integer count of the total number of documents that were used to fit the Tokenizer.

In [None]:
# Remove special characters: keep only ASCII letters, digits and whitespace.
# The class uses A-Z; the previous A-z range also matched [ \ ] ^ _ ` so those
# characters were not stripped here. A raw string keeps \s from being treated
# as a string escape.
data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

# Tokenizes the text (2500 words, turn lowercase and split on spacing)
tokenizer = Tokenizer(num_words=2500, lower=True,split=' ')

# Build the vocabulary from the cleaned reviews, then encode each review as a
# sequence of integer word indices
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X) # Ensures that all sequences have the same length

# Summarize what was learned. The full word_counts dictionary has one entry per
# distinct word, so print a compact summary instead of dumping it.
print("Documents:", tokenizer.document_count)
print("Distinct words:", len(tokenizer.word_counts))
print("Most frequent words:", sorted(tokenizer.word_counts.items(), key=lambda kv: kv[1], reverse=True)[:10])

## LSTM Model
Model, with:
- 1 Embedding Layer
- 1 LSTM Layer
- 1 Dense Layer with softmax activation (normalizes the vector into a probability distribution)

In [12]:
embed_dim = 128 # Dimensionality of each word-embedding vector
lstm_out = 300 # Size of the LSTM layer output
batch_size= 32 # Iterate the training data in batches of size 32

## Building the LSTM network

model = Sequential() # Linear stack of layers, added one after another
# Embedding layer: maps each integer word index to a dense vector of size embed_dim.
# The vocabulary size is read from the tokenizer so it cannot drift from num_words.
model.add(Embedding(tokenizer.num_words, embed_dim,input_length = X.shape[1]))
model.add(LSTM(lstm_out)) # LSTM layer
model.add(Dropout(0.6)) # Dropout with rate 0.6 on the LSTM output during training, which helps prevent overfitting
model.add(Dense(3, activation='softmax')) # Dense layer: one output per sentiment class, normalized into a probability distribution
model.compile(optimizer='adam',
              loss = 'categorical_crossentropy', # labels are fed as one-hot vectors
              metrics = ['accuracy']) # judge the performance of the model
model.summary()

  train_symbol = func(*args, **kwargs)
  test_symbol = func(*args, **kwargs)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 911, 128)          320000    
_________________________________________________________________
lstm_4 (LSTM)                (None, 300)               514800    
_________________________________________________________________
dropout_4 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 903       
Total params: 835,703
Trainable params: 835,703
Non-trainable params: 0
_________________________________________________________________


## Training
80% of the samples are used for training (10% of those are held out for validation during fitting) and 20% are used for testing.
The model is trained for only 1 epoch.

In [None]:
# One-hot encode the sentiment labels (one column per sentiment class)
Y = pd.get_dummies(data['sentiment']).values
# 80/20 train/test split. A fixed random_state makes the split reproducible,
# so the reported test score refers to the same held-out samples on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)

# Trains the model for a given number of epochs (iterations on a dataset);
# 10% of the training samples are held out for validation.
model.fit(X_train, Y_train, batch_size = batch_size, epochs = 1, verbose = 1, validation_split=0.1)

# Measuring loss and accuracy on the held-out test set
score,acc = model.evaluate(X_test, Y_test, verbose = 0, batch_size = batch_size)
print("Loss score: %.2f" % (score))
print("Test Accuracy: %.2f" % (acc*100))

Train on 21600 samples, validate on 2400 samples
Epoch 1/1


  force_init=force_init)




## Prediction
Predicting on all the test data, i.e. the 20% of the input that was held out from training.

In [17]:
# Time how long it takes to predict class probabilities for the whole test set
start = datetime.datetime.now()
# Use the batch_size variable already used for fit/evaluate instead of a separate literal
classes = model.predict(X_test, batch_size=batch_size)
end = datetime.datetime.now()
print(end - start) # elapsed wall-clock time
print(classes) # one row of class probabilities per test sample

0:00:47.635268
[[ 0.33294457  0.33262652  0.33442888]
 [ 0.33391529  0.33356822  0.33251652]
 [ 0.33155784  0.33269012  0.33575207]
 ..., 
 [ 0.33614403  0.33158755  0.33226845]
 [ 0.33493704  0.33306998  0.33199292]
 [ 0.33371884  0.33333763  0.33294347]]
