# Modeling with Keras

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import f1_score, confusion_matrix
from bs4 import BeautifulSoup

Using TensorFlow backend.


In [21]:
train_df = pd.read_csv('../data/cleantrain.csv')

In [6]:
tokenizer = Tokenizer(num_words=6000)

In [7]:
tokenizer.fit_on_texts(train_df.cleanreview)

In [8]:
list_tokenized_train = tokenizer.texts_to_sequences(train_df.cleanreview)

In [9]:
X_train = pad_sequences(list_tokenized_train, maxlen=240)

In [10]:
y = train_df.sentiment

In [11]:
model = Sequential()

W0516 12:48:05.160784 4790240704 deprecation_wrapper.py:119] From /Users/Hovanes/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [12]:
model.add(Embedding(input_dim=6000, output_dim=128))

W0516 12:49:41.656661 4790240704 deprecation_wrapper.py:119] From /Users/Hovanes/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0516 12:49:41.661226 4790240704 deprecation_wrapper.py:119] From /Users/Hovanes/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



In [13]:
model.add(Bidirectional(LSTM(32, return_sequences=True)))

In [14]:
model.add(GlobalMaxPool1D())

In [15]:
model.add(Dense(20, activation='relu'))

In [16]:
model.add(Dropout(.05))

W0516 12:51:20.713818 4790240704 deprecation_wrapper.py:119] From /Users/Hovanes/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0516 12:51:20.720930 4790240704 deprecation.py:506] From /Users/Hovanes/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [17]:
model.add(Dense(1, activation='sigmoid'))

In [18]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

W0516 12:52:00.688607 4790240704 deprecation_wrapper.py:119] From /Users/Hovanes/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0516 12:52:00.708776 4790240704 deprecation_wrapper.py:119] From /Users/Hovanes/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.

W0516 12:52:00.712639 4790240704 deprecation_wrapper.py:119] From /Users/Hovanes/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3378: The name tf.nn.sigmoid_cross_entropy_with_logits is deprecated. Please use tf.nn.sigmoid_cross_entropy_with_logits instead.



In [19]:
model.fit(X_train, y, batch_size=64, epochs=5, validation_split=.25)

Train on 18750 samples, validate on 6250 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fad51c60978>

In [62]:
test_df = pd.read_csv('../data/testData.tsv', header=0, delimiter='\t', quoting=3)

Bringing in my previously created custom function

In [61]:
def review_to_wordlist(review, remove_stopwords=False):
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    
    # 5. Return the list of words as a paragraph by joining them together
    return(" ".join(words))

In [64]:
lemmatizer = WordNetLemmatizer()

In [68]:
test_df.review = test_df.review.apply(lambda x: review_to_wordlist(x))

In [87]:
test_df['sentiment'] = test_df['id'].map(lambda x: 1 if int(x.strip('"').split('_')[1]) >= 7 else 0)

In [88]:
test_df.head()

Unnamed: 0,id,review,sentiment
0,"""12311_10""",naturally in a film who s main themes are of m...,1
1,"""8348_2""",this movie is a disaster within a disaster fil...,0
2,"""5828_4""",all in all this is a movie for kids we saw it ...,0
3,"""7186_2""",afraid of the dark left me with the impression...,0
4,"""12128_7""",a very accurate depiction of small time mob li...,1


In [89]:
y_test = test_df.sentiment

In [90]:
list_tokenized_test = tokenizer.texts_to_sequences(test_df.review)

In [91]:
X_test = pad_sequences(list_tokenized_test, maxlen=240)

In [92]:
prediction = model.predict(X_test)

In [106]:
y_pred = (prediction > .5)

In [107]:
print('F1-score: {0}'.format(f1_score(y_pred, y_test)))

F1-score: 0.851212221853205


In [108]:
print('Confusion matrix:')
confusion_matrix(y_pred, y_test)

Confusion matrix:


array([[11164,  2248],
       [ 1336, 10252]])

In [112]:
submission = pd.DataFrame(data={'id':test_df['id']})

In [116]:
submission['sentiment'] = y_pred

In [121]:
submission.sentiment.replace({True:1, False:0}, inplace=True)

In [123]:
submission.to_csv('./data/submission.csv', index=False, quoting=3)