# Import Libraries

In [130]:
import os
import re
import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from nltk.corpus import stopwords

### Some Constants ###
# Locations of the train / test splits (.tsv files). The paths are relative,
# so they resolve against the directory the kernel is running in. Both are
# loaded with pandas in the next cell.
train_data_path = "../data/sentiment_train.tsv"
test_data_path = "../data/sentiment_test.tsv"

# Loading and preprocessing data

In [131]:
# Both splits are tab-separated files whose first row carries the column names.
train_df, test_df = (
    pd.read_csv(tsv_path, sep="\t", header=0)
    for tsv_path in (train_data_path, test_data_path)
)

# Raw inputs (phrase text) and targets (sentiment id) come from the training split only.
features_text, labels = (
    train_df[column].values for column in ('Phrase', 'Sentiment')
)

train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [132]:
print('[INFO] Cleaning text data ...')

# Lazily filled cache of the NLTK English stop-word list, so the corpus is read
# once per kernel instead of once per cleaned phrase.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_cleaning(text, forbidden_words=None):
    """Normalise one raw phrase into a list of lower-case alphabetic tokens.

    Processing steps:
      1. drop every whitespace-delimited chunk that starts at ``http``
         (done first, while the URL is still in one piece);
      2. lower-case the text and turn every character outside ``a-z`` into a
         space (this covers dots, slashes, backslashes, digits, punctuation);
      3. split on whitespace and discard tokens found in ``forbidden_words``.

    Parameters
    ----------
    text : str
        Raw phrase. Anything that is not a non-empty string (``None``, ``""``,
        a float NaN coming from pandas, ...) yields an empty list.
    forbidden_words : container of str, optional
        Tokens to remove. Defaults to the NLTK English stop-word list.

    Returns
    -------
    list of str
        The surviving tokens, in their original order; ``[]`` when nothing
        is left or the input is unusable.
    """
    # `if text:` alone lets NaN through (it is truthy) and then fails on str methods.
    if not isinstance(text, str) or not text:
        return []
    if forbidden_words is None:
        forbidden_words = _english_stopwords()

    # Strip URLs before any punctuation is touched; once '.' and '/' have been
    # replaced by spaces only the "http:" prefix would still match.
    text = re.sub(r'((http)\S+)', '', text)
    # A single pass is enough: after this only letters and spaces remain, and
    # str.split() below takes care of repeated / leading / trailing whitespace.
    text = re.sub(r'[^A-Za-z]', ' ', text.lower())
    return [word for word in text.split() if word not in forbidden_words]

MIN_WORDS = 2  # phrases with fewer tokens than this after cleaning are discarded

# Take the targets straight from the dataframe on every run. This cell rebinds
# `labels` to the filtered array at the end, so indexing into `labels` here
# would misalign (or raise IndexError) as soon as the cell is executed twice.
raw_labels = train_df['Sentiment'].values

cleaned_texts = []  # cleaned phrases, tokens re-joined with single spaces
labels_new = []     # sentiment of each phrase that was kept
num_empty = 0       # phrases dropped for having fewer than MIN_WORDS tokens

for phrase, sentiment in tqdm.tqdm(zip(features_text, raw_labels), total=len(features_text)):
    tokens = text_cleaning(phrase)
    if len(tokens) < MIN_WORDS:
        num_empty += 1
        continue
    cleaned_texts.append(' '.join(tokens))
    labels_new.append(sentiment)

features_text_cleaned = np.array(cleaned_texts)
labels = np.array(labels_new)

# Every input phrase is either kept (with its label) or counted as dropped.
assert features_text_cleaned.shape[0] == labels.shape[0]
assert features_text_cleaned.shape[0] + num_empty == len(features_text)

print(f'[INFO] Number of sentences : {features_text_cleaned.shape[0]}')
print(f'[INFO] Number of empty sequences : {num_empty}')

  2%|▏         | 2782/156060 [00:00<00:11, 13922.17it/s]

[INFO] Cleaning text data ...


100%|██████████| 156060/156060 [00:11<00:00, 13773.46it/s]

[INFO] Number of sentences : 121956
[INFO] Number of empty sequences : 34104





In [133]:
# Quick sanity check: the first 20 phrases after cleaning.
print(features_text_cleaned[:20])

['series escapades demonstrating adage good goose also good gander occasionally amuses none amounts much story'
 'series escapades demonstrating adage good goose'
 'escapades demonstrating adage good goose'
 'escapades demonstrating adage good goose'
 'demonstrating adage good goose' 'demonstrating adage' 'good goose'
 'good goose' 'good goose' 'good goose'
 'also good gander occasionally amuses none amounts much story'
 'also good gander occasionally amuses none amounts much story'
 'good gander occasionally amuses none amounts much story'
 'gander occasionally amuses none amounts much story'
 'gander occasionally amuses none amounts much story'
 'occasionally amuses none amounts much story'
 'occasionally amuses none amounts much story'
 'amuses none amounts much story' 'none amounts much story'
 'none amounts much story']


In [134]:
max_seq_len = 200   # every sequence is padded / truncated to this many token ids
RANDOM_SEED = 42    # fixes the train/validation shuffle so a re-run gives the same split

# NOTE(review): the out-of-vocabulary token is spelled "<PAD>" here, but the
# Keras Tokenizer reserves index 0 (never assigned to a word) for padding and
# gives the OOV token its own index. The literal is left unchanged in case
# something else looks it up in word_index; consider renaming to "<OOV>".
tokenizer = Tokenizer(char_level=False, oov_token="<PAD>")
tokenizer.fit_on_texts(features_text_cleaned)

token_sequences = tokenizer.texts_to_sequences(features_text_cleaned)
# pad_sequences already returns a 2-D integer ndarray, so no extra np.array() is needed.
features = pad_sequences(token_sequences, maxlen=max_seq_len)

# Size the embedding table from the vocabulary itself (+1 for the reserved
# index 0) rather than from the largest id that happens to survive padding.
vocab_size = len(tokenizer.word_index) + 1

x_train, x_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.3333, random_state=RANDOM_SEED
)
print(f'[INFO] Training size : {x_train.shape[0]}')
print(f'[INFO] Validate size : {x_val.shape[0]}')
print(x_train.shape, y_train.shape)

[INFO] Training size : 81308
[INFO] Validate size : 40648
(81308, 200) (81308,)


In [135]:
# Sequence classifier: token ids -> embeddings -> two stacked BiLSTMs -> class scores.
model = Sequential()
inputs = Input(shape=(max_seq_len,), dtype="int32")
model.add(inputs)
# Embed each integer in a 128-dimensional vector
model.add(Embedding(vocab_size, 128))
# Add 2 bidirectional LSTMs; the first returns the full sequence so the second
# has one vector per time step to consume.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
# Add a classifier. Each phrase has exactly one of the 5 classes and the model
# is trained with sparse categorical cross-entropy, so the outputs must form a
# probability distribution: softmax, not independent per-unit sigmoids.
model.add(Dense(5, activation="softmax"))
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 200, 128)          1914624   
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 200, 128)          98816     
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 645       
Total params: 2,112,901
Trainable params: 2,112,901
Non-trainable params: 0
_________________________________________________________________


In [136]:
# The sparse loss consumes the integer sentiment labels directly (no one-hot encoding).
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train, evaluating on the held-out split after every epoch; the returned
# History object is the cell's displayed result.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=64,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0b83f22370>