In [43]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense, Embedding, Flatten
from keras.layers import SpatialDropout1D
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import Sequential

In [27]:
# Load the labelled training phrases and the unlabelled test phrases.
# Both files are tab-separated.
train = pd.read_csv('dataset/train.tsv', sep="\t")
test = pd.read_csv('dataset/test.tsv', sep="\t")

In [28]:
# NOTE(review): a cell only renders its last expression, so the head()
# preview on the next line is evaluated and then discarded.
train.head()
# (rows, columns) of the training frame
train.shape

(156060, 4)

In [30]:
# Class distribution over the 5-point sentiment scale:
#   0 = negative
#   1 = somewhat negative
#   2 = neutral
#   3 = somewhat positive
#   4 = positive
train['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [33]:
# format data to process by CNN
def format_data(train, test, max_features, maxlen, random_state=None):
    """Turn raw phrase frames into padded integer sequences and one-hot labels.

    Parameters
    ----------
    train : DataFrame with a 'Phrase' text column and a 'Sentiment' label column.
    test : DataFrame with a 'Phrase' text column.
    max_features : int
        Passed to the Keras ``Tokenizer`` as ``num_words`` (vocabulary cap).
    maxlen : int
        Length every sequence is padded/truncated to by ``pad_sequences``.
    random_state : int or None, optional
        Seed for the training-set shuffle. ``None`` (the default) keeps the
        previous unseeded behaviour; pass an int for a reproducible order.

    Returns
    -------
    X : 2D array of padded training sequences (one row per training phrase).
    Y : one-hot encoded training labels produced by ``to_categorical``.
    test_X : 2D array of padded test sequences (one row per test phrase).

    Neither input frame is modified.
    """
    # shuffle training data; sample() returns a new frame, the caller's is untouched
    train = train.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # lower-case into local Series instead of assigning back into the frames,
    # so the caller's `test` DataFrame is not mutated as a side effect
    X = train['Phrase'].str.lower()
    test_X = test['Phrase'].str.lower()

    # sentiment values - labels (as categorical / one-hot)
    Y = to_categorical(train['Sentiment'].values)

    # keras tokenizer, max num_words = max_features
    tokenizer = Tokenizer(num_words=max_features)
    # fit on training phrases only, so the vocabulary never sees test text
    tokenizer.fit_on_texts(list(X))

    # training text -> integer sequences -> 2D array with rows of length 'maxlen'
    X = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=maxlen)
    # test text goes through the same fitted tokenizer and padding
    test_X = pad_sequences(tokenizer.texts_to_sequences(test_X), maxlen=maxlen)

    return X, Y, test_X

In [34]:
# preprocessing hyper-parameters
max_features = 10000  # vocabulary cap handed to the tokenizer
maxlen = 125          # every phrase is padded/truncated to this many tokens

X, Y, test_X = format_data(
    train,
    test,
    max_features=max_features,
    maxlen=maxlen,
)

In [35]:
X # training features: padded integer token sequences, one row per phrase

array([[   0,    0,    0, ...,  165,   69,   59],
       [   0,    0,    0, ...,    0,  186,  500],
       [   0,    0,    0, ..., 6908,    4, 1383],
       ...,
       [   0,    0,    0, ...,    0,    0, 5018],
       [   0,    0,    0, ...,    6, 8428,  390],
       [   0,    0,    0, ...,    0, 2612, 7696]])

In [36]:
Y # training labels: one-hot rows produced by to_categorical

array([[0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [37]:
test_X # test features: padded integer token sequences, one row per phrase

array([[   0,    0,    0, ...,  613, 1029,  392],
       [   0,    0,    0, ...,  613, 1029,  392],
       [   0,    0,    0, ...,    0,    0,   16],
       ...,
       [   0,    0,    0, ...,    2,  126, 5773],
       [   0,    0,    0, ...,    2,  126, 5773],
       [   0,    0,    0, ...,    0,  373, 2013]])

In [39]:
# split the labelled data into a training set (75%) and a validation set (25%)
# - random_state pins the split so Restart & Run All reproduces it
# - stratify on the class index (argmax of the one-hot rows) so the heavily
#   imbalanced sentiment classes keep the same proportions in both parts
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
    stratify=Y.argmax(axis=1),
)

In [45]:
# building CNN model

model = Sequential()

# add embedding layer for the input sequence:
# each of the `max_features` token ids is mapped to a 150-dimensional vector
# see: https://towardsdatascience.com/deep-learning-4-embedding-layers-f9a02d55ac12
model.add(Embedding(max_features, 150, input_length=maxlen))

# use SpatialDropout to avoid overfitting
# see: https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/76883
model.add(SpatialDropout1D(0.2))

# CNN
# layer 1
model.add(Conv1D(32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
# layer 2
model.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
# flatten pooling layer
# see: https://missinglink.ai/guides/deep-learning-frameworks/using-keras-flatten-operation-cnn-models-code-examples/
model.add(Flatten())

# output layer: one unit per sentiment class.
# The labels are one-hot (exactly one class per phrase) and the loss is
# categorical cross-entropy, so the outputs must form a probability
# distribution -> softmax. Independent sigmoids do not sum to 1.
model.add(Dense(5, activation='softmax'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [46]:
# training schedule: number of passes over the data, samples per gradient step
epochs, batch_size = 5, 32

In [47]:
# classification with one-hot labels - use categorical cross entropy as loss function
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit the model and train; keep the returned History object so the per-epoch
# loss/accuracy can be inspected afterwards (history.history) instead of
# leaving only its repr as the cell output
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

# around 0.70 accuracy after 5 epochs

Instructions for updating:
Use tf.cast instead.
Train on 117045 samples, validate on 39015 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x14fa3b70>