In [1]:
%matplotlib inline

import re
import pandas as pd
import os


In [2]:
# Load the dataset from the project-relative CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/data_uniqcontent.csv')

In [3]:
# Discard (in place) every row that has no text in 'sentences_1000_str',
# since that column is what gets tokenized further down.
df.dropna(subset=['sentences_1000_str'], inplace=True)

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

Using TensorFlow backend.


In [5]:
# MAX_SEQUENCE_LENGTH: length every token sequence is padded/truncated to
#                      (also the Embedding layer's input_length).
# EMBEDDING_DIM:       width of the embedding vectors.
MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = 1000, 200

# Fit the tokenizer on the text column, then encode that same column as
# lists of integer token ids.
corpus = df['sentences_1000_str']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)


In [6]:
# Keep a handle on the fitted tokenizer's vocabulary; its length is used
# later to size the Embedding layer (len(word_index) + 1).
word_index = tokenizer.word_index

In [7]:
# Features: bring every token sequence to exactly MAX_SEQUENCE_LENGTH entries.
all_data = pad_sequences(sequences=sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Targets: one-hot encode the integer class ids from the 'class_no' column.
class_ids = np.asarray(df['class_no'])
labels = to_categorical(class_ids)

print('Shape of data tensor:', all_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (81649, 1000)
Shape of label tensor: (81649, 14)


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for validation, stratified on the labels so
# both partitions keep the same class proportions.
# random_state is pinned so the partition is identical on every
# Restart-and-Run-All; without it each run trains and validates on a
# different random split and results cannot be compared between runs.
x_train, x_val, y_train, y_val = train_test_split(
    all_data,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=42,
)

In [9]:
# Sanity check: show the shapes of the training features and training labels.
tuple(split.shape for split in (x_train, y_train))

((57154, 1000), (57154, 14))

In [10]:
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import LSTM, Embedding
from keras.models import Sequential

# One embedding row per vocabulary entry plus one extra slot
# (presumably index 0, which the padded sequences use -- confirm against
# the tokenizer's indexing).
vocab_size = len(word_index) + 1
# One output unit per column of the one-hot label matrix.
num_classes = labels.shape[1]

# Embedding -> single LSTM -> dropout -> softmax classifier.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=EMBEDDING_DIM,
        input_length=MAX_SEQUENCE_LENGTH,
    ),
    LSTM(units=200, dropout=0.2, recurrent_dropout=0.2),
    Dropout(rate=0.2),
    Dense(units=num_classes, activation='softmax'),
])
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)
print(model.metrics_names)
model.summary()

['loss', 'acc']
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 200)         139376400 
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 14)                2814      
Total params: 139,700,014
Trainable params: 139,700,014
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Train for 5 epochs in mini-batches of 128, evaluating on the held-out
# validation split after each epoch. Left as the cell's last expression so
# the returned History object is displayed.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=5,
    validation_data=(x_val, y_val),
)

  num_elements)


Train on 57154 samples, validate on 24495 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7faf426ea240>