In [2]:
%matplotlib inline

import pandas as pd
import os
import seaborn as sns


In [3]:
df = pd.read_csv('200_500_str.csv')

In [23]:
try1.shape

(4200, 2)

In [4]:
df= df.drop(['sentences_500_str','sentences_100_str','sentences_500'], axis=1)

In [29]:
df_sample= df.sample(n=5000)

In [5]:
try1=df.loc[df['label']==0].sample(n=300)

In [19]:
try1=try1.append(df.loc[df['label']==1].sample(n=300))

In [21]:
from sklearn.utils import shuffle
try1=shuffle(try1).reset_index(drop=True)

In [22]:
try1.drop(try1[pd.isnull(try1['sentences_200_str'])].index, inplace=True)

In [5]:
#df.drop(df[df['content'].str.len()<30].index, inplace=True)

In [24]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

Using TensorFlow backend.


In [25]:
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 200


tokenizer = Tokenizer()
tokenizer.fit_on_texts(try1['sentences_200_str'])
sequences = tokenizer.texts_to_sequences(try1['sentences_200_str'])

In [26]:
word_index = tokenizer.word_index

In [28]:
all_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(try1['label']))
print('Shape of data tensor:', all_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (4200, 200)
Shape of label tensor: (4200, 14)


In [30]:
from sklearn.model_selection import train_test_split
x_train_1,x_val_1,y_train_1,y_val_1 = train_test_split(all_data,labels,test_size=0.1,stratify=labels)

In [32]:
x_train_1.shape,y_train_1.shape

((3780, 200), (3780, 14))

In [33]:
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalMaxPooling1D
from keras.models import Sequential

model = Sequential()
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Dropout(0.2))
model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(3))
model.add(Flatten())
model.add(Dense(EMBEDDING_DIM, activation='relu'))
model.add(Dense(labels.shape[1], activation='softmax'))
model.summary()
#plot_model(model, to_file='model.png',show_shapes=True)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
print(model.metrics_names)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 200)          16913000  
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 200)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 198, 250)          150250    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 66, 250)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 16500)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               3300200   
_________________________________________________________________
dense_2 (Dense)              (None, 14)                2814      
Total para

In [35]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, History 
model_path ='/home/jovyan/jt071-group23/jt071080/cnn_TFIDF_200.h5'

checkpoint = ModelCheckpoint(model_path, monitor='val_acc', save_best_only=True, verbose=1)
earlystop = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
history = History()

model.fit(x_train_1, y_train_1,
          validation_data=(x_val_1, y_val_1),
          epochs=20,
          callbacks = [earlystop, checkpoint, history],
          batch_size=128)


Train on 3780 samples, validate on 420 samples
Epoch 1/5
Epoch 00001: val_acc improved from -inf to 0.77857, saving model to /home/jovyan/jt071-group23/jt071080/cnn_TFIDF_200.h5
Epoch 2/5
Epoch 00002: val_acc did not improve
Epoch 3/5
Epoch 00003: val_acc did not improve
Epoch 4/5
Epoch 00004: val_acc did not improve
Epoch 5/5
Epoch 00005: val_acc did not improve


<keras.callbacks.History at 0x7fbc4687c0f0>

In [36]:
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model

In [37]:
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

epochs = 50
batch_size = 128


inputs = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding = Embedding(len(word_index)+1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(inputs)
reshape = Reshape((MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,1))(embedding)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], EMBEDDING_DIM), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], EMBEDDING_DIM), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], EMBEDDING_DIM), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=labels.shape[1], activation='softmax')(dropout)

model = Model(inputs=inputs, outputs=output)

checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 200, 200)     16913000    input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 200, 200, 1)  0           embedding_2[0][0]                
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 198, 1, 512)  307712      reshape_1[0][0]                  
__________________________________________________________________________________________________
conv2d_2 (

In [39]:
model.fit(x_train_1, y_train_1, 
          batch_size=batch_size, 
          epochs=30, 
          verbose=1, 
          validation_data=(x_val_1, y_val_1))  # starts training


Train on 3780 samples, validate on 420 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fbc41a73470>