In [1]:
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dense, Dropout, Input, Flatten, Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Sequential, Model
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping


import numpy as np

import pandas as pd


In [2]:
# carregar modelo word2vec
word2vec = KeyedVectors.load_word2vec_format('../skip_s100.txt')
vocab_size, embedding_size = word2vec.vectors.shape

In [3]:
# carregar dados de treinamento do CSV
df = pd.read_csv('../train.csv', header=None, names=['texto', 'classe'])
X = df['texto'].values
y = df['classe'].values

In [4]:
# tokenizar o texto
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

# padronizar o tamanho das sequencias
max_length = max([len(text) for text in X])
X = pad_sequences(X, maxlen=max_length, padding='post')

In [5]:
embedding_layer = Embedding(vocab_size, embedding_size, weights=[word2vec.vectors], input_length=max_length, trainable=False)
sequence_input = Input(shape=(max_length,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 6, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(6)(l_cov1)
l_cov2 = Conv1D(128, 2, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(1)(l_cov2)
l_flat = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(7, activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("Simplified convolutional neural network")
model.summary()


Simplified convolutional neural network
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 18)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 18, 100)           92960600  
_________________________________________________________________
conv1d (Conv1D)              (None, 13, 128)           76928     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 2, 128)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1, 128)            32896     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1, 128)            0         
_________________________________________________________________
flatten (Flatten)    

In [6]:
# separar dados em treinamento e validação
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = to_categorical(y_train, 7)
y_val = to_categorical(y_val, 7)

In [7]:
early_stopping_monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=7,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True
)

In [8]:
# treinar modelo
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping_monitor])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30


<keras.callbacks.History at 0x225e1f51fc8>