In [1]:
from tensorflow import keras
from keras import layers
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Flatten, MaxPooling1D, Input, Concatenate
from plot_keras_history import plot_history
import matplotlib.pyplot as plt

In [70]:
class TextCNN(object):
    def __init__(self, classes, config):
        self.models = {}
        self.classes = classes
        self.num_class = len(classes)
        self.config = config
        self.model = self._build()

    def _build(self):
        model = Sequential()
        model.add(Embedding(self.config['vocab_size'], self.config['embedding_dim'], 
                                input_length=self.config['maxlen'],
                                embeddings_initializer="uniform", trainable=True))
        model.add(Conv1D(128, 7, activation='relu',padding='same'))
        model.add(MaxPooling1D())
        model.add(Conv1D(256, 5, activation='relu',padding='same'))
        model.add(MaxPooling1D())
        model.add(Conv1D(512, 3, activation='relu',padding='same'))
        model.add(MaxPooling1D())
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(self.num_class, activation=None))
        model.add(Dense(self.num_class, activation='sigmoid'))
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        model.summary()
        return model

    def fit(self, train_x, train_y, validate_x, validate_y):
        history = self.model.fit(train_x, train_y,
                            epochs=self.config['epochs'],
                            verbose=True,
                            validation_data=(validate_x, validate_y),
                            batch_size=self.config['batch_size'])
        return history

In [43]:
import yaml

with open("../config/cnn_config.yaml", 'r') as config_file:
    try:
        config = yaml.safe_load(config_file)
    except yaml.YAMLError as err:
        print(err)

config['training']['embedding_dim'] = 32
config['training']['maxlen'] = 8

In [10]:
import re
import string
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def clean_text(text):
    text = text.strip().lower().replace('\n', '')
    # tokenization
    words = re.split(r'\W+', text)  # or just words = text.split()
    # filter punctuation
    filter_table = str.maketrans('', '', string.punctuation)
    clean_words = [w.translate(filter_table) for w in words if len(w.translate(filter_table))]
    return clean_words

data_df = pd.read_csv('../data/train.csv')
data_df[config['preprocessing']['input_text_column']].fillna("unknown", inplace=True)
X = data_df[config['preprocessing']['input_text_column']].apply(clean_text).values

# 构建词汇表

In [22]:
word2ind = {}
ind2word = {}
specialchars = ['<pad>','<unk>']

def addword(word2ind,ind2word,word):
    if word in word2ind:
        return 
    ind2word[len(word2ind)] = word
    word2ind[word] = len(word2ind)



for one in specialchars:
    addword(word2ind,ind2word,one)
    
for sent in X:
    for word in sent:
        addword(word2ind,ind2word,word)

In [23]:
Y = data_df.drop([config['preprocessing']['input_id_column'], config['preprocessing']['input_text_column']], 1).values

# 把文字改造成id

In [24]:
train_x_ids = []
train_y = Y
for sent in X:
    indsent = [word2ind.get(i,word2ind['<unk>']) for i in sent]
    train_x_ids.append(indsent)

train_x = np.array(train_x_ids)

In [51]:
# Batch对齐：用<pad>补齐所有batch的序列长度
train_x = keras.preprocessing.sequence.pad_sequences(train_x, maxlen=config['training']['maxlen'], padding='post',value=word2ind['<pad>'])

In [52]:
train_x_in, validate_x_in, train_y_in, validate_y_in = train_test_split(
                train_x, train_y,
                test_size=config['preprocessing']['split_ratio'],
                random_state=config['preprocessing']['random_seed'])

In [71]:
config['training']['vocab_size'] = len(word2ind.keys())
textcnn = TextCNN(config['preprocessing']['classes'], config['training'])

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 8, 32)             6739360   
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 8, 128)            28800     
_________________________________________________________________
max_pooling1d_21 (MaxPooling (None, 4, 128)            0         
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 4, 256)            164096    
_________________________________________________________________
max_pooling1d_22 (MaxPooling (None, 2, 256)            0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 2, 512)            393728    
_________________________________________________________________
max_pooling1d_23 (MaxPooling (None, 1, 512)          

In [72]:
print(len(train_x_in[0]))
train_x[0]
train_y_in.shape

8


(111699, 6)

In [73]:
history = textcnn.fit(train_x_in, train_y_in, validate_x_in, validate_y_in)



In [78]:
history.history['loss']

[0.15051141381263733]

In [87]:
probs = textcnn.model.predict(train_x_in[0:10])

In [90]:
probs >= 0.5

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])