## 连接 colab drive

In [0]:
# Mount Google Drive at /content/drive/ so the notebook can reach files stored there.
from google.colab import drive
drive.mount('/content/drive/')

In [0]:
cd /content/drive/My Drive/NLP_study/classifier_study/

In [0]:
!ls

# 模型训练模块

## 导入相关库

In [0]:
# 导入相应的库
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
import datetime
import numpy as np
import io
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.layers import Reshape, Flatten, Concatenate, concatenate,Dropout, SpatialDropout1D, CuDNNLSTM, CuDNNGRU
from keras.layers import GlobalMaxPooling1D, MaxPooling1D, Add, Flatten,GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
import os
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.optimizers import Adam
from keras.models import load_model
import keras
import nltk
import string
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import pickle

## 数据导入

In [0]:
# Record the wall-clock time at which the run started
starttime = datetime.datetime.now()

data_path = 'data/'
train_path = data_path+'train_clean.csv'
emb_path = data_path + 'glove.6B.50d.txt'
model_path = 'model/'

# Load the training CSV; missing values are replaced with a single space
train = pd.read_csv(train_path).fillna(' ')#[0:5000]
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
text_name = 'clean'
# Hyper-parameters and paths shared by the cells below
max_features = 100000 # vocabulary cap: passed to Tokenizer(num_words=...) and used as the Embedding input size
maxlen = 100 # length every tokenized comment is padded to via pad_sequences(maxlen=...)
embed_size = 50 # dimension of the pretrained word vectors
batch_size = 128
epochs = 100
num_filters = 32 # number of convolution filters per kernel size
rnn_type = 'GRU'
tokenizer_path = model_path+'tokenizer.pkl'
train_size = 0.7  # fraction handed to train_test_split as train_size; the remainder is used for validation

## 函数

In [0]:
def open_file(fname):
    '''
    Read a whitespace-separated word-vector text file into a dictionary.

    Each line is expected to hold a token followed by its coefficients,
    separated by single spaces.

    :param fname: string, path of the UTF-8 encoded vector file
    :return
      dict mapping each token to a float32 numpy array of its coefficients
    '''
    vectors = {}
    with open(fname, encoding='utf8') as handle:
        for raw_line in handle:
            # First field is the token, everything after it is the vector.
            token, *numbers = raw_line.rstrip().rsplit(' ')
            vectors[token] = np.asarray(numbers, dtype='float32')
    return vectors

def getTrainTokenizer(train,text_name,max_features,maxlen,tokenizer_path):
    '''
    Fit a tokenizer on the training text, turn the text into fixed-length
    integer sequences, and pickle the fitted tokenizer for later reuse.

    :param train: dataframe, training set
    :param text_name: string, name of the column in `train` holding the text
    :param max_features: int, passed to the tokenizer as `num_words`
    :param maxlen: int, length every sequence is brought to by `pad_sequences`
    :param tokenizer_path: string, file the fitted tokenizer is pickled to
    :return
      X_train: matrix, padded integer sequences for the training text
      tokenizer: object, the fitted tokenizer
    '''
    texts = train[text_name].values
    # Build the vocabulary from the training text only.
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(texts))
    # Map each comment to a list of word indices, e.g. [1, 2, 3] ...
    X_train = tokenizer.texts_to_sequences(texts)
    # ... and bring every list to the same length.
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    # Persist the tokenizer so prediction code can reuse the same vocabulary.
    with open(tokenizer_path,'wb') as f:
        pickle.dump(tokenizer, f)
    return X_train,tokenizer


def getEmbeddingMatrix(fname,tokenizer,embed_size):
    '''
    Build the embedding weight matrix for the tokenizer's vocabulary from a
    pretrained word-vector file.

    Row `i` holds the pretrained vector of the word whose tokenizer index is
    `i`; rows for words missing from the vector file stay all-zero. The
    vocabulary cap is read from the module-level `max_features`.

    :param fname: string, path of the pretrained word-vector file
    :param tokenizer: object, fitted tokenizer exposing `word_index`
    :param embed_size: int, dimension of the pretrained word vectors
    :return
      embedding_matrix: matrix of shape (num_words, embed_size)
    '''
    embeddings_index = open_file(fname)
    word_index = tokenizer.word_index
    # Keras tokenizer indices start at 1 (0 is reserved), so the largest index
    # is len(word_index); the matrix needs one extra row or the last word of a
    # small vocabulary would fall outside it.
    num_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, embed_size))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


class RocAucEvaluation(Callback):
    """
    Keras callback that prints the ROC-AUC score on a held-out set.

    :param validation_data: tuple (X_val, y_val) scored at the end of epochs
    :param interval: int, score every `interval`-th epoch (0-based epoch index)
    """
    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself; the previous
        # super(Callback, self) call skipped Callback.__init__.
        super(RocAucEvaluation, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score."""
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

## 数据加载

In [0]:
print('star CNN...')
# Label matrix: one column per entry of class_names
Y_train = train[class_names].values
# Tokenize and pad the text column; this also pickles the tokenizer to tokenizer_path
X_train,tokenizer = getTrainTokenizer(train,text_name,max_features,maxlen,tokenizer_path)
# Embedding weights for the tokenizer vocabulary, taken from the word-vector file at emb_path
embedding_matrix = getEmbeddingMatrix(emb_path,tokenizer,embed_size)

## Basemodel 模型

In [0]:
class Basemodel():
    '''
    Parent class of all models; holds the behaviour the models share
    (configuration, training loop, prediction from the saved file).
    '''
    def __init__(self,maxlen,max_features,embed_size,embedding_matrix,model_save_path):
        '''
          Store the model configuration.
          :param maxlen: int, length of the padded input sequences
          :param max_features: int, vocabulary size used by the embedding layer
          :param embed_size: int, dimension of the pretrained word vectors
          :param embedding_matrix: matrix, pretrained embedding weights
          :param model_save_path: string, file the trained model is saved to
        '''
        self.maxlen = maxlen
        self.max_features = max_features
        self.embed_size = embed_size
        self.embedding_matrix = embedding_matrix
        self.model_save_path = model_save_path
        self.dropout = 0.5
        self.optimizer = 'adam'
        # Size of the output layer. The attribute name is a typo of "output"
        # but is kept because the subclasses read it under this name.
        self.ouyput = 6
        # Set by build_model() in subclasses; None until a model is built.
        self.model = None

    def build_model(self):
        '''
          Build the network and store it in self.model.
          Each subclass provides its own implementation; the base version
          does nothing.
        '''
        pass

    def train(self,X_train,Y_train,train_size,batch_size,epochs):
        '''
          Split the data, compile and fit self.model, then save it to
          self.model_save_path.
          :param X_train: matrix, training inputs
          :param Y_train: matrix, training labels
          :param train_size: float, fraction of the data used for fitting;
                             the rest is used for validation
          :param batch_size: int, batch size
          :param epochs: int, maximum number of epochs
          :raises RuntimeError: if build_model() has not produced a model yet
        '''
        if self.model is None:
            raise RuntimeError('build_model() must be called before train()')
        # Hold out part of the data for validation.
        x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, train_size=train_size)
        self.model.compile(loss='binary_crossentropy',
                  optimizer=self.optimizer,
                  metrics=['accuracy'])
        # Print the validation ROC-AUC after every epoch.
        roc_auc = RocAucEvaluation(validation_data=(x_val, y_val), interval=1)
        # patience=0: stop as soon as val_loss fails to improve.
        early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0,
                                  patience=0, verbose=0, mode='auto',
                                  baseline=None, restore_best_weights=False)
        # verbose=2 prints one log line per epoch.
        self.model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                    validation_data=(x_val, y_val),
                    callbacks=[roc_auc, early_stopping], verbose=2)
        self.model.save(self.model_save_path)

    def predict(self,X_test):
        '''
          Load the model saved at self.model_save_path and predict with it.
          :param X_test: matrix, padded input sequences
          :return y_pred: the loaded model's predictions for X_test
        '''
        model = load_model(self.model_save_path)
        y_pred = model.predict(X_test)
        return y_pred
  
  

## Text-CNN 方法

### 模型构建

In [0]:
class TextCNN(Basemodel):
    '''
    Text-CNN classifier: parallel convolutions of several kernel heights over
    the embedded sequence, each max-pooled, concatenated and fed to a sigmoid
    output layer.
    '''
    def build_model(self):
        '''
          Build the network and store it in self.model.
          Sizes come from the instance (self.maxlen, self.embed_size); the
          number of filters is read from the module-level `num_filters`.
        '''
        inp = Input(shape=(self.maxlen, ))
        # Embedding layer initialised with the pretrained word vectors.
        x = Embedding(self.max_features, self.embed_size, weights=[self.embedding_matrix])(inp)
        x = SpatialDropout1D(0.2)(x)
        # Add a trailing channel axis of size 1 so Conv2D can be applied.
        x = Reshape((self.maxlen, self.embed_size, 1))(x)
        # Each kernel spans the full embedding width and `height` consecutive tokens.
        kernel_heights = (1, 2, 3, 5)
        convs = [
            Conv2D(num_filters, kernel_size=(height, self.embed_size),
                   kernel_initializer='normal', activation='elu')(x)
            for height in kernel_heights
        ]
        # Pool over the whole remaining sequence axis (maxlen - height + 1 positions).
        pools = [
            MaxPool2D(pool_size=(self.maxlen - height + 1, 1))(conv)
            for height, conv in zip(kernel_heights, convs)
        ]
        z = Concatenate(axis=1)(pools)
        z = Flatten()(z)
        z = Dropout(self.dropout)(z)
        # Dense output layer with one sigmoid unit per label.
        outp = Dense(self.ouyput, activation="sigmoid")(z)
        self.model = Model(inputs=inp, outputs=outp)
        self.model.summary(120)
  

### 模型训练

In [0]:
# Build the Text-CNN, train it, and save it to model/cnn_model.h5
text_cnn_model = TextCNN(maxlen,max_features,embed_size,embedding_matrix,model_path+'cnn_model.h5')
text_cnn_model.build_model()
text_cnn_model.train(X_train,Y_train,train_size,batch_size,epochs)

## Text-RNN 方法

### 模型构建

In [0]:
class TextRNN(Basemodel):
    '''
    Text-RNN classifier: a bidirectional recurrent layer over the embedded
    sequence, flattened and fed to a sigmoid output layer.
    '''
    def build_model(self,rnn_type):
        '''
          Build the network and store it in self.model.
          :param rnn_type: string, one of 'LSTM', 'GRU', 'CuDNNLSTM',
                           'CuDNNGRU'; any other value falls back to GRU
        '''
        rnn_cells = {
          'LSTM': LSTM,
          'GRU': GRU,
          'CuDNNLSTM': CuDNNLSTM,
          'CuDNNGRU': CuDNNGRU,
        }
        layer_cell = rnn_cells.get(rnn_type, GRU)
        rnn_kwargs = {'return_sequences': True}
        # The CuDNN layers do not take dropout arguments; passing them raises
        # TypeError, so only the plain LSTM/GRU layers get them.
        if layer_cell not in (CuDNNLSTM, CuDNNGRU):
            rnn_kwargs['dropout'] = self.dropout
            rnn_kwargs['recurrent_dropout'] = 0.1
        inp = Input(shape=(self.maxlen, ))
        # Embedding layer initialised with the pretrained word vectors.
        x = Embedding(self.max_features, self.embed_size, weights=[self.embedding_matrix])(inp)
        x = SpatialDropout1D(0.2)(x)
        x = Bidirectional(layer_cell(128, **rnn_kwargs))(x)
        x = Dropout(self.dropout)(x)
        # Flatten the per-timestep outputs into one feature vector.
        x = Flatten()(x)
        preds = Dense(self.ouyput, activation="sigmoid")(x)
        self.model = Model(inp, preds)
        self.model.summary(120)
    

### 模型训练

In [0]:
# Constructor args: maxlen, max_features, embed_size, embedding_matrix, model_save_path
# train args: X_train, Y_train, train_size, batch_size, epochs
text_rnn_model = TextRNN(maxlen,max_features,embed_size,embedding_matrix,model_path+'rnn_model.h5')
text_rnn_model.build_model('GRU')
text_rnn_model.train(X_train,Y_train,train_size,batch_size,epochs)

## Text-RCNN 方法

### 模型构建

In [0]:
class TextRCNN(Basemodel):
    '''
    Text-RCNN classifier: a bidirectional recurrent layer followed by a 1-D
    convolution; global average and global max pooling of the convolution
    output are concatenated and fed to a sigmoid output layer.
    '''
    def build_model(self,rnn_type):
        '''
          Build the network and store it in self.model.
          :param rnn_type: string, one of 'LSTM', 'GRU', 'CuDNNLSTM',
                           'CuDNNGRU'; any other value falls back to GRU
        '''
        rnn_cells = {
          'LSTM': LSTM,
          'GRU': GRU,
          'CuDNNLSTM': CuDNNLSTM,
          'CuDNNGRU': CuDNNGRU,
        }
        layer_cell = rnn_cells.get(rnn_type, GRU)
        rnn_kwargs = {'return_sequences': True}
        # The CuDNN layers do not take dropout arguments; passing them raises
        # TypeError, so only the plain LSTM/GRU layers get them.
        if layer_cell not in (CuDNNLSTM, CuDNNGRU):
            rnn_kwargs['dropout'] = self.dropout
            rnn_kwargs['recurrent_dropout'] = 0.1
        inp = Input(shape=(self.maxlen, ))
        # Embedding layer initialised with the pretrained word vectors.
        x = Embedding(self.max_features, self.embed_size, weights=[self.embedding_matrix])(inp)
        x = SpatialDropout1D(0.2)(x)
        x = Bidirectional(layer_cell(128, **rnn_kwargs))(x)
        x = Conv1D(64, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
        # Summarise the convolution output over the sequence axis in two ways.
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        x = concatenate([avg_pool, max_pool])
        preds = Dense(self.ouyput, activation="sigmoid")(x)
        self.model = Model(inp, preds)
        self.model.summary(120)
    

### 模型训练

In [0]:
# Constructor args: maxlen, max_features, embed_size, embedding_matrix, model_save_path
# train args: X_train, Y_train, train_size, batch_size, epochs
text_rcnn_model = TextRCNN(maxlen,max_features,embed_size,embedding_matrix,model_path+'rcnn_model.h5')
text_rcnn_model.build_model('GRU')
text_rcnn_model.train(X_train,Y_train,train_size,batch_size,epochs)

## 预测

### 函数

In [0]:
def text_process(text):
    '''
    Clean one piece of English text: lower-case it, strip ASCII punctuation,
    tokenize, drop English stop words and stem the remaining tokens.

    :param text: string, raw text
    :return
      cleaned_text: string, the stemmed tokens joined by single spaces
      cleaned_text_len: int, number of tokens left after cleaning
    '''
    lower = text.lower()
    # With two empty mappings, the third maketrans argument lists the
    # characters to delete; here that is every character in string.punctuation.
    remove = str.maketrans('','',string.punctuation)
    without_punctuation = lower.translate(remove)
    tokens = nltk.word_tokenize(without_punctuation)
    # Load the stop-word list once per call and use a set, instead of
    # re-evaluating stopwords.words() for every token.
    stop_words = set(stopwords.words('english'))
    without_stopwords = [w for w in tokens if w not in stop_words]
    stemmer = nltk.stem.SnowballStemmer('english')
    stemmed = [stemmer.stem(w) for w in without_stopwords]
    cleaned_text_len = len(stemmed)
    cleaned_text = ' '.join(stemmed)
    return cleaned_text,cleaned_text_len

def getTestTokenizer(test,maxlen,tokenizer_path):
    '''
    Convert texts to padded integer sequences using a previously pickled
    tokenizer.

    :param test: iterable of strings to convert
    :param maxlen: int, length every sequence is brought to by `pad_sequences`
    :param tokenizer_path: string, file the fitted tokenizer was pickled to
    :return
      X_test: matrix, padded integer sequences for `test`
      tokenizer: object, the tokenizer that was loaded
    '''
    # NOTE: pickle.load executes code embedded in the file; only load
    # tokenizer files this project wrote itself.
    with open(tokenizer_path, 'rb') as handle:
        fitted = pickle.load(handle)
    # Word indices per text, then pad everything to a common length.
    padded = sequence.pad_sequences(fitted.texts_to_sequences(test), maxlen=maxlen)
    return padded, fitted

threshold = 0.8 # 阙值 设定
def model_fusion(threshold,text_cnn_y_pred,text_rnn_y_pred,text_rcnn_y_pred,class_names,weights=(3, 3, 4)):
    '''
    Blend the predictions of the three models with a weighted average and
    binarize the result.

    :param threshold: float, blended scores above it become 1, scores at or
                      below it become 0
    :param text_cnn_y_pred: dataframe, Text-CNN predictions, one column per class
    :param text_rnn_y_pred: dataframe, Text-RNN predictions, one column per class
    :param text_rcnn_y_pred: dataframe, Text-RCNN predictions, one column per class
    :param class_names: list, names of the class columns to blend
    :param weights: 3-tuple of blend weights for (cnn, rnn, rcnn); the
                    default (3, 3, 4) reproduces the original fixed blend
    :return
      result: dataframe, copy of text_rnn_y_pred whose class columns hold the
              binarized blend; the inputs are not modified
    :raises ValueError: if the weights sum to zero
    '''
    w_cnn, w_rnn, w_rcnn = weights
    total = w_cnn + w_rnn + w_rcnn
    if total == 0:
        raise ValueError('weights must not sum to zero')
    result = text_rnn_y_pred.copy()
    for c in class_names:
        result[c] = (w_cnn * text_cnn_y_pred[c] + w_rnn * text_rnn_y_pred[c] + w_rcnn * text_rcnn_y_pred[c]) / total
        result.loc[result[c] > threshold, c] = 1
        result.loc[result[c] <= threshold, c] = 0
    return result

### 行预测

In [0]:
# Score a single raw comment with each of the three saved models.
test = "Fuck you, block me, you faggot pussy!"
# Apply the same cleaning as the training data, then wrap the text in a list
# because the tokenizer expects a collection of texts.
test,test_len = text_process(test)
test = [test]
X_test,tokenizer = getTestTokenizer(test,maxlen,tokenizer_path)
embedding_matrix = getEmbeddingMatrix(emb_path,tokenizer,embed_size)

# Basemodel defines predict() (which loads the saved .h5 file); there is no
# pred_line() method, so the calls below use predict().
text_cnn_model = TextCNN(maxlen,max_features,embed_size,embedding_matrix,model_path+'cnn_model.h5')
text_cnn_y_pred = pd.DataFrame(text_cnn_model.predict(X_test))
text_cnn_y_pred.columns = class_names
print(text_cnn_y_pred)

text_rnn_model = TextRNN(maxlen,max_features,embed_size,embedding_matrix,model_path+'rnn_model.h5')
text_rnn_y_pred = pd.DataFrame(text_rnn_model.predict(X_test))
text_rnn_y_pred.columns = class_names
print(text_rnn_y_pred)

text_rcnn_model = TextRCNN(maxlen,max_features,embed_size,embedding_matrix,model_path+'rcnn_model.h5')
text_rcnn_y_pred = pd.DataFrame(text_rcnn_model.predict(X_test))
text_rcnn_y_pred.columns = class_names
print(text_rcnn_y_pred)


In [0]:
# Blend the three models' predictions for the single comment and binarize them with `threshold`
result = model_fusion(threshold,text_cnn_y_pred,text_rnn_y_pred,text_rcnn_y_pred,class_names)
result

### 批量预测

In [0]:
# Use the first 50 rows of train.csv as the batch to predict on; missing values become a single space
test = pd.read_csv(data_path+'train.csv').fillna(' ')[0:50]
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Column of the raw CSV that holds the uncleaned comment text
text_name = 'comment_text'

In [0]:
# Number of rows in the prediction batch
len(test)

In [0]:
# text_process returns a (cleaned_text, token_count) pair per row; zip(*...) splits the pairs into two columns
test['clean'],test['sent_len'] = zip(*test[text_name].apply(text_process)) 
# Keep only the cleaned text for tokenization
test = test['clean']

In [0]:
# Tokenize and pad the cleaned batch with the pickled training tokenizer
X_test,tokenizer = getTestTokenizer(test,maxlen,tokenizer_path)
embedding_matrix = getEmbeddingMatrix(emb_path,tokenizer,embed_size)

# predict() loads each model from its saved .h5 file; the outputs are wrapped
# in dataframes with one column per class so model_fusion can blend them.
text_cnn_model = TextCNN(maxlen,max_features,embed_size,embedding_matrix,model_path+'cnn_model.h5')
text_cnn_y_pred = pd.DataFrame(text_cnn_model.predict(X_test))
text_cnn_y_pred.columns = class_names
# print(text_cnn_y_pred)

text_rnn_model = TextRNN(maxlen,max_features,embed_size,embedding_matrix,model_path+'rnn_model.h5')
text_rnn_y_pred = pd.DataFrame(text_rnn_model.predict(X_test))
text_rnn_y_pred.columns = class_names
# print(text_rnn_y_pred)

text_rcnn_model = TextRCNN(maxlen,max_features,embed_size,embedding_matrix,model_path+'rcnn_model.h5')
text_rcnn_y_pred = pd.DataFrame(text_rcnn_model.predict(X_test))
text_rcnn_y_pred.columns = class_names
# print(text_rcnn_y_pred)

In [0]:
# Blend the three models' batch predictions and binarize them with `threshold`
result = model_fusion(threshold,text_cnn_y_pred,text_rnn_y_pred,text_rcnn_y_pred,class_names)
result