In [17]:
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
import keras
import pandas as pd
import numpy as np
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,roc_auc_score,f1_score
from tqdm import tqdm

In [2]:
# Read the competition CSVs from the local data folder.
train = pd.read_csv('./nlp-getting-started/train.csv')
test = pd.read_csv('./nlp-getting-started/test.csv')
xtest = test["text"].values

# Stratified 90/10 hold-out split of the raw tweet texts and their targets.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train["text"].values,
    train["target"].values,
    test_size=0.1,
    shuffle=True,
    random_state=42,
    stratify=train["target"].values,
)

# Sanity check: one shape per array, in the order train/valid texts, train/valid labels, test texts.
print(*(part.shape for part in (xtrain, xvalid, ytrain, yvalid, xtest)))

(6851,) (762,) (6851,) (762,) (3263,)


In [4]:
import os

# Directory of the pretrained BERT checkpoint and the three files read from it.
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path, checkpoint_path, vocab_path = (
    os.path.join(pretrained_path, filename)
    for filename in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

In [37]:
from keras_bert import load_trained_model_from_checkpoint

# Maximum number of tokens per example. It has to exist before the model is
# built: the notebook's hyper-parameter cell sits further down, so a fresh
# top-to-bottom run would otherwise fail here with a NameError. The value
# matches that cell and the (None, 128) input shapes in the summary below.
SEQ_LEN = 128

# training=True keeps the pre-training heads in the graph (the summary lists
# the MLM and NSP layers); the classifier built later reads the 'NSP-Dense' layer.
# trainable=True leaves every weight trainable at load time.
model = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    training=True,
    trainable=True,
    seq_len=SEQ_LEN,
)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, 128)          0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, 128)          0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 128, 768), ( 23440896    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 128, 768)     1536        Input-Segment[0][0]              
__________________________________________________________________________________________________
Embedding-

__________________________________________________________________________________________________
Encoder-4-FeedForward (FeedForw (None, 128, 768)     4722432     Encoder-4-MultiHeadSelfAttention-
__________________________________________________________________________________________________
Encoder-4-FeedForward-Dropout ( (None, 128, 768)     0           Encoder-4-FeedForward[0][0]      
__________________________________________________________________________________________________
Encoder-4-FeedForward-Add (Add) (None, 128, 768)     0           Encoder-4-MultiHeadSelfAttention-
                                                                 Encoder-4-FeedForward-Dropout[0][
__________________________________________________________________________________________________
Encoder-4-FeedForward-Norm (Lay (None, 128, 768)     1536        Encoder-4-FeedForward-Add[0][0]  
__________________________________________________________________________________________________
Encoder-5-

Encoder-9-MultiHeadSelfAttentio (None, 128, 768)     2362368     Encoder-8-FeedForward-Norm[0][0] 
__________________________________________________________________________________________________
Encoder-9-MultiHeadSelfAttentio (None, 128, 768)     0           Encoder-9-MultiHeadSelfAttention[
__________________________________________________________________________________________________
Encoder-9-MultiHeadSelfAttentio (None, 128, 768)     0           Encoder-8-FeedForward-Norm[0][0] 
                                                                 Encoder-9-MultiHeadSelfAttention-
__________________________________________________________________________________________________
Encoder-9-MultiHeadSelfAttentio (None, 128, 768)     1536        Encoder-9-MultiHeadSelfAttention-
__________________________________________________________________________________________________
Encoder-9-FeedForward (FeedForw (None, 128, 768)     4722432     Encoder-9-MultiHeadSelfAttention-
__________

__________________________________________________________________________________________________
NSP-Dense (Dense)               (None, 768)          590592      Extract[0][0]                    
__________________________________________________________________________________________________
MLM (Masked)                    (None, 128, 30522)   0           MLM-Sim[0][0]                    
                                                                 Input-Masked[0][0]               
__________________________________________________________________________________________________
NSP (Dense)                     (None, 2)            1538        NSP-Dense[0][0]                  
Total params: 109,811,516
Trainable params: 109,811,516
Non-trainable params: 0
__________________________________________________________________________________________________


In [38]:
from keras import layers
import keras
from keras.layers import Dense, Input
from keras.optimizers import Adam
from keras.models import Model
from keras.callbacks import ModelCheckpoint
# import tensorflow_hub as hub
import tokenization
# Build a binary classifier on top of the loaded BERT graph.
# Only the first two model inputs (Input-Token, Input-Segment) are fed.
inputs = model.inputs[:2]
# Output of BERT's 'NSP-Dense' layer: (None, 768) according to the model summary.
dense = model.get_layer('NSP-Dense').output

# The hidden layers read the 768-d features directly. The previous version first
# squeezed them through a 20-unit softmax layer (a leftover 20-class output layer),
# which discarded most of the signal before the head could use it.
x = layers.Dense(300, activation="relu")(dense)
x = layers.Dropout(0.3)(x)
x = layers.BatchNormalization()(x)

x = layers.Dense(300, activation="relu")(x)
x = layers.Dropout(0.3)(x)
x = layers.BatchNormalization()(x)

# Two-way softmax to match the one-hot labels and categorical cross-entropy loss.
y = layers.Dense(2, activation="softmax")(x)

model = keras.models.Model(inputs, y)

model.compile(Adam(lr=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])
# Stop once val_loss has not improved for 3 consecutive epochs.
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

In [8]:
import pandas as pd

# Read the labelled and unlabelled competition files into `train` and `test`.
train, test = map(
    pd.read_csv,
    ('./nlp-getting-started/train.csv', './nlp-getting-started/test.csv'),
)

In [9]:
from keras_bert import Tokenizer
import codecs
# Map each vocabulary entry (one per line of vocab.txt) to an integer id.
# The id is the dictionary size at the moment the entry is stored.
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as vocab_file:
    for raw_line in vocab_file:
        token_dict[raw_line.strip()] = len(token_dict)

In [12]:
# Word-piece tokenizer backed by the vocabulary mapping built from vocab.txt.
tokenizer = Tokenizer(token_dict)
# Smoke test: encode() returns a (token ids, segment ids) pair, as shown in the output below.
tokenizer.encode("hello there")

([101, 7592, 2045, 102], [0, 0, 0, 0])

In [22]:
def convert_data(data_df,DATA_COLUMN="text",LABEL_COLUMN="target",test_mode=1):
    """Tokenize a text column into BERT inputs and collect the matching labels.

    Relies on the module-level ``tokenizer``, ``SEQ_LEN`` and ``tqdm``.

    Args:
        data_df: DataFrame holding the texts (and the labels when
            ``test_mode`` is truthy). Rows are read by position, so the
            frame's index does not have to be 0..n-1.
        DATA_COLUMN: name of the text column.
        LABEL_COLUMN: name of the label column; only read when ``test_mode``
            is truthy.
        test_mode: despite its name, a truthy value means the frame HAS
            labels (they are read from ``LABEL_COLUMN``); a falsy value means
            it has none and every target is set to -1.

    Returns:
        ``([token_ids, segment_ids], targets)`` where ``token_ids`` is the
        array of ids returned by ``tokenizer.encode``, ``segment_ids`` is an
        all-zero array of the same shape, and ``targets`` is an array with
        one entry per row.

    Labelled rows are shuffled (token ids and targets together) using
    NumPy's global random state. Unlabelled rows keep the frame's order so
    that predictions made on them can be matched back to the source rows.
    """
    has_labels = bool(test_mode)

    # Positional access via .values avoids KeyErrors on frames whose index is
    # not a plain RangeIndex (e.g. after filtering or splitting).
    texts = data_df[DATA_COLUMN].values
    indices = np.array(
        [tokenizer.encode(text, max_len=SEQ_LEN)[0] for text in tqdm(texts)]
    )

    if has_labels:
        targets = np.array(data_df[LABEL_COLUMN].values)
        # Shuffle inputs and labels with one shared permutation.
        order = np.random.permutation(len(indices))
        indices, targets = indices[order], targets[order]
    else:
        # No labels available: placeholder targets, original row order kept.
        targets = np.full(len(indices), -1)

    return [indices, np.zeros_like(indices)], targets

In [30]:
# Display the two input tensors (token ids, segment ids) the classifier was built on.
inputs

[<tf.Tensor 'Input-Token:0' shape=(?, 128) dtype=float32>,
 <tf.Tensor 'Input-Segment:0' shape=(?, 128) dtype=float32>]

In [23]:
# Labelled data: a truthy test_mode makes convert_data read the target column.
train_x, train_y = convert_data(train, test_mode=1)
# Unlabelled data: a falsy test_mode makes convert_data fill the targets with -1.
test_x, test_y = convert_data(test, test_mode=0)

100%|████████████████████████████████████████████████████████████████████████████| 7613/7613 [00:05<00:00, 1422.90it/s]
100%|████████████████████████████████████████████████████████████████████████████| 3263/3263 [00:02<00:00, 1517.58it/s]


In [36]:
import keras
# One-hot encode the integer targets for the softmax output / categorical cross-entropy loss.
# The number of classes is inferred from the labels because num_classes is not passed.
ytrain_enc = keras.utils.to_categorical(train_y)


In [39]:
# Freeze layers 3..105 (inclusive) so they are not updated during fine-tuning.
# NOTE(review): index 2 (Embedding-Token in the listing below) stays trainable — confirm this is intended.
for i, layer in enumerate(model.layers):
    print(i, layer.name)
    if 2 < i <= 105:
        layer.trainable = False

# Keras fixes the set of trainable weights when compile() is called, and the model
# was compiled before this loop ran. Recompile with the same settings so the
# freeze actually applies during fit().
model.compile(Adam(lr=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])

0 Input-Token
1 Input-Segment
2 Embedding-Token
3 Embedding-Segment
4 Embedding-Token-Segment
5 Embedding-Position
6 Embedding-Dropout
7 Embedding-Norm
8 Encoder-1-MultiHeadSelfAttention
9 Encoder-1-MultiHeadSelfAttention-Dropout
10 Encoder-1-MultiHeadSelfAttention-Add
11 Encoder-1-MultiHeadSelfAttention-Norm
12 Encoder-1-FeedForward
13 Encoder-1-FeedForward-Dropout
14 Encoder-1-FeedForward-Add
15 Encoder-1-FeedForward-Norm
16 Encoder-2-MultiHeadSelfAttention
17 Encoder-2-MultiHeadSelfAttention-Dropout
18 Encoder-2-MultiHeadSelfAttention-Add
19 Encoder-2-MultiHeadSelfAttention-Norm
20 Encoder-2-FeedForward
21 Encoder-2-FeedForward-Dropout
22 Encoder-2-FeedForward-Add
23 Encoder-2-FeedForward-Norm
24 Encoder-3-MultiHeadSelfAttention
25 Encoder-3-MultiHeadSelfAttention-Dropout
26 Encoder-3-MultiHeadSelfAttention-Add
27 Encoder-3-MultiHeadSelfAttention-Norm
28 Encoder-3-FeedForward
29 Encoder-3-FeedForward-Dropout
30 Encoder-3-FeedForward-Add
31 Encoder-3-FeedForward-Norm
32 Encoder-4-Mul

In [40]:
# for layer in model.layers:
#     print(layer.name,layer.trainable)

In [41]:
# Training configuration.
# SEQ_LEN: tokens per encoded example; BATCH_SIZE: samples per gradient step.
SEQ_LEN, BATCH_SIZE = 128, 32
# Upper bound on epochs passed to model.fit.
EPOCHS = 100
# Learning-rate constant.
LR = 1e-4

In [None]:
# Fine-tune on the encoded tweets; 30% of the samples are held out for validation
# and the early-stopping callback watches the validation loss.
model.fit(
    x=train_x,
    y=ytrain_enc,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[earlystop],
    validation_split=0.3,
)

Train on 5329 samples, validate on 2284 samples
Epoch 1/100
 224/5329 [>.............................] - ETA: 4:53:12 - loss: 0.9527 - acc: 0.59 - ETA: 4:35:13 - loss: 0.8987 - acc: 0.59 - ETA: 4:36:48 - loss: 1.0263 - acc: 0.55 - ETA: 4:38:21 - loss: 0.9822 - acc: 0.55 - ETA: 4:36:08 - loss: 1.0085 - acc: 0.53 - ETA: 4:31:17 - loss: 1.0021 - acc: 0.53 - ETA: 4:23:34 - loss: 1.0074 - acc: 0.5179

In [None]:
from sklearn.metrics import roc_auc_score

# Class probabilities for the unlabelled test set. There are no labels to score
# these against; they are kept for building a submission.
predictions = model.predict(test_x)

# Score on labelled data instead. model.fit was called with validation_split=0.3,
# and Keras takes that validation set from the END of the arrays it is given,
# so the same tail of train_x / train_y is the held-out portion.
# (Scoring the test-set predictions against `yvalid` cannot work: the two arrays
# have different lengths and come from different data.)
val_start = int(len(train_y) * (1. - 0.3))
val_inputs = [arr[val_start:] for arr in train_x]
val_predictions = model.predict(val_inputs)
# print(classification_report(val_predictions.argmax(axis=1), train_y[val_start:]))
score = roc_auc_score(train_y[val_start:], val_predictions[:, 1])
print("{} score: {}".format("BERT", score))