In [1]:
import numpy as np
import pandas as pd



In [2]:
content_df = pd.read_csv('./data/Train_Dataset.csv')
label_df = pd.read_csv('./data/Train_Dataset_Label.csv')

In [3]:
df = content_df.merge(label_df, on='id')
df = df.sample(frac=1).reset_index(drop=True)

In [4]:
print(df[df['label'] == 2].shape, df[df['label'] == 1].shape, df[df['label'] == 0].shape)

(2931, 4) (3646, 4) (763, 4)


In [14]:
df = df.dropna()

In [15]:
config_path = './bert_model/bert_config.json'
checkpoint_path = './bert_model/bert_model.ckpt'
vocab_path = './bert_model/vocab.txt'

In [8]:
import codecs
from keras_bert import load_trained_model_from_checkpoint

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

In [10]:
SEQ_LEN = 256
model = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    training=True,
    trainable=True,
    seq_len=SEQ_LEN,
)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




In [11]:
import os
import numpy as np
from tqdm import tqdm
from keras_bert import Tokenizer

In [12]:
tokenizer = Tokenizer(token_dict)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]') # space类用未经训练的[unused1]表示
            else:
                R.append('[UNK]') # 剩余的字符是[UNK]
        return R

tokenizer = OurTokenizer(token_dict)

In [15]:
BATCH_SIZE = 32
indices, sentiments = [], []
data = df[['content', 'label']].values
for i in range(data.shape[0]):
    text = data[i][0]
    sentiment = data[i][1]
    ids, segments = tokenizer.encode(text, max_len=SEQ_LEN)
    indices.append(ids)
    sentiments.append(sentiment)
items = list(zip(indices, sentiments))
indices, sentiments = zip(*items)
indices = np.array(indices)
mod = indices.shape[0] % BATCH_SIZE
if mod > 0:
    indices, sentiments = indices[:-mod], sentiments[:-mod]

In [16]:
X = [indices, np.zeros_like(indices)]
y = np.array(sentiments)

In [17]:
LR = 1e-4
import keras
inputs = model.inputs[:2]
dense = model.get_layer('NSP-Dense').output
outputs = keras.layers.Dense(units=3, activation='softmax')(dense)
model = keras.models.Model(inputs, outputs)
model.compile(
    keras.optimizers.Adam(lr=LR),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [18]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = keras.utils.np_utils.to_categorical(y)

model.fit(
    X,
    y,
    epochs=1,
    batch_size=BATCH_SIZE,
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/1
1152/7264 [===>..........................] - ETA: 1:55:50 - loss: 0.9420 - acc: 0.5920

KeyboardInterrupt: 

In [19]:
test_text = ['超级好用，真的喜欢', '一点效果都没有']
indices = []
for text in test_text:
    ids, segments = tokenizer.encode(text, max_len=SEQ_LEN)
    indices.append(ids)
indices = np.array(indices)
X_test = [indices, np.zeros_like(indices)]

In [20]:
model.predict(X_test)

array([[0.11966151, 0.8803385 ],
       [0.13234515, 0.8676548 ]], dtype=float32)