### Mount Data

In [None]:
# Mount Google Drive at /content/drive; the cells below read the project's
# requirements, data, and model files from '/content/drive/My Drive/...'.
from google.colab import drive
# NOTE(review): force_remount=True presumably replaces an existing mount so the
# cell can be re-run safely - confirm against the Colab docs.
drive.mount('/content/drive', force_remount=True)

In [None]:
!pip install -r '/content/drive/My Drive/ESUM-NLP-Competition/requirements.txt'

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers

from tokenizers import BertWordPieceTokenizer

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

### AML Classification

In [None]:
# Load the dataset used in the AML classification section. The split in the
# next cell reads its `news` and `IND` columns.
data = pd.read_csv('/content/drive/My Drive/ESUM-NLP-Competition/data/all.csv')
# Show (rows, columns) as a quick check that the file loaded.
data.shape

In [None]:
# Hold out 20% of the rows for validation. The split is stratified on the
# `IND` label and seeded so it is reproducible across runs.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    data.news.values,
    data.IND.values,
    test_size=0.2,
    shuffle=True,
    stratify=data.IND.values,
    random_state=101,
)

In [None]:
import transformers
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=192):
    """
    Encode text into fixed-length sequences of token ids for BERT input.

    Parameters
    ----------
    texts : str or iterable of str
        A single text or a collection of texts. A bare string is treated as
        a batch of one, so the result always has one row per text.
    tokenizer : object
        Tokenizer exposing ``enable_truncation``, ``enable_padding`` and
        ``encode_batch`` whose results carry an ``ids`` attribute.
    chunk_size : int
        Number of texts handed to ``encode_batch`` per call.
    maxlen : int
        Length passed to the tokenizer for both truncation and padding.

    Returns
    -------
    np.ndarray
        Array built from the ``ids`` of every encoding, one row per text.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)

    # Previously the whole input was wrapped as ``[texts]``, so a list of
    # texts was submitted as ONE item and ``chunk_size`` was never used.
    if isinstance(texts, str):
        texts = [texts]
    else:
        # Materialise arrays / Series / generators into a sliceable list.
        texts = list(texts)

    all_ids = []
    for start in range(0, len(texts), chunk_size):
        encs = tokenizer.encode_batch(texts[start:start + chunk_size])
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

In [None]:
# Load the tokenizer for the "bert-base-chinese" checkpoint.
# NOTE(review): `from_pt` is also passed to TFBertModel.from_pretrained later in
# this notebook, where it looks like a model-weights option; it is presumably
# unnecessary for a tokenizer - confirm before removing.
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-chinese", from_pt = True)

# Save the loaded tokenizer's files into the current working directory
tokenizer.save_pretrained('.')
# Reload the saved vocabulary with the huggingface `tokenizers` library;
# lowercase=False is passed so the text is not lower-cased.
fast_tokenizer = BertWordPieceTokenizer('./vocab.txt', lowercase=False)
# Bare expression: displays the tokenizer's repr as the cell output.
fast_tokenizer

In [None]:
from tensorflow.keras import backend as K

def recall_m(y_true, y_pred):
    """
    Batch recall: rounded true positives over rounded actual positives.

    Both the product ``y_true * y_pred`` and ``y_true`` are clipped to
    [0, 1] and rounded before summing; ``K.epsilon()`` is added to the
    denominator so a batch without positives does not divide by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual + K.epsilon())

def precision_m(y_true, y_pred):
    """
    Batch precision: rounded true positives over rounded predicted positives.

    Both the product ``y_true * y_pred`` and ``y_pred`` are clipped to
    [0, 1] and rounded before summing; ``K.epsilon()`` is added to the
    denominator so a batch without positive predictions does not divide
    by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (predicted + K.epsilon())

def f1_m(y_true, y_pred):
    """
    Batch F1: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
def build_model(transformer, max_len=128):
    """
    Build and compile a binary classifier on top of a transformer encoder.

    This function only constructs and compiles the model; it does not train.

    Parameters
    ----------
    transformer : callable
        Encoder layer/model called on the int32 id tensor. Element ``[0]`` of
        its output is used as the sequence output.
    max_len : int
        Length of the ``input_word_ids`` input (token ids per example).

    Returns
    -------
    tensorflow.keras.Model
        Model mapping ``input_word_ids`` to a single sigmoid output, compiled
        with Adam (learning rate 1e-5), binary cross-entropy, and the
        accuracy / F1 / precision / recall metrics.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Use only the vector at sequence position 0 as the classifier input.
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # optimizers no longer accept.
    model.compile(
        optimizer=Adam(learning_rate=1e-5),
        loss="binary_crossentropy",
        metrics=['accuracy', f1_m, precision_m, recall_m],
    )

    return model

In [None]:
#IMP DATA FOR CONFIG

AUTO = tf.data.experimental.AUTOTUNE
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)


valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)


In [None]:
EPOCHS = 12
BATCH_SIZE = 16
MAX_LEN = 224

In [None]:
%%time
strategy = tf.distribute.get_strategy()
with strategy.scope():
    transformer_layer = (
        transformers.TFBertModel
        .from_pretrained('bert-base-chinese' ,from_pt = True)
    )
    cls = build_model(transformer_layer, max_len=MAX_LEN)
cls.summary()

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Callbacks shared by the classifier training runs in this notebook:
# stop after 3 epochs of patience, and reduce the learning rate once
# `val_loss` has gone 1 epoch without improving.
early_stop = EarlyStopping(patience=3)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=1, mode='auto')

# The training dataset repeats forever, so one "epoch" is bounded to roughly
# one pass over the training rows.
n_steps = x_train.shape[0] // BATCH_SIZE

train_history = cls.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    callbacks=[early_stop, reduce_lr],
)

In [None]:
# Continue training on the (repeated) validation data for 2 more epochs.
# NOTE(review): `n_steps` was computed from the TRAINING set size, so each
# epoch here makes several passes over the smaller validation set - confirm
# this is intended.
train_history_2 = cls.fit(
    valid_dataset.repeat(),
    epochs=2,
    steps_per_epoch=n_steps,
)

In [None]:
# Persist the trained classifier weights to the mounted Drive folder.
# HDF5 format is requested explicitly via save_format='h5'; note that the
# target path itself carries no '.h5' suffix.
cls.save_weights('/content/drive/My Drive/ESUM-NLP-Competition/model/bert-Classification-model-224', save_format='h5')

### Sentence selection

In [None]:
def sents_encode(texts, tokenizer, chunk_size=32, maxlen=64):
    """
    Turn a sliceable collection of texts into an array of token-id rows.

    The tokenizer is configured to truncate and pad to ``maxlen``; the texts
    are then fed to ``tokenizer.encode_batch`` in consecutive slices of
    ``chunk_size`` items and the ``ids`` of every encoding are collected in
    input order.

    Returns a ``np.ndarray`` built from the collected id lists.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)

    chunk_starts = range(0, len(texts), chunk_size)
    id_rows = [
        encoding.ids
        for start in chunk_starts
        for encoding in tokenizer.encode_batch(texts[start:start + chunk_size])
    ]

    return np.array(id_rows)

In [None]:
# Load the dataset for the sentence selection section; the split in the next
# cell reads its `news` and `IND` columns.
x_df_ = pd.read_csv('/content/drive/My Drive/ESUM-NLP-Competition/data/x_sents_1.csv')

In [None]:
# Stratified, seeded 80/20 split of the sentence dataset. This rebinds
# xtrain / xvalid / ytrain / yvalid from the AML classification section.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    x_df_.news.values,
    x_df_.IND.values,
    test_size=0.2,
    shuffle=True,
    stratify=x_df_.IND.values,
    random_state=1010,
)

In [None]:
# Replace the raw sentence arrays with their token-id encodings, truncated and
# padded to 64 ids (the same length the sentence model is built with below).
# NOTE: this rebinds xtrain / xvalid in place, so the cell is not idempotent -
# re-running it would feed already-encoded arrays back into sents_encode.
# Re-run the split cell first.
xtrain = sents_encode(xtrain, fast_tokenizer, maxlen=64)
xvalid = sents_encode(xvalid, fast_tokenizer, maxlen=64)

In [None]:
# Batch size for the sentence classifier (overrides the earlier value).
AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 64

# Training pipeline: endless repeat -> shuffle (buffer 2048) -> batch -> prefetch.
train_dataset = tf.data.Dataset.from_tensor_slices((xtrain, ytrain))
train_dataset = train_dataset.repeat().shuffle(2048).batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: fixed order, batch -> cache -> prefetch.
valid_dataset = tf.data.Dataset.from_tensor_slices((xvalid, yvalid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

In [None]:
# Load a fresh pretrained encoder and build the sentence classifier for
# 64-id inputs, inside the current distribution strategy's scope.
strategy = tf.distribute.get_strategy()
with strategy.scope():
    transformer_layer = transformers.TFBertModel.from_pretrained(
        'bert-base-chinese', from_pt=True
    )
    csent = build_model(transformer_layer, max_len=64)

# Print the layer / parameter overview of the compiled model.
csent.summary()

In [None]:
# One epoch is roughly one pass over the encoded training sentences.
# EPOCHS, early_stop and reduce_lr are the objects defined earlier in the
# AML classification section.
n_steps = xtrain.shape[0] // BATCH_SIZE

train_history = csent.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    callbacks=[early_stop, reduce_lr],
)

In [None]:
# Continue training on the (repeated) validation data for 1 more epoch.
# NOTE(review): `n_steps` was computed from the TRAINING set size, so this
# epoch makes several passes over the smaller validation set - confirm this
# is intended.
train_history_2 = csent.fit(
    valid_dataset.repeat(),
    epochs=1,
    steps_per_epoch=n_steps,
)

In [None]:
# Persist the trained sentence-classifier weights to the mounted Drive folder.
# HDF5 format is requested explicitly via save_format='h5'; note that the
# target path itself carries no '.h5' suffix.
csent.save_weights('/content/drive/My Drive/ESUM-NLP-Competition/model/Sents-Classification-model-64', save_format='h5')

### BiGRU Model (NER)

In [None]:
import kashgari
from kashgari.tasks.labeling import BiGRU_Model
from kashgari.embeddings import BertEmbedding


# Configuration for the NER model.
# SEQUENCE_LENGTH, EPOCHS and EARL_STOPPING_PATIENCE are read by the training
# cell below. EPOCHS rebinds the value used by the classifiers above.
SEQUENCE_LENGTH = 100
EPOCHS = 30
EARL_STOPPING_PATIENCE = 5
# NOTE(review): REDUCE_RL_PATIENCE is not referenced by the visible training
# cell, which hard-codes patience=2 for ReduceLROnPlateau - confirm which
# value is intended.
REDUCE_RL_PATIENCE = 3

# NOTE(review): BATCH_SIZE is not passed to model.fit in the visible training
# cell - confirm whether it should be.
BATCH_SIZE = 64
# Folder on the mounted Drive holding the pretrained Chinese BERT checkpoint
# used for the embedding.
BERT_PATH = '/content/drive/My Drive/ESUM-NLP-Competition/bert-zh/chinese_wwm_ext_L-12_H-768_A-12'
embed = BertEmbedding(BERT_PATH)

In [None]:
from kashgari.corpus import ChineseDailyNerCorpus
import kashgari

# Load the three splits of Kashgari's ChineseDailyNerCorpus in the order
# train, test, valid; each load_data call returns an (x, y) pair.
(train_x, train_y), (test_x, test_y), (valid_x, valid_y) = (
    ChineseDailyNerCorpus.load_data(split) for split in ('train', 'test', 'valid')
)

In [None]:
# EvalCallBack was used below without ever being imported.
from kashgari.callbacks import EvalCallBack

# TensorBoard log location. Both names were referenced below without being
# defined anywhere in the notebook; these are local defaults - adjust them
# (e.g. to a Drive folder) if the logs should persist across sessions.
TF_LOG_FOLDER = './tf_logs'
run_name = 'bert-bigru-ner-{}'.format(SEQUENCE_LENGTH)

model = BiGRU_Model(embed, sequence_length=SEQUENCE_LENGTH)

# `keras` was never imported as a bare name; the callbacks are taken from
# `tf.keras`, which is already in scope.
early_stop = tf.keras.callbacks.EarlyStopping(patience=EARL_STOPPING_PATIENCE)
# NOTE(review): patience=2 is kept from the original cell even though a
# REDUCE_RL_PATIENCE constant is defined above - confirm which is intended.
reduse_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(factor=0.1,
                                                          patience=2)
print(model.sequence_length)

# Evaluate on the validation split every 2 epochs during training.
eval_callback = EvalCallBack(kash_model=model,
                             x_data=valid_x,
                             y_data=valid_y,
                             truncating=True,
                             step=2)

tf_board = tf.keras.callbacks.TensorBoard(
    log_dir=os.path.join(TF_LOG_FOLDER, run_name),
    update_freq=1000
)

callbacks = [early_stop, reduse_lr_callback, eval_callback, tf_board]

model.build_model(train_x, train_y)
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# optimizers no longer accept.
model.compile_model(optimizer=Adam(learning_rate=1e-5))
model.fit(train_x, train_y, valid_x, valid_y,
          callbacks=callbacks,
          epochs=EPOCHS)

In [None]:
# Persist the trained NER model to the mounted Drive folder. The stray double
# slash after 'My Drive' was removed so the path matches the other save
# locations in this notebook.
model.save("/content/drive/My Drive/ESUM-NLP-Competition/model/Bert-Chinese_BiGRU_Model-100")