In [1]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

tf.get_logger().setLevel('ERROR')
from sklearn.model_selection import train_test_split
# Load the two pre-processed corpora; each one supplies a single class of the binary task.
rt_data = pd.read_csv('rt_prep.csv')
ia_data = pd.read_csv('ia_prep.csv')

# Label every row by its source: rt rows -> 0, ia rows -> 1.
rt_data['target'] = [0] * len(rt_data)
ia_data['target'] = [1] * len(ia_data)

# Fixed positional split: the first 500 rows of each corpus are held out for
# testing, everything after them goes to training. rt rows come first in both frames.
test_data = pd.concat([rt_data.iloc[:500], ia_data.iloc[:500]])
train_data = pd.concat([rt_data.iloc[500:], ia_data.iloc[500:]])

In [2]:
def build_classifier_model(
    preprocess_handle="https://tfhub.dev/jeongukjae/xlm_roberta_multi_cased_preprocess/1",
    encoder_handle="https://tfhub.dev/jeongukjae/xlm_roberta_multi_cased_L-12_H-768_A-12/1",
    dropout_rate=0.1,
    trainable=True,
):
    """Build a single-logit text classifier on top of a TF-Hub encoder.

    The model takes a batch of raw strings, runs them through the hub
    preprocessing layer and the hub encoder, applies dropout to the encoder's
    ``pooled_output`` and projects it to one un-activated logit with a Dense
    layer named ``'classifier'``.

    Args:
        preprocess_handle: TF-Hub handle of the text preprocessing layer.
        encoder_handle: TF-Hub handle of the encoder; its output must be a
            mapping that contains a ``"pooled_output"`` entry.
        dropout_rate: Dropout rate applied to the pooled output.
        trainable: Whether the encoder weights are trainable.

    Returns:
        A ``tf.keras.Model`` mapping a string tensor of shape ``(batch,)`` to
        logits produced by ``Dense(1, activation=None)``.

    Note:
        All defaults reproduce the original hard-coded architecture. Weights
        saved from that architecture can only be restored into a model built
        with a structurally matching encoder.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

    # Raw strings -> encoder input dict (handled entirely by the hub layer).
    preprocessor = hub.KerasLayer(preprocess_handle)
    encoder_inputs = preprocessor(text_input)

    encoder = hub.KerasLayer(encoder_handle, trainable=trainable)
    encoder_outputs = encoder(encoder_inputs)

    # Sentence-level embedding; [batch_size, 768] for the default encoder.
    net = encoder_outputs["pooled_output"]
    net = tf.keras.layers.Dropout(dropout_rate)(net)
    # No activation: the head emits logits, so callers apply the sigmoid themselves.
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)
# Instantiate the classifier graph; its weights are restored from a checkpoint in a later cell.
model = build_classifier_model()

In [3]:
# Restore weights from the `cp-0010` checkpoint under models/bert3
# (presumably epoch 10 of fine-tuning — TODO confirm). load_weights returns a
# CheckpointLoadStatus object, which is echoed as this cell's output.
model.load_weights('models/bert3/cp-0010.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2c465022110>

In [4]:
# Raw outputs of the `classifier` head (Dense(1, activation=None)): un-squashed
# logits, one per held-out text.
y_pred = model.predict(test_data.text_cleaned.values, batch_size=16)

import numpy as np
def sigmoid(x):
    """Map logits to probabilities with the logistic function 1 / (1 + exp(-x)).

    Computed as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp``, so large
    negative logits no longer overflow inside ``exp`` (the naive form emits a
    RuntimeWarning there). Values, shape and floating dtype match the naive
    formula.

    Args:
        x: A scalar or NumPy array of logits.

    Returns:
        The element-wise sigmoid of ``x``, in the range [0, 1].
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), evaluated stably.
    sig = np.exp(-np.logaddexp(0, -x))
    return sig

# Ground-truth labels of the held-out rows.
y_test = test_data.target.values
# Hard labels: thresholding the logit at 0 is the same as thresholding the probability at 0.5.
y_pred_class = (y_pred > 0).astype(int)
# Probabilities are only needed as scores for ROC AUC.
y_pred_proba = sigmoid(y_pred)

print('accuracy: {:.3f}, f1_score: {:.3f}, roc_auc_score: {:.3f}'.format(
    accuracy_score(y_test, y_pred_class),
    f1_score(y_test, y_pred_class),
    roc_auc_score(y_test, y_pred_proba),
))

accuracy: 0.899, f1_score: 0.897, roc_auc_score: 0.916


In [5]:
# Print the layer-by-layer structure (output shapes, parameter counts, connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['input_1[0][0]']                
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [6]:
# List the layer objects in order; the last entry (index 4) is the Dense `classifier` head.
model.layers

[<keras.engine.input_layer.InputLayer at 0x2c40e8cf790>,
 <tensorflow_hub.keras_layer.KerasLayer at 0x2c40ece5390>,
 <tensorflow_hub.keras_layer.KerasLayer at 0x2c40f282500>,
 <keras.layers.core.dropout.Dropout at 0x2c649016230>,
 <keras.layers.core.dense.Dense at 0x2c6492e3f40>]

In [7]:
# Feature extractor: same input as the classifier, but the output is whatever
# feeds the Dense `classifier` head (the dropout output, i.e. the pooled
# 768-dimensional embedding). The head is looked up by name rather than by
# position so the cell keeps working if layers are added or reordered.
new_model = tf.keras.Model(model.input, model.get_layer('classifier').input)

In [8]:
%%time
train_data_features = new_model.predict(train_data.text_cleaned.values, batch_size=16)
test_data_features = new_model.predict(test_data.text_cleaned.values, batch_size=16)

CPU times: total: 2min 55s
Wall time: 2min 11s


In [9]:
# Sanity check: one 768-dimensional feature vector per row of each split.
train_data_features.shape, test_data_features.shape

((8613, 768), (1000, 768))

In [10]:
def _export_with_features(frame, features, csv_path):
    """Write `frame` side by side with its extracted feature columns to `csv_path`.

    The frame's index is reset so its rows line up positionally with the rows
    of `features`; the resulting 'index' column and the 'Unnamed: 0' column are
    then dropped before writing.
    """
    feature_frame = pd.DataFrame(features)
    combined = pd.concat([frame.reset_index(), feature_frame], axis=1)
    combined.drop(columns=['index', 'Unnamed: 0']).to_csv(csv_path)


_export_with_features(train_data, train_data_features, 'train_data_features.csv')
_export_with_features(test_data, test_data_features, 'test_data_features.csv')