In [2]:
import numpy as np 
import pandas as pd

import os

import tokenization
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [3]:
# Display the installed tensorflow_hub version so the run environment is recorded with the notebook.
hub.__version__

'0.8.0'

In [4]:
# Read the training and validation splits with identical CSV options.
train_data, test_data = (
    pd.read_csv(csv_name, encoding='latin-1')
    for csv_name in ('train_40k.csv', 'val_10k.csv')
)
# Last expression of the cell: shows (rows, columns) of the training frame.
train_data.shape

(40000, 10)

In [5]:
# Preview the first rows of the training frame to inspect the available columns.
train_data.head()

Unnamed: 0,productId,Title,userId,Helpfulness,Score,Time,Text,Cat1,Cat2,Cat3
0,B000E46LYG,Golden Valley Natural Buffalo Jerky,A3MQDNGHDJU4MK,0/0,3.0,-1,The description and photo on this product need...,grocery gourmet food,meat poultry,jerky
1,B000GRA6N8,Westing Game,unknown,0/0,5.0,860630400,This was a great book!!!! It is well thought t...,toys games,games,unknown
2,B000GRA6N8,Westing Game,unknown,0/0,5.0,883008000,"I am a first year teacher, teaching 5th grade....",toys games,games,unknown
3,B000GRA6N8,Westing Game,unknown,0/0,5.0,897696000,I got the book at my bookfair at school lookin...,toys games,games,unknown
4,B00000DMDQ,I SPY A is For Jigsaw Puzzle 63pc,unknown,2/4,5.0,911865600,Hi! I'm Martine Redman and I created this puzz...,toys games,puzzles,jigsaw puzzles


In [6]:
# Target: map each distinct 'Score' value to an integer class id, then one-hot encode.
label = preprocessing.LabelEncoder()
y = label.fit_transform(train_data['Score'])
# Pin the one-hot width to the number of classes the encoder learned, so the
# train and validation targets always have the same number of columns.
y = to_categorical(y, num_classes=len(label.classes_))

# Reuse the encoder fitted on the training scores. Without an explicit
# num_classes, to_categorical would size the matrix from the largest class id
# present in the validation split, which can be narrower than the training one.
ytest = label.transform(test_data['Score'])
ytest = to_categorical(ytest, num_classes=len(label.classes_))
print(y[:5])

[[0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]


In [7]:
# Auxiliary feature: one-hot encoding of the top-level category column 'Cat1'.
input_cat = preprocessing.LabelEncoder()
cat_x = input_cat.fit_transform(train_data['Cat1'])
# Pin the one-hot width to the number of categories the encoder learned.
cat_x = to_categorical(cat_x, num_classes=len(input_cat.classes_))

# Encode the validation categories with the same encoder and the same width;
# otherwise a validation split missing the highest category id would produce
# fewer columns than the training matrix.
cat_xtest = input_cat.transform(test_data['Cat1'])
cat_xtest = to_categorical(cat_xtest, num_classes=len(input_cat.classes_))

print(cat_x[:5])
print(cat_x.shape)

[[0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]]
(40000, 6)


In [8]:
# Load the BERT module published at this TF-Hub URL and wrap it as a Keras layer.
# trainable=True is requested so the module can be fine-tuned together with the head.
m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(m_url, trainable=True)

In [9]:
# Build the tokenizer from the assets shipped with the loaded hub module, so the
# vocabulary and lower-casing setting come from the same object as bert_layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length token-id, mask and segment-id arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` markers and right-padded with zeros to ``max_len``.

    Args:
        texts: Iterable of texts accepted by ``tokenizer.tokenize``.
        tokenizer: Object providing ``tokenize(text)`` (returns a list of
            tokens) and ``convert_tokens_to_ids(tokens)`` (returns a list of
            ids).
        max_len: Length of every output row, including the two markers.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of NumPy arrays built from
        one row per text. Masks hold 1 for real positions and 0 for padding;
        segment ids are all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, because the two markers
            would not fit and rows would come out with inconsistent lengths.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to hold [CLS] and [SEP], got %r" % (max_len,)
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        # Single-sentence input: every position belongs to segment 0.
        segment_ids = [0] * max_len

        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [10]:
def build_model(bert_layer,Cat_X, max_len=512):
    """Build and compile the review-score classifier.

    Three token-level inputs are passed through ``bert_layer``; the first
    position of its sequence output is concatenated with a category feature
    vector and fed to a dense head that ends in a 5-way softmax.

    Args:
        bert_layer: Layer called on ``[input_word_ids, input_mask,
            segment_ids]`` and returning ``(pooled_output, sequence_output)``.
        Cat_X: Category feature array; only ``Cat_X.shape[1]`` is read, to
            size the ``Input_cat_X`` input.
        max_len: Length of each of the three token-level inputs.

    Returns:
        A compiled ``tf.keras`` model taking
        ``[input_word_ids, input_mask, segment_ids, Input_cat_X]``.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Size the category input from the argument rather than from a notebook
    # global, so the function works with whatever feature matrix is passed in.
    Input_cat_X = tf.keras.Input(shape=(Cat_X.shape[1],), dtype=tf.float32, name="Input_cat_X")

    # The pooled output is returned by the layer but not used by this head.
    _pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    clf_output = sequence_output[:, 0, :]  # keeping only the CLS term

    merge = tf.keras.layers.concatenate([clf_output, Input_cat_X], axis=1)

    # Dense head: each hidden layer is followed by dropout.
    lay = merge
    for units in (256, 128, 32):
        lay = tf.keras.layers.Dense(units, activation='relu')(lay)
        lay = tf.keras.layers.Dropout(0.2)(lay)
    out = tf.keras.layers.Dense(5, activation='softmax')(lay)

    model = tf.keras.models.Model(
        inputs=[input_word_ids, input_mask, segment_ids, Input_cat_X],
        outputs=out,
    )
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [11]:
# Sequence length used for both the encoded inputs and the model's input layers.
max_len = 60

# Encode the review text of each split into (token ids, masks, segment ids).
train_input, test_input = (
    bert_encode(frame.Text.values, tokenizer, max_len=max_len)
    for frame in (train_data, test_data)
)

# Aliases for the one-hot targets prepared earlier.
train_labels, test_label = y, ytest

In [12]:
# Show the distinct score values held by the fitted label encoder.
labels = label.classes_
print(labels)

[1. 2. 3. 4. 5.]


In [13]:
# Build the classifier with the same max_len used when encoding the texts,
# then print its layer summary.
model = build_model(bert_layer,cat_x, max_len=max_len)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 60)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 60)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 60)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             



In [14]:
# Callbacks are constructed here, but the fit() call below does not pass them
# (the `callbacks=` argument is left commented out).
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'modelreview.h5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)
earlystopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    verbose=1,
)
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir="logs",
    histogram_freq=1,
)

# Model inputs in the order build_model declares them:
# token ids, masks, segment ids, then the one-hot category features.
l = [*train_input, cat_x]

train_sh = model.fit(
    l,
    train_labels,
    validation_split=0.2,
    # callbacks=[checkpoint, earlystopping, tensorboard_callback],
    epochs=2,
    batch_size=16,
)

Epoch 1/2
Epoch 2/2


In [15]:
# Validation inputs in the same order as the training inputs:
# token ids, masks, segment ids, then the one-hot category features.
ltest = [*test_input, cat_xtest]
import keras

# evaluate() returns the loss followed by the compiled metrics.
evaluation = model.evaluate(ltest, ytest)
print(evaluation)

# Disabled: reload the checkpointed model and evaluate it instead.
#custom_objects = {"KerasLayer": bert_layer}
#model_load = keras.models.load_model("modelreview.h5",custom_objects)
#model_load.evaluate(ltest,ytest)

[0.746426522731781, 0.7206000089645386]
