In [1]:
# bert reference link
# https://medium.com/@aieeshashafique/feature-extraction-from-bert-25887ed2152a
# https://github.com/AyeshaShafique/bert-feature-extraction-tf-2.0/blob/master/bert_embeddings_with_tensorflow_2_0.ipynb
# https://colab.research.google.com/drive/1hMLd5-r82FrnFnBub-B-fVW78Px4KPX1#scrollTo=Ik3xqHqXM_lN

In [2]:
#!pip install tensorflow==2.0
#!pip install tensorflow_hub #0.8.0
#!pip install bert-for-tf2

In [3]:
import tensorflow_hub as hub
import tensorflow as tf
from tensorflow.keras.models import Model 
import bert
import pandas as pd
import numpy as np

In [4]:
# Record the library versions this notebook was executed with.
for label, module in (("TF version: ", tf), ("Hub version: ", hub)):
    print(label, module.__version__)

TF version:  2.0.0
Hub version:  0.8.0


In [5]:
# Fixed length of every model input; the preprocessing helpers pad up to this
# length and reject anything longer.
max_seq_length = 256

# The three int32 inputs consumed by the BERT layer, each of shape (max_seq_length,).
input_word_ids, input_mask, segment_ids = (
    tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=input_name)
    for input_name in ("input_word_ids", "input_mask", "segment_ids")
)

# Pre-trained uncased BERT encoder from TensorFlow Hub, frozen (trainable=False)
# because it is only used for feature extraction here.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False,
)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

In [6]:
# Wrap the BERT layer in a Keras model so that predict() returns both the
# pooled output and the per-token sequence output.
model = Model(
    inputs=[input_word_ids, input_mask, segment_ids],
    outputs=[pooled_output, sequence_output],
)

In [7]:
# See BERT paper: https://arxiv.org/pdf/1810.04805.pdf
# And BERT implementation convert_single_example() at https://github.com/google-research/bert/blob/master/run_classifier.py

def get_masks(tokens, max_seq_length):
    """Build the padding mask for a token sequence.

    Args:
        tokens: sequence of tokens (only its length is used).
        max_seq_length: total length of the returned mask.

    Returns:
        A list of ``max_seq_length`` ints: 1 for each real token position
        followed by 0 for each padding position.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    n_padding = max_seq_length - n_tokens
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")
    mask = [1] * n_tokens
    mask.extend([0] * n_padding)
    return mask


def get_segments(tokens, max_seq_length):
    """Build the segment ids for a (possibly two-part) token sequence.

    Every token up to and including the first "[SEP]" gets id 0; every token
    after it gets id 1. Padding positions are filled with 0.

    Args:
        tokens: sequence of token strings.
        max_seq_length: total length of the returned list.

    Returns:
        A list of ``max_seq_length`` segment ids.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    # Start from all zeros (first segment + padding) and only mark the
    # positions that come after the first separator.
    segment_ids = [0] * max_seq_length
    separator_seen = False
    for position, token in enumerate(tokens):
        if separator_seen:
            segment_ids[position] = 1
        elif token == "[SEP]":
            separator_seen = True
    return segment_ids


def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids and zero-pad to ``max_seq_length``.

    Args:
        tokens: sequence of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: total length of the returned list.

    Returns:
        A list of ``max_seq_length`` ints: the token ids followed by 0s.

    Raises:
        IndexError: if the id sequence is longer than ``max_seq_length``.
            Without this check the result would silently be longer than
            ``max_seq_length``; get_masks() and get_segments() already
            reject such input with the same error.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    if len(token_ids) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return list(token_ids) + [0] * (max_seq_length - len(token_ids))

In [8]:
# Short alias for the tokenizer class exposed by the imported `bert` package.
FullTokenizer = bert.bert_tokenization.FullTokenizer
# Reference implementation: https://github.com/google-research/bert/blob/master/tokenization.py

In [9]:
# Read the vocabulary path and casing flag from the loaded hub layer itself,
# then build the tokenizer from those two values.
vocab_file, do_lower_case = (
    bert_layer.resolved_object.vocab_file.asset_path.numpy(),
    bert_layer.resolved_object.do_lower_case.numpy(),
)
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [11]:
#source file
# Date range that identifies the input files (and, later, the output files).
data_string = '20171001-20200430'
path_news = './data/news_{}.gzip'.format(data_string)
path_price = './data/price_{}.gzip'.format(data_string)

# Both files are gzip-compressed CSVs whose first column is the row index.
df_news, df_price = (
    pd.read_csv(csv_path, compression='gzip', index_col=0)
    for csv_path in (path_news, path_price)
)

In [18]:
# Previously stored embeddings (presumably written by an earlier run of this
# notebook -- confirm). The caching loop further down matches rows on 'title'
# and reuses every column after the first as the embedding.
df_pastnews = pd.read_csv('./bert_output.csv')

In [34]:
# Sanity check: find the longest tokenized (title, description) pair, special
# tokens included, so it can be compared against max_seq_length.
checklen = []
for i in np.arange(df_news.shape[0]):
    s1, s2 = df_news.loc[i, 'title'], df_news.loc[i, 'description']
    # Anything that is not a plain string (e.g. NaN for missing text) counts as empty.
    s1 = s1 if type(s1) == str else ""
    s2 = s2 if type(s2) == str else ""

    stokens1 = tokenizer.tokenize(s1)
    stokens2 = tokenizer.tokenize(s2)
    stokens = ["[CLS]", *stokens1, "[SEP]", *stokens2, "[SEP]"]
    checklen.append(len(stokens))

print(np.max(checklen))

234


In [46]:
# Empty results table: the news title plus one column per embedding dimension
# (output_0 ... output_767).
df_bert_output = pd.DataFrame(
    columns=['title'] + ['output_' + str(dim) for dim in range(768)]
)

In [10]:
#s = "Hi we are using BERT"
#s = "I'm testing the model with adfadfasdfafd wrong words"

# Embed every news item: title and description are fed to BERT as a sentence
# pair and the pooled output becomes one row of df_bert_output.
for i in np.arange(df_news.shape[0]):

    s1 = df_news.loc[i,'title']
    s2 = df_news.loc[i,'description']
    # Missing text is not a str (pandas yields NaN); embed it as an empty string.
    if not isinstance(s1, str):
        s1 = ""
    if not isinstance(s2, str):
        s2 = ""

    stokens1 = tokenizer.tokenize(s1)
    stokens2 = tokenizer.tokenize(s2)
    stokens = ["[CLS]"] + stokens1 + ["[SEP]"]+ stokens2 + ["[SEP]"]

    input_ids = get_ids(stokens, tokenizer, max_seq_length)
    input_masks = get_masks(stokens, max_seq_length)
    input_segments = get_segments(stokens, max_seq_length)
    # Batch of one example; only the pooled output is stored below.
    pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

    # Write the full row by label in a single assignment. The previous
    # `.loc[i, 1:]` used an integer slice with the label-based indexer on
    # string column names, which newer pandas versions reject with TypeError.
    df_bert_output.loc[i] = [s1] + list(pool_embs[0])

In [48]:
# Persist the embeddings as a gzip-compressed CSV, without the index column.
df_bert_output.to_csv(
    'bert_output{}.gzip'.format(data_string),
    index=False,
    compression='gzip',
)

In [49]:
# Persist the same table as a plain CSV, without the index column.
df_bert_output.to_csv(
    'bert_output{}.csv'.format(data_string),
    index=False,
)

## for testing

In [47]:
#s = "Hi we are using BERT"
#s = "I'm testing the model with adfadfasdfafd wrong words"

# Embedding pass with a cache: rows whose (non-empty) title already appears in
# df_pastnews reuse the stored vector; all other rows are run through the model.
for i in np.arange(df_news.shape[0]):
    print("\r {} out of {}".format(i,len(df_news)),end="")
    s1 = df_news.loc[i,'title']
    s2 = df_news.loc[i,'description']
    # Missing text is not a str (pandas yields NaN); treat it as an empty string.
    if not isinstance(s1, str):
        s1 = ""
    if not isinstance(s2, str):
        s2 = ""

    # Evaluate the title match once per row. Empty titles are never looked up,
    # so items without a title are always embedded from their description.
    cached = df_pastnews[df_pastnews['title'] == s1] if s1 != "" else None
    if cached is not None and len(cached) > 0:
        # First matching row; column 0 is the title, the rest is the embedding.
        embedding = cached.values[0][1:]
    else:
        stokens1 = tokenizer.tokenize(s1)
        stokens2 = tokenizer.tokenize(s2)
        stokens = ["[CLS]"] + stokens1 + ["[SEP]"]+ stokens2 + ["[SEP]"]

        input_ids = get_ids(stokens, tokenizer, max_seq_length)
        input_masks = get_masks(stokens, max_seq_length)
        input_segments = get_segments(stokens, max_seq_length)
        # Batch of one example; only the pooled output is stored below.
        pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])
        embedding = pool_embs[0]

    # Write the full row by label in a single assignment. The previous
    # `.loc[i, 1:]` used an integer slice with the label-based indexer on
    # string column names, which newer pandas versions reject with TypeError.
    df_bert_output.loc[i] = [s1] + list(embedding)

 5347 out of 5348

In [None]:
#for inspecting individual record -- testing
# Row label of the record to inspect.
_rows = 1432
s1 = df_news.loc[_rows,'title']
s2 = df_news.loc[_rows,'description']
# Apply the same guard as the batch loops: a missing title/description is not
# a str (the null-title check in this notebook lists row 1432), so replace it
# with an empty string before tokenizing.
if not isinstance(s1, str):
    s1 = ""
if not isinstance(s2, str):
    s2 = ""
stokens1 = tokenizer.tokenize(s1)
stokens2 = tokenizer.tokenize(s2)
stokens = ["[CLS]"] + stokens1 + ["[SEP]"]+ stokens2 + ["[SEP]"]
input_ids = get_ids(stokens, tokenizer, max_seq_length)
input_masks = get_masks(stokens, max_seq_length)
input_segments = get_segments(stokens, max_seq_length)
# Batch of one example; returns the pooled output and the per-token output.
pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

In [45]:
# Show the news rows that have no title.
df_news[df_news['title'].isnull()]

Unnamed: 0,author,title,description,url,publishedAt,content,from,date
1432,Lulu Yilun Chen,,"A Chinese e-commerce site called Pinduoduo, or...",https://www.bloomberg.com/news/articles/2018-0...,2018-06-30T06:08:00Z,"A Chinese e-commerce site called Pinduoduo, or...",Bloomberg,2018-06-30


In [29]:
# Dump the tokens and the three model inputs of the last processed record,
# one per line.
print(stokens, input_ids, input_masks, input_segments, sep="\n")

['[CLS]', 'ten', '##cent', 'music', 'explores', 'anticipated', 'huge', 'ip', '##o', 'for', 'its', 'streaming', 'music', 'service', '[SEP]', 'china', '’', 's', 'largest', 'music', 'streaming', 'company', ',', 'ten', '##cent', 'music', 'entertainment', 'group', ',', 'is', 'poised', 'to', 'create', 'an', 'initial', 'public', 'offering', '(', 'ip', '##o', ')', 'and', 'is', 'negotiating', 'with', 'several', 'banks', 'for', 'under', '##writing', '.', 'the', 'wall', 'street', 'journal', 'reported', 'that', 'the', 'successful', 'debut', 'of', 'spot', '##ify', 't', '…', '[SEP]']
[101, 2702, 13013, 2189, 15102, 11436, 4121, 12997, 2080, 2005, 2049, 11058, 2189, 2326, 102, 2859, 1521, 1055, 2922, 2189, 11058, 2194, 1010, 2702, 13013, 2189, 4024, 2177, 1010, 2003, 22303, 2000, 3443, 2019, 3988, 2270, 5378, 1006, 12997, 2080, 1007, 1998, 2003, 18875, 2007, 2195, 5085, 2005, 2104, 18560, 1012, 1996, 2813, 2395, 3485, 2988, 2008, 1996, 3144, 2834, 1997, 3962, 8757, 1056, 1529, 102, 0, 0, 0, 0, 0, 0, 