In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import math
import bert

In [2]:
# Fixed length that every encoded example is padded to (passed to get_ids).
max_seq_length = 512

# Short alias for the WordPiece tokenizer class from the `bert` package.
FullTokenizer = bert.bert_tokenization.FullTokenizer

# Uncased English BERT (L-12 / H-768 / A-12) fetched from TF Hub as a Keras layer.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=True,
)

In [3]:
# Read the vocabulary path and the casing flag from the loaded hub module so
# the tokenizer is configured from the same assets as `bert_layer`.
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()

tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [5]:
def get_ids(tokens, tokenizer, max_seq_length):
    """Map tokens to vocabulary ids, returning exactly ``max_seq_length`` ids.

    Args:
        tokens: Sequence of token strings to convert.
        tokenizer: Object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: Length of the returned list.

    Returns:
        A list of ``max_seq_length`` ints. Shorter inputs are right-padded
        with ``0``. Longer inputs are truncated; the final id of the input is
        kept in the last position so a trailing terminator token (callers in
        this notebook append ``"[SEP]"``) survives the cut.
    """
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    limit = max(max_seq_length, 0)

    if len(token_ids) > limit:
        # Without this branch the padding count below would be negative,
        # `[0] * negative` would be empty, and the over-length list would be
        # returned unchanged.
        token_ids = (token_ids[:limit - 1] + token_ids[-1:]) if limit else []

    return token_ids + [0] * (limit - len(token_ids))

In [7]:
# Training split: turn each row's text into a padded list of BERT token ids
# and store it alongside the row's `deceptive` label.
train_raw = pd.read_csv('punctuation-small-train.csv')

extracted = []
for text, label in zip(train_raw['text'], train_raw['deceptive']):
    # Wrap the word pieces in the special start / end markers before encoding.
    wordpieces = ["[CLS]", *tokenizer.tokenize(text), "[SEP]"]
    extracted.append([get_ids(wordpieces, tokenizer, max_seq_length), label])

# The 'text' column now holds id lists instead of raw strings.
df = pd.DataFrame(extracted, columns=['text', 'deceptive'])
df.to_csv('punctuation-small-train-extracted.csv', index=False)

In [8]:
# Test split: turn each row's text into a padded list of BERT token ids
# and store it alongside the row's `deceptive` label.
test_raw = pd.read_csv('punctuation-small-test.csv')

extracted = []
for text, label in zip(test_raw['text'], test_raw['deceptive']):
    # Wrap the word pieces in the special start / end markers before encoding.
    wordpieces = ["[CLS]", *tokenizer.tokenize(text), "[SEP]"]
    extracted.append([get_ids(wordpieces, tokenizer, max_seq_length), label])

# The 'text' column now holds id lists instead of raw strings.
df = pd.DataFrame(extracted, columns=['text', 'deceptive'])
df.to_csv('punctuation-small-test-extracted.csv', index=False)