In [1]:
import os

import bert.tokenization as tokenization
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

In [2]:
# TF Hub handle of the uncased English BERT encoder (L-24 / H-1024 / A-16 per the handle name).
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# Wrap the hub module as a Keras layer with its weights frozen (trainable=False).
bert_layer = hub.KerasLayer(module_url, trainable=False)

# Directory that holds train.csv and test.csv. The default is the original
# author's checkout; set PRIVACY_PROJECT_DATA_DIR to run the notebook elsewhere
# without editing this cell.
DATA_DIR = os.environ.get(
    "PRIVACY_PROJECT_DATA_DIR",
    r"C:\Users\peter\Documents\GitHub\Privacy-Law-Technology-Project",
)
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [3]:
# Read the vocabulary file path and the lower-casing flag from the loaded hub
# module, so the tokenizer is configured from the same assets as bert_layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Tokenizer used by bert_encode to split text and map tokens to ids.
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [4]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated so that ``[CLS]`` and ``[SEP]`` still
    fit, wrapped in those two markers and right-padded with id 0 up to
    ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text) -> list`` and
            ``convert_tokens_to_ids(tokens) -> list``.
        max_len: Length of every encoded row, including ``[CLS]``/``[SEP]``.
            Must be at least 2.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(len(texts), max_len)``. ``masks`` is 1 on real tokens and 0 on
        padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        # With max_len < 2 the truncation slice below would go negative and
        # the rows would come out longer than max_len.
        raise ValueError("max_len must be >= 2 to fit [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # The reshape keeps the (n, max_len) contract even when `texts` is empty.
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

def build_model(bert_layer, max_len=512):
    """Build and compile a two-class classifier on top of a BERT layer.

    Args:
        bert_layer: Keras-callable layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns
            ``(pooled_output, sequence_output)``.
        max_len: Fixed length of each of the three int32 input sequences.

    Returns:
        A compiled ``tf.keras`` model taking the three inputs (in the order
        word ids, mask, segment ids) and producing a 2-way softmax. It is
        compiled with Adam (learning rate 1e-5), categorical cross-entropy
        and an accuracy metric.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # The pooled output is not used; the head reads the per-token output instead.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 of every sequence is the slot where bert_encode puts "[CLS]".
    clf_output = sequence_output[:, 0, :]

    # Small dense head with dropout between the layers.
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(2, activation='softmax')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated
    # and no longer accepted by newer Keras optimizers.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model


In [5]:

# Sequence length used for both encoding and the model inputs; bert_encode
# truncates or pads every text to this many positions.
max_len = 150
# Each *_input is the (token ids, masks, segment ids) tuple returned by bert_encode.
train_input = bert_encode(train.text.values, tokenizer, max_len=max_len)
test_input = bert_encode(test.text.values, tokenizer, max_len=max_len)
# Expand the integer labels into two columns to match the model's 2-unit softmax.
# NOTE(review): astype('int32') assumes train.label has no missing values — confirm.
train_labels =tf.keras.utils.to_categorical(train.label.astype('int32'), num_classes=2)

In [9]:

# Build the compiled classifier around the frozen hub layer and list its layers.
model = build_model(bert_layer, max_len=max_len)
model.summary()

# Mark the last five layers of the model as trainable.
# NOTE(review): these are presumably the Dense/Dropout head layers created in
# build_model, and the flag is set after the model was compiled — confirm this
# loop changes anything.
for layer in model.layers[-5:]:
    layer.trainable = True

# Save to model.h5 only when validation accuracy improves, and stop training
# once validation accuracy has gone 5 epochs without improving.
# NOTE(review): with epochs=7 and patience=5, early stopping can only trigger
# in the last epochs — confirm this is intended.
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

# Train for up to 7 epochs with 20% of the training data held out for validation.
train_history = model.fit(
    train_input, train_labels, 
    validation_split=0.2,
    epochs=7,
    callbacks=[checkpoint, earlystopping],
    batch_size=64,
    verbose=1
)



Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 150)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 150)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 150)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 1024),       335141889   ['input_word_ids[0][0]',         
                                 (None, 150, 1024)]               'input_mask[0][0]',       

In [10]:

# Load the weights from model.h5 (the file the ModelCheckpoint callback writes
# during training) before predicting.
model.load_weights('model.h5')
# One row of two softmax outputs per test text.
test_pred = model.predict(test_input)

# Store each row's pair of class scores as a Python list in a new column.
test['predicted_values'] = test_pred.tolist()



In [11]:
# Print the whole test frame as plain text (pandas wraps wide frames across several blocks).
print(test)

            company_name                                               text   
0   Montessori Preschool  EDOKI ACADEMY only collects the data necessary...  \
1   Montessori Preschool  To request the deletion of your Personal Data ...   
2   Montessori Preschool  A cookie is a string of text information trans...   
3                   LEGO  Request erasure You have the right to request ...   
4                   LEGO  Personal data is any information about a perso...   
5                   LEGO  If we change the way we handle your personal d...   
6            Fox & Sheep  Fox & Sheep is a “data controller” as defined ...   
7            Fox & Sheep  Insofar as your personal data is processed bas...   
8            Fox & Sheep  We do not knowingly collect personal informati...   
9      The Fennec Studio  We will collect and use of personal informatio...   
10     The Fennec Studio  his Privacy Policy governs the manner in which...   
11     The Fennec Studio  We are committed to conduc

In [27]:
# Minimum gap between the two class scores for a prediction to count as
# confident; anything closer is assigned the "undecided" class 2.
CONFIDENCE_MARGIN = .25

# Work on a plain (n, 2) array so the assignment is positional and does not
# depend on the index labels of `test`.
probs = np.array(test["predicted_values"].tolist(), dtype=float).reshape(-1, 2)
confident = np.abs(probs[:, 0] - probs[:, 1]) > CONFIDENCE_MARGIN
favoured = np.where(probs[:, 0] > probs[:, 1], 0, 1)
test["final_prediction"] = np.where(confident, favoured, 2).astype("int64")

# Share of rows whose label equals the final prediction.
# NOTE(review): rows with a NaN label never compare equal, so they can never be
# counted as right but still count in the denominator. Confirm whether NaN
# should be treated as class 2 or whether those rows should be excluded.
matches = test["label"].to_numpy() == test["final_prediction"].to_numpy()
numRight = int(matches.sum())
numTotal = len(test)

print(numRight/numTotal)


0.42857142857142855


In [29]:
# Print the true labels and the thresholded predictions one after the other for a manual comparison.
print(test["label"])
print(test["final_prediction"])

0     0.0
1     1.0
2     NaN
3     1.0
4     0.0
5     NaN
6     NaN
7     1.0
8     0.0
9     0.0
10    NaN
11    NaN
12    0.0
13    NaN
14    1.0
15    0.0
16    1.0
17    NaN
18    0.0
19    NaN
20    1.0
Name: label, dtype: float64
0     0
1     1
2     2
3     1
4     2
5     1
6     0
7     1
8     2
9     2
10    2
11    0
12    0
13    2
14    0
15    0
16    1
17    2
18    0
19    2
20    1
Name: final_prediction, dtype: int64
