In [1]:
import tensorflow as tf
import numpy as np
import tensorflow_hub as hub
import bert.tokenization as tokenization
import pandas as pd
import re
import spacy as spicy
import xlsxwriter



In [2]:
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1" #loads in model from tensorhub and creates a base bert layer from that module
bert_layer = hub.KerasLayer(module_url, trainable=False)
train=pd.read_csv(r"C:\Users\peter\Documents\GitHub\Privacy-Law-Technology-Project\train.csv") #loads in train and test data for BERT model
test=pd.read_csv(r"C:\Users\peter\Documents\GitHub\Privacy-Law-Technology-Project\test.csv")

nlp = spicy.load("en_core_web_sm") 

In [3]:
excel_data = pd.read_excel(r'C:\Users\peter\Documents\GitHub\Privacy-Law-Technology-Project\filteredDataWithPrivacyPolicies.xlsx', index_col=0, na_values=['NA'])
# print(excel_data.columns.ravel())\n
devs = excel_data["developer"].tolist()
policies = excel_data["Raw Privacy Policy"].tolist()
# print(nlp(policies[2]))\n",
flag = 0
chunkList = []
combinedList = []
for policy in policies:
    doc = nlp(policy)
    docList = list(doc.sents)
    count = 0
    #Takes in list of entire text and divides and stores chunks of 3 sentences. If we reach the end\n",
    while (count < len(docList)):
        if (count+2 < len(docList)):
            chunkList.append([devs[flag], str(docList[count]) + " " +  str(docList[count+1]) + " " + str(docList[count+2])])
            count = count + 3
        elif(count+1 < len(docList)):
            chunkList.append([devs[flag],str(docList[count]) + " " + str(docList[count+1])])
            count = count + 2
        elif(count < len(docList)):
            chunkList.append([devs[flag],str(docList[count])])
            count = count + 1
    flag= flag + 1
    # for elements in chunkList:
    #     print(elements)
    #     print("END OF CHUNK \n")

In [28]:
# df = pd.DataFrame(chunkList)
# writer = pd.ExcelWriter(r'C:\Users\peter\Documents\GitHub\Privacy-Law-Technology-Projecttest.xlsx', engine='xlsxwriter')
# df.to_excel(writer, sheet_name='welcome', index=False)
# writer.close()
chunkList=pd.read_csv(r"C:\Users\peter\Documents\GitHub\Privacy-Law-Technology-Project\chunkedText.csv") #writes segmented text to an excel, which was natively converted to a csv and read back in
dropList = [] #last minute pre processing to make text have less HTML in it
for i in range(len(chunkList)):
    if(len(chunkList.iloc[i][1]) < 40):
        dropList.append(i)
    chunkList.at[i, "text"] = re.sub('\.[^ ]*', '', chunkList.iloc[i][1])
    chunkList.at[i,"text"] = re.sub('<[^<]+?>', '', chunkList.iloc[i][1])
    chunkList.at[i,"text"] = re.sub('_[^_]+?_', '', chunkList.iloc[i][1])
    chunkList.at[i, "text"] = re.sub('{[^{]+?}', '', chunkList.iloc[i][1])
    chunkList.at[i, "text"] = re.sub('-[^-]+?-', '', chunkList.iloc[i][1])
    chunkList.at[i, "text"] = re.sub('\(([^\((]+?\))', '', chunkList.iloc[i][1])
   
    
    
chunkList.drop(dropList, axis=0, inplace=True) #clears out all html esque tags from text, as well as drops tags that state things such as "couldn't access, couldn't open link", it also removes text between dashes, and after periods

In [29]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy() #creates the base vocab for our MODEL, a function to lowercase all content for the model's sake, and creates a tokenizer to split up sentences
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [30]:
def bert_encode(texts, tokenizer, max_len=512): #function that takes in a text segment, adds in necessary BERT tags to each sentence i.e for beginning of and end of sentences, creates tokens within each sentence
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

def build_model(bert_layer, max_len=512): #builds model based on inputs from pre built specifications built into BERT model
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids]) #creates various layers of model to improve accuracy and evaluate sentences repetedly
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(2, activation='softmax')(net)
    
    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model


In [31]:

max_len = 200 #max number of characters to consider, creates the labels for the model (0,1) for meeting reqs of different sections of COPPA, encodes train and test input to BERT style
train_input = bert_encode(train.text.values, tokenizer, max_len=max_len)
test_input = bert_encode(chunkList.text.values, tokenizer, max_len=max_len)
train_labels =tf.keras.utils.to_categorical(train.label.astype('int32'), num_classes=2)

In [32]:

model = build_model(bert_layer, max_len=max_len)
model.summary() #creates model, allows for further retrainability within each layer of the model, as well as defines how many attempts will be made to create model and based on what criteria should it retrain 
#in our case, our model is being trained to fit validation set. 

for layer in model.layers[-5:]:
    layer.trainable = True

checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

train_history = model.fit(
    train_input, train_labels, 
    validation_split=0.2,
    epochs=7,
    callbacks=[checkpoint, earlystopping],
    batch_size=64,
    verbose=1
)



Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 200)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 200)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 200)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 1024),       335141889   ['input_word_ids[0][0]',         
                                 (None, 200, 1024)]               'input_mask[0][0]',       

In [33]:

model.load_weights('model.h5') #runs model predictions over each part of text, returning two percentages
test_pred = model.predict(test_input)
chunkList['predicted_values'] = test_pred.tolist()



In [42]:
count = 0
for k in chunkList.predicted_values: #if model is at least 75 percent condifent in the class of text, quantifies it as compliant with one section of COPPA, other wise, if it isn't sure, it is classified as class 2
    if(abs(k[0] - k[1]) > .6):
        if(k[0] > k[1]):
            
            chunkList.at[count,"final_prediction"] = 0
        else:
            chunkList.at[count,"final_prediction"] = 1
    else:
        chunkList.at[count,"final_prediction"] = 2
    

    count = count + 1

numRight = 0
numTotal = len(chunkList)
# for i in range(len(test)):
#     if(test.iloc[i]["label"] == test.iloc[i]["final_prediction"]):
#         numRight = numRight + 1
#     elif(test.iloc[i]["label"] != 2 and test.iloc[i]["final_prediction"] ==2):
#         numRight = numRight + 1

# print(numRight/numTotal)


k is [0.8061490654945374, 0.19385091960430145]
k is [0.504787802696228, 0.4952121376991272]
k is [0.3813522458076477, 0.6186477541923523]
k is [0.2793233394622803, 0.720676600933075]
k is [0.3014300465583801, 0.6985700130462646]
k is [0.20474742352962494, 0.7952525615692139]
k is [0.20673029124736786, 0.793269693851471]
k is [0.2464180290699005, 0.7535820007324219]
k is [0.22721685469150543, 0.7727832198143005]
k is [0.3931705057621002, 0.6068294644355774]
k is [0.16723142564296722, 0.8327686190605164]
k is [0.27305668592453003, 0.72694331407547]
k is [0.1731913685798645, 0.8268085718154907]
k is [0.1907060444355011, 0.8092938661575317]
k is [0.1907060444355011, 0.8092938661575317]
k is [0.1731913685798645, 0.8268085718154907]
k is [0.1731913685798645, 0.8268085718154907]
k is [0.1907060444355011, 0.8092938661575317]
k is [0.1907060444355011, 0.8092938661575317]
k is [0.1731913685798645, 0.8268085718154907]
k is [0.1731913685798645, 0.8268085718154907]
k is [0.1907060444355011, 0.80929

TypeError: 'float' object is not subscriptable

In [38]:

#creates 4 regexes to test for 4 different compliances of section one of COPPA that we are analyzing, scanning for names, addresses, emails, and phone numbers


nameRegex = re.compile(r"[A-Z][a-z]+,?\s+(?:[A-Z][a-z]*\.?\s*)?[A-Z][a-z]+")


phoneNumberRegex = re.compile(r'\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}')

emailRegex = re.compile(r'\S+@\S+')

addressRegex = re.compile(r'\b\d{1,5} [a-zA-Z0-9 \-.,#&]*[a-zA-Z0-9]\b')

for j in range(len(chunkList)):
    if (len(nameRegex.findall(str(chunkList.iloc[j][1]))) != 0):
            chunkList.at[j,"has_name"] = 1
    else:
          chunkList.at[j,"has_name"] = 0
    if (len(phoneNumberRegex.findall(str(chunkList.iloc[j][1]))) != 0):
            chunkList.at[j,"has_phone"] = 1
    else:
          chunkList.at[j,"has_phone"] = 0
    if (len(emailRegex.findall(str(chunkList.iloc[j][1]))) != 0):
            chunkList.at[j,"has_email"] = 1
    else:
          chunkList.at[j,"has_email"] = 0
    if (len(addressRegex.findall(str(chunkList.iloc[j][1]))) != 0):
            chunkList.at[j,"has_address"] = 1
    else:
          chunkList.at[j,"has_address"] = 0


In [39]:
#for each company, determines whether it has at least occurence of text meeting that specification either meeting the classification of having text compliant with sections 2 or 3 of what we are looking at, 
#or the presence of at least one match from our regexes

section3compliance = chunkList[chunkList['final_prediction'] == 1].groupby('company').size()
numSect3Compliant = len(section3compliance)
sect3companies = section3compliance.index.tolist()

section2compliance = chunkList[chunkList['final_prediction'] == 0].groupby('company').size()
numSect2compliance = len(section2compliance)
sect2companies = section2compliance.index.tolist()

sect1namecompliance = chunkList[chunkList['has_name'] == 1].groupby('company').size()
numNamecompliance = len(sect1namecompliance)
sect1namecompanies = sect1namecompliance.index.tolist()

sect1emailcompliance = chunkList[chunkList['has_email'] == 1].groupby('company').size()
numEmailcompliance = len(sect1emailcompliance)
sect1emailcompanies = sect1emailcompliance.index.tolist()

sect1phonecompliance = chunkList[chunkList['has_phone'] == 1].groupby('company').size()
numPhonecompliance = len(sect1phonecompliance)
sect1phonecompanies = sect1phonecompliance.index.tolist()

sect1addresscompliance = chunkList[chunkList['has_address'] == 1].groupby('company').size()
numAddresscompliance = len(sect1addresscompliance)
sect1addresscompanies = sect1addresscompliance.index.tolist()


common_elements = list(
    set(sect3companies).intersection(sect2companies, sect1namecompanies,sect1emailcompanies, sect1phonecompanies, sect1addresscompanies)
)
print(common_elements)
print("Number of fully compliant companies:", len(common_elements))

print( "Number of Unique Companies: ", chunkList['company'].nunique())
print( "Number of Companies with Data Collector Names: ", numNamecompliance)
print( "Number of companies with Data Collector Email:", numEmailcompliance )
print("Number of Companies with Data Collector Phone Number: ", numPhonecompliance)
print("Number of Companies with Data Collector Address: ", numAddresscompliance)
print("Number of Companies who are compliant with section 2: ", numSect2compliance)
print("Number of Companies who are compliant with section 3: ", numSect3Compliant)



[]
Number of fully compliant companies: 0
Number of Unique Companies:  232
Number of Companies with Data Collector Names:  204
Number of companies with Data Collector Email: 137
Number of Companies with Data Collector Phone Number:  124
Number of Companies with Data Collector Address:  187
Number of Companies who are compliant with section 2:  0
Number of Companies who are compliant with section 3:  0


In [40]:

#testing 
import re

string = "A cookie is a Sarah Jessica Sebastian Parker string of text information Sebastian Rivera transferred from a website to your computer's hard drive so that Ben Lepsch the website can remember you. Cookies can help a website adapt content more quickly to your interests. Most importantly websites use cookies. In general, a cookie contains the name of the domain of origin of the cookie; the of the cookie and a value, namely a unique number created randomly."
regex = re.compile(r"[A-Z][a-z]+,?\s+(?:[A-Z][a-z]*\.?\s*)?[A-Z][a-z]+")
print(regex.findall(string))

phoneNumberRegex = re.compile(r'\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}')
string2 = "call me at Telephone +1 (630)-200-9399"
print(phoneNumberRegex.findall(string2))

emailRegex = re.compile(r'\S+@\S+')
string3 = "If If you have any questions or suggestions about our Privacy Policy, do not hesitate to contact us at kyle@7thbe.at contact us at the following email address.</span><br/> <a href=mailto:app_support@rvappstudios.com style= color:#38B9ECgame-polic >app_support@rvappstudios.com</a>_x000D_"
print(emailRegex.findall(string3))

addressRegex = re.compile(r'\b\d{1,5} [a-zA-Z0-9 \-.,#&]*[a-zA-Z0-9]\b')
string4 = "my address is Aastvej 1, 7190 Billund, Denmark  18 rue Barbès 92120 Montrouge (France) 707 North Dubuque St"
print(addressRegex.findall(string4))

['Sarah Jessica Sebastian', 'Sebastian Rivera', 'Ben Lepsch']
['+1 (630)-200-9399']
['kyle@7thbe.at', 'href=mailto:app_support@rvappstudios.com', '>app_support@rvappstudios.com</a>_x000D_']
['7190 Billund, Denmark  18 rue', '92120 Montrouge', '707 North Dubuque St']


In [41]:
print(chunkList[chunkList['final_prediction'] == 1].iloc[4][1])

IndexError: single positional indexer is out-of-bounds