In [9]:
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

## Ordered Sentence Embedding Methods

- Given that non-ordered approaches do not perform well, order sentence embeddings were employed 

#### Method 1: Univerasl sentence encoder 
- available through tensorhub https://tfhub.dev/google/universal-sentence-encoder/1
- Paper: Daniel Cer, Yinfei Yang, Sheng-yi Kong, Nan Hua, Nicole Limtiaco, Rhomni St. John, Noah Constant, Mario Guajardo-Cespedes, Steve Yuan, Chris Tar, Yun-Hsuan Sung, Brian Strope, Ray Kurzweil (2018) https://arxiv.org/abs/1803.11175 

#### Method 2: BERT sentence embedding (SBERT) 
- Available through setnence-transformers package https://www.sbert.net/docs/training/overview.html
- Paper: Nils Reimers, Iryna Gurevych (2019) https://arxiv.org/abs/1908.10084

In [52]:
import os
os.environ["TFHUB_CACHE_DIR"] = r'J:\Users\JJ\tf_hub'

import tensorflow as tf
import tensorflow_hub as hub
import hashlib

handle = "https://tfhub.dev/google/universal-sentence-encoder/4"
hashlib.sha1(handle.encode("utf8")).hexdigest()


embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
def get_sentence_embeddings(paragraph_array):
    embeddings=embed(paragraph_array)
    return embeddings

INFO:absl:Using J:\Users\JJ\tf_hub to cache modules.


In [50]:
def find_and_clean(soup):
    paras = soup.find('div', {'class':'WordSection1'}).find_all('p', {'class':'MsoNormal'})
    out_paras = [p.text.replace('\xa0', ' ').replace('\n', ' ') for p in paras]
    return [p for p in out_paras if p != ' ']


import string
from nltk import word_tokenize
from nltk.corpus import stopwords



def find_and_clean_OPP(f, fdir = "data/OPP/OPP-115/sanitized_policies"):
    with open(os.path.join(fdir, f)) as j:
        soup = BeautifulSoup(j)
    paras = str(soup).split('<br/>')
    paras = [BeautifulSoup(p).text.replace("|||", '') for p in paras]
    paras = [p for p in paras if p != " "]
    paras = [word_tokenize(p) for p in paras]#[p.split(' ') for p in paras]
    exclude = set(string.punctuation)
    stop = set(stopwords.words('english'))
    ans = []
    translator = str.maketrans('','', string.punctuation)
    for clause in paras:
        s = []
        for word in clause:
            if word != "" and word not in stop:
                w = word.translate(translator).strip()
                if w!= '':
                    s.append(w.lower())
        ans.append(s)
               
    return ans


## construct sentence matrix
### (x, y) matrix x = word_embeddings_length, y = number of words; output (x dimension sentence vector)

def construct_mat(sent, model):
    return np.array([model[w] for w in sent])


## word to sentence embeddings 
### (x, y) matrix x = word_embeddings_length, y = number of words; output (x dimension sentence vector)
### According to https://arxiv.org/pdf/1805.09843.pdf; max pooling performs well in sentence comparisons
def SWEM_max(mat):
    return mat.max(axis = 0)


def corpus_mat(corpus, model):
    total_mat = np.empty((len(corpus), model.vector_size))
    for ind, sent in enumerate(corpus):
        sent_mat = construct_mat(sent, model)
        try:
            sent_vec = SWEM_max(sent_mat)
        except:
            print(ind)
            print(sent_mat)
        total_mat[ind, :] = sent_vec
    return total_mat


## similarity
## Given query vector (1, n) and response matrix (m, n), get most similar x vectors, lookup where they are
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def sim_query(query, response, query_text, response_text, top_n =5):
    total = np.vstack((query, response))
    #print(total.shape)
    cos_sim = cosine_similarity(total)[0, :]
    #print(cos_sim[:10])
    top_ind = cos_sim.T.argsort()[-(top_n+1):-1][::-1]
    #print(top_ind)
    ans = []
    for j in top_ind.tolist():
        ans.append((cos_sim[j], response_text[j-1]))
    return (query_text, ans)

def print_simquery(simquery):
    print("#"*50)
    print("source: ")
    print(' '.join(simquery[0]))
    print("Top 5  matches: ")
    for j in simquery[1]:
        print('score: ', j[0])
        print(' '.join(j[1]))        
    print("#"*50)

In [12]:
from bs4 import BeautifulSoup
flist = [f for f in os.listdir('data/htm/') if '.htm' in f]
with open(os.path.join('data/htm/', flist[0])) as f:
    apple_soup = BeautifulSoup(f)

with open(os.path.join('data/htm/', flist[1])) as f:
    default_soup = BeautifulSoup(f)
    
    default_paras = find_and_clean(default_soup)
apple_paras = find_and_clean(apple_soup)

opp_flist = os.listdir("data/OPP/OPP-115/sanitized_policies")

df= []
for i in opp_flist:
    df.extend(find_and_clean_OPP(i))
    
df2 = [a for a in df if len(a) >0]

In [13]:
embed_imput = [' '.join(a) for a in df2]

In [53]:
allembeddings = get_sentence_embeddings(embed_imput)

In [54]:
q = allembeddings[:200 , :]
a = allembeddings[200:, :]

In [56]:
sq = sim_query(q[50], a, df2[50], df2[200:])

In [57]:
print_simquery(sq)

##################################################
source: 
in course serving advertisements site thirdparty advertiser may place recognize unique cookie browser
Top 5  matches: 
score:  0.7299808
in course serving advertisements site thirdparty advertiser may place recognize unique cookie browser if would like information practice know choices information used company please click
score:  0.7041924
these companies may place recognize cookies web beacons technology track certain nonpersonal information website users for example course serving certain advertisements advertiser may place recognize unique cookie browser order collect certain information use nyt services for another example advertiser ad server may also able collect device s unique identifier course serving ad in many cases information could used show ads websites based interests
score:  0.6373111
some advertisers tulsaworldcom may use cookies when click advertisement visit advertiser s web site make purchase advertiser co

## Compare with BERT

In [15]:
## bert
import os
os.environ["TFHUB_CACHE_DIR"] = r'J:\Users\JJ\tf_hub'

import tensorflow as tf
import tensorflow_hub as hub
import hashlib

BERThandle = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"
hashlib.sha1(BERThandle.encode("utf8")).hexdigest()


BERTembed = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2")

def get_BERT_embeddings(paragraph_array, array_len = 512):
    embeddings=embed(paragraph_array)
    return embeddings

In [5]:
## Sentence-transformers: A BERT-based implementation of sentence embeddings
from sentence_transformers import SentenceTransformer, util

In [3]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

100%|███████████████████████████████████████████████████████████████████████████████| 245M/245M [00:42<00:00, 5.73MB/s]


In [14]:
embed_imput[:5]

['privacy policy',
 'scinewscom committed protecting respecting privacy to better inform policy concerning user privacy adopted following terms please note terms subject change changes included page',
 'information scinewscom may collect online',
 'scinewscom may collect process following data',
 'information provide filling forms site including names email website addresses may also ask information purposes example report problem site']

In [20]:
#Encode all sentences
embeddings = model.encode(embed_imput)

#Compute cosine similarity between all pairs
cos_sim = util.pytorch_cos_sim(embeddings[:200], embeddings[200:])

#Add all pairs to a list with their cosine similarity score
all_sentence_combinations = []
for i in range(len(cos_sim)-1):
    for j in range(i+1, len(cos_sim)):
        all_sentence_combinations.append([cos_sim[i][j], i, j])



In [32]:
np.array(cos_sim[0].argsort()[-5:])[::-1]

array([1576, 1750, 4863, 1081, 1818], dtype=int64)

In [63]:
np.array(cos_sim)[0][top_5]

array([1.0000001, 1.0000001, 1.0000001, 1.0000001, 1.0000001],
      dtype=float32)

In [64]:
## top 5 from each
top_5_for_each = []
for i in range(len(cos_sim)):
    top_5 = np.array(cos_sim[i].argsort()[-5:])[::-1]
    top_5_for_each.append((np.array(cos_sim)[i][top_5], top_5))

In [65]:
top_5_for_each[2]

(array([0.89077115, 0.8849205 , 0.88405174, 0.86957514, 0.85734034],
       dtype=float32),
 array([3142,  694, 5537, 3844,  675], dtype=int64))

In [66]:
for k in range(len(top_5_for_each)):
    print("#"*50)
    print("source: ")
    print(embed_imput[k])
    print("Top 5  matches: ")
    for j in range(len(top_5_for_each[k][0])):
        print('Score: ', top_5_for_each[k][0][j])
        print(embed_imput[200+top_5_for_each[k][1][j]])
        
    print("#"*50)

##################################################
source: 
privacy policy
Top 5  matches: 
Score:  1.0000001
privacy policy
Score:  1.0000001
privacy policy
Score:  1.0000001
privacy policy
Score:  1.0000001
privacy policy
Score:  1.0000001
privacy policy
##################################################
##################################################
source: 
scinewscom committed protecting respecting privacy to better inform policy concerning user privacy adopted following terms please note terms subject change changes included page
Top 5  matches: 
Score:  0.89361596
we occasionally update privacy policy necessary protect users comply changing environment we recommend check posted privacy policy visit sites sure aware understand current policy your continued use respective site means accept updated privacy policy
Score:  0.8892149
if modify privacy policy handle personal information post revised information revised effective date top privacy policy if material changes privacy p

##################################################
source: 
it necessary respond lawful governmental requests legal process example court order search warrant subpoena the information relevant crime committed an emergency exists poses threat safety safety another person it necessary protect rights property aol other parties connection certain business transactions in event ownership aol inc affiliate assets changes result merger acquisition sale assets unlikely event bankruptcy information may transferred another company if believe transfer results material change use information ve collected received given opportunity opt transfer
Top 5  matches: 
Score:  0.8931799
we may required share personal nonpersonal information pursuant judicial governmental subpoenas warrants orders if required course obey law in addition notwithstanding term contrary privacy policy reserve right use disclose share personal nonpersonal information order investigate prevent take action regarding illegal activi

In [67]:
for k in range(50,51):
    print("#"*50)
    print("source: ")
    print(embed_imput[k])
    print("Top 5  matches: ")
    for j in range(len(top_5_for_each[k][0])):
        print('Score: ', top_5_for_each[k][0][j])
        print(embed_imput[200+top_5_for_each[k][1][j]])
        
    print("#"*50)

##################################################
source: 
in course serving advertisements site thirdparty advertiser may place recognize unique cookie browser
Top 5  matches: 
Score:  0.9763496
in course serving advertisements site thirdparty advertiser may place recognize unique cookie browser if would like information practice know choices information used company please click
Score:  0.9046395
we may work thirdparty advertising companies may use information visit site order provide advertisements goods services may interest site thirdparty web sites from site companies may place recognize unique cookie computer use technologies web beacons pixel tags our privacy policy cover use information thirdparty advertisement serving company may collect for information ad cookies companies provide users ability optout collection information cookies please click
Score:  0.89745224
you also aware web advertising partners may place recognize unique cookie browser cookies enable advertisers det