In [1]:
from bs4 import BeautifulSoup
import os

In [2]:
flist = [f for f in os.listdir('data/htm/') if '.htm' in f]

In [3]:
flist

['Apple_TOU.htm', 'Termly-Website-Terms-and-Conditions-Template.htm']

In [4]:
with open(os.path.join('data/htm/', flist[0])) as f:
    apple_soup = BeautifulSoup(f)

with open(os.path.join('data/htm/', flist[1])) as f:
    default_soup = BeautifulSoup(f)

In [5]:
def find_and_clean(soup):
    paras = soup.find('div', {'class':'WordSection1'}).find_all('p', {'class':'MsoNormal'})
    out_paras = [p.text.replace('\xa0', ' ').replace('\n', ' ') for p in paras]
    return [p for p in out_paras if p != ' ']

In [6]:
default_paras = find_and_clean(default_soup)
apple_paras = find_and_clean(apple_soup)

In [7]:
## implement similarities
from time import time
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
import datetime

from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Lambda
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint

In [8]:
opp_flist = os.listdir("data/OPP/OPP-115/sanitized_policies")

In [9]:
import string
from nltk import word_tokenize
from nltk.corpus import stopwords



def find_and_clean_OPP(f, fdir = "data/OPP/OPP-115/sanitized_policies"):
    with open(os.path.join(fdir, f)) as j:
        soup = BeautifulSoup(j)
    paras = str(soup).split('<br/>')
    paras = [BeautifulSoup(p).text.replace("|||", '') for p in paras]
    paras = [p for p in paras if p != " "]
    paras = [word_tokenize(p) for p in paras]#[p.split(' ') for p in paras]
    exclude = set(string.punctuation)
    stop = set(stopwords.words('english'))
    ans = []
    translator = str.maketrans('','', string.punctuation)
    for clause in paras:
        s = []
        for word in clause:
            if word != "" and word not in stop:
                w = word.translate(translator).strip()
                if w!= '':
                    s.append(w.lower())
        ans.append(s)
    
            
    return ans

In [10]:
df= []
for i in opp_flist:
    df.extend(find_and_clean_OPP(i))

In [11]:
df2 = [a for a in df if len(a) >0]

In [12]:
from gensim.models import FastText

fast_embed = FastText(size = 300, window = 3, min_count = 1, sentences= df2, iter = 10)

In [13]:
fast_embed.most_similar(['privacy'])

  """Entry point for launching an IPython kernel.


[('viprivacy', 0.9854973554611206),
 ('privacyadmin', 0.9761562943458557),
 ('privacypolicy', 0.9581217169761658),
 ('bankingprivacy', 0.952069103717804),
 ('zipscenecomprivacy', 0.9443008899688721),
 ('privacycoordinator', 0.9410880208015442),
 ('wwwaddthiscomprivacy', 0.934051513671875),
 ('profileprivacy', 0.9302988052368164),
 ('wwwqriocitycomusenlegalprivacy', 0.9260797500610352),
 ('privacystatement', 0.9226903915405273)]

In [14]:
## construct sentence matrix
### (x, y) matrix x = word_embeddings_length, y = number of words; output (x dimension sentence vector)

def construct_mat(sent, model):
    return np.array([model[w] for w in sent])

In [15]:
## word to sentence embeddings 
### (x, y) matrix x = word_embeddings_length, y = number of words; output (x dimension sentence vector)
### According to https://arxiv.org/pdf/1805.09843.pdf; max pooling performs well in sentence comparisons
def SWEM_max(mat):
    return mat.max(axis = 0)
    

In [16]:
## similarity
## Given query vector (1, n) and response matrix (m, n), get most similar x vectors, lookup where they are
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def sim_query(query, response, query_text, response_text, top_n =5):
    total = np.vstack((query, response))
    print(total.shape)
    cos_sim = cosine_similarity(total)[0, :]
    print(cos_sim[:10])
    top_ind = cos_sim.T.argsort()[-(top_n+1):-1][::-1]
    print(top_ind)
    ans = []
    for j in top_ind.tolist():
        ans.append((cos_sim[j], response_text[j-1]))
    
    return (query_text, ans)

In [17]:
def corpus_mat(corpus, model):
    total_mat = np.empty((len(corpus), model.vector_size))
    for ind, sent in enumerate(corpus):
        sent_mat = construct_mat(sent, model)
        try:
            sent_vec = SWEM_max(sent_mat)
        except:
            print(ind)
            print(sent_mat)
        total_mat[ind, :] = sent_vec
    return total_mat

In [18]:
total_mat = corpus_mat(df2, fast_embed)

  """


In [19]:
q = total_mat[:200, :]
a = total_mat[200:, :]

In [20]:
sim_query(q[50], a, df2[50], df2[200:])

(5550, 300)
[1.         0.35501206 0.35765111 0.34865871 0.65293289 0.69575604
 0.37571999 0.74880933 0.88568868 0.90868697]
[1564 5220 1683 5444 2672]


(['in',
  'course',
  'serving',
  'advertisements',
  'site',
  'thirdparty',
  'advertiser',
  'may',
  'place',
  'recognize',
  'unique',
  'cookie',
  'browser'],
 [(0.9740945466071076,
   ['in',
    'course',
    'serving',
    'advertisements',
    'site',
    'thirdparty',
    'advertiser',
    'may',
    'place',
    'recognize',
    'unique',
    'cookie',
    'browser',
    'if',
    'would',
    'like',
    'information',
    'practice',
    'know',
    'choices',
    'information',
    'used',
    'company',
    'please',
    'click']),
  (0.9731735683035212,
   ['pbs',
    'uses',
    'content',
    'cookies',
    'deliver',
    'relevant',
    'local',
    'resources',
    'remember',
    'browser',
    'preferences',
    'improve',
    'visitors',
    'experiences',
    'site',
    'pbs',
    'sell',
    'information',
    'collected',
    'cookies',
    'use',
    'information',
    'commercerelated',
    'purposes',
    'in',
    'addition',
    'pbs',
    'filter',
 