#### Mount Google Drive to import the model configuration

In [1]:
# Mount Google Drive so that files stored there are reachable from Colab
def mountDrive(force_remount=True):
    """Mount Google Drive at /content/drive and return the result of drive.mount.

    force_remount is forwarded unchanged to google.colab's drive.mount.
    """
    from google.colab import drive

    print('drive_filepath="drive/My Drive/"')
    mountResult = drive.mount('/content/drive', force_remount=force_remount)
    return mountResult

mountDrive()

drive_filepath="drive/My Drive/"
Mounted at /content/drive


#### Installing and Importing libraries

In [2]:
%%capture

# Install the required libraries (%%capture hides the installer output).
# selenium and chromium-chromedriver drive the headless browser used for scraping;
# lxml is the parser handed to BeautifulSoup.

!pip install selenium
!pip install lxml
!apt-get update 
!apt install chromium-chromedriver
!pip install nltk
!pip install transformers
!pip install torch
!pip install datasets

In [3]:
%%capture
#installing the required libraries

#following libraries are mainly used for web scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait

#following libraries are mainly used for text preprocessing
import re
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

#following libraries are mainly used for text classification
import transformers
import torch
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk
from transformers import TFAutoModelForSequenceClassification

#following libraries are mainly used for cosine similarity calculation
from nltk.cluster.util import cosine_distance
from nltk import sent_tokenize
import networkx as nx

In [4]:
%%capture

# Install and load ipython-autotime to report the execution time of each cell
!pip install ipython-autotime
%load_ext autotime

time: 645 µs (started: 2022-11-20 04:46:37 +00:00)


#### Scraping the privacy policy

In [None]:
# to scrape the policy using headless browser and beautiful soup

def scrapeUrl(url):
  """Scrape a web page with headless Chrome and return its paragraph/list text.

  Parameters:
    url: address of the page to load.

  Returns:
    A list with the text of every <p> and <li> element, in the order
    BeautifulSoup's find_all returns them.
  """

  #setting the configuration of the headless browser
  options = Options()
  options.add_argument("--window-size=1920,1200")
  options.add_argument('--headless')
  options.add_argument('--no-sandbox')
  options.add_argument('--disable-dev-shm-usage')

  #to avoid loading images; the preference must be set on the same options
  #object that is handed to the driver, otherwise it has no effect
  options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

  #starting the browser; chromedriver is looked up on the PATH
  driver = webdriver.Chrome(options=options)
  try:
    driver.get(url)
    pageSource = driver.page_source
  finally:
    #always shut the browser down so repeated runs do not leak Chrome processes
    driver.quit()

  #parsing the scraped data into soup
  soup = BeautifulSoup(pageSource, 'lxml')

  #collecting the content under all <p> and <li> tags as a list of segments
  return [data.get_text() for data in soup.find_all(['p','li'])]

#### Pre-processing the scraped data

In [None]:
def minPreProcess(inputData):
  """Drop duplicate and very short segments before classification.

  PrivBERT accepts complete sentences, therefore only minimal
  pre-processing is required.

  Parameters:
    inputData: iterable of text segments (strings).

  Returns:
    A list holding the first occurrence of every distinct segment that
    contains more than two spaces after stripping, in the original order.
  """
  #dict.fromkeys keeps the first-seen order and removes duplicates in linear time
  uniqueSegments = dict.fromkeys(inputData)

  return [segment for segment in uniqueSegments if segment.strip().count(" ") > 2]

#### Text Classification

In [None]:
'''To store the output of PrivBERT as a dictionary. 
   The key represents the privacy labels and value is a string of classified segments'''

def storeAsDict(result):
  """Convert the classifier output into a label-name -> joined-text mapping.

  Parameters:
    result: dict mapping a label id to a list of classified segments.

  Returns:
    dict mapping the privacy label name to the segments joined with '.'.
    Every joined string is also printed.
  """
  #label names, keyed by their id (0 to 11)
  labelsById = dict(enumerate([
      'Data Retention', 'Data Security', 'Do Not Track', 'First Party Collection/Use',
      'International and Specific Audiences', 'Introductory/Generic', 'Policy Change',
      'Practice not covered', 'Privacy contact information', 'Third Party Sharing/Collection',
      'User Access, Edit and Deletion', 'User Choice/Control']))

  labelled = {}
  for labelId, segments in result.items():
    text = '.'.join(str(segment) for segment in segments)
    print(text)
    labelled[labelsById[labelId]] = text

  return labelled

In [None]:
#A dummy customization unit with four user types and each containing 5 ids of privacy aspects

def customizer(select):
  """Return the five privacy-aspect ids configured for the given user type.

  Raises KeyError when `select` is not one of the four known user types.
  """
  userOptions = {
      "old male": [6,3,9,11,1],
      "young male": [7,1,4,5,10],
      "young female": [7,6,1,4,8],
      "old female": [9,3,2,11,1],
  }
  return userOptions[select]

In [None]:
#Main function to perform the classification of the privacy policy

def classificationModel(datatoClassify, userType,
                        modelDir='/content/drive/MyDrive/Colab Notebooks/models/PolicyInterpreterFullSample'):
  """Classify policy segments with the saved PrivBERT model.

  Parameters:
    datatoClassify: list of text segments to classify.
    userType: key understood by customizer(); selects which label ids are kept.
    modelDir: directory holding the saved model configuration (loaded with
      local_files_only=True). Defaults to the original Google Drive location.

  Returns:
    dict mapping a label id (int) to the list of segments assigned to that
    label, restricted to the label ids returned by customizer(userType).
  """
  #no of labels is equal to the number of privacy aspects for which the model was trained
  numLabels = 12

  #tokenizer of the PrivBERT checkpoint
  checkpoint = "mukund/privbert"
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  # For cleaner label outputs
  id2Label = {0: 'Data Retention', 1: 'Data Security', 2: 'Do Not Track', 3: 'First Party Collection/Use',
              4: 'International and Specific Audiences', 5: 'Introductory/Generic', 6: 'Policy Change',
              7: 'Practice not covered', 8:'Privacy contact information', 9: 'Third Party Sharing/Collection',
              10: 'User Access, Edit and Deletion', 11: 'User Choice/Control'}

  #saving the index of each label
  label2id = {val: key for key, val in id2Label.items()}

  #the sequence classification head of the transformers library is used to
  #import the PrivBERT model together with the label mappings
  model = TFAutoModelForSequenceClassification.from_pretrained(
      modelDir, num_labels=numLabels, id2label=id2Label, label2id=label2id, local_files_only=True)

  #to provide customization of the privacy policy
  user = customizer(userType)

  #to save the output
  result = dict()

  for segment in datatoClassify:

    tokenized = tokenizer(str(segment), return_tensors="np", padding="longest", truncation= True)

    #one segment is classified per call, so the logits have a single row;
    #the index of its largest value is the predicted label id.
    #Taking element [0] avoids converting a whole array to int.
    logits = model(tokenized).logits
    labelId = int(np.argmax(logits, axis=1)[0])

    # only the aspects corresponding to the user's privacy aspects are saved
    if labelId in user:
      result.setdefault(labelId, []).append(segment)

  return result

#### Segment Summarization

In [None]:
# To split the input into sentences and then tokenize into separate words

def segmentToTokens(segments):
  """Split a text into sentences and each sentence into word tokens.

  Returns a list with one list of tokens per sentence.
  """
  tokenizedSentences = []
  for sentence in sent_tokenize(segments):
    tokenizedSentences.append(word_tokenize(sentence))
  return tokenizedSentences

In [None]:
# to calculate the frequency of each word in a pair of sentences, vectorise both and then derive their similarity from the cosine distance between the two vectors.
def sentenceSimilarity(s1, s2, stopWords=None):
  """Cosine similarity between two tokenized sentences.

  Parameters:
    s1, s2: lists of word tokens; compared case-insensitively.
    stopWords: optional collection of words that are not counted.

  Returns:
    1 - cosine_distance of the two word-count vectors, or 0.0 when either
    sentence has no countable word (its vector would be all zeros and the
    cosine would be undefined).
  """
  #to avoid counting the stop words; a set makes the membership test cheap
  ignored = set(stopWords) if stopWords is not None else set()
  s1 = [wd.lower() for wd in s1]
  s2 = [wd.lower() for wd in s2]

  #position of every distinct word in the count vectors
  position = {word: idx for idx, word in enumerate(set(s1 + s2))}
  vector1 = [0] * len(position)
  vector2 = [0] * len(position)

  #to build the vector for the first sentence
  for word in s1:
    if word not in ignored:
      vector1[position[word]] += 1

  #to build the vector for the second sentence
  for word in s2:
    if word not in ignored:
      vector2[position[word]] += 1

  #a zero vector has no direction: report "no similarity" instead of NaN
  if not any(vector1) or not any(vector2):
    return 0.0

  return 1 - cosine_distance(vector1, vector2)

In [None]:
#to generate a matrix containing similarity scores

def similarityMatrix(sentences, stopWords):
  """Build the square matrix of pairwise sentence similarities.

  Entry [i][j] holds sentenceSimilarity(sentences[i], sentences[j], stopWords);
  the diagonal is left at zero.
  """
  count = len(sentences)
  matrix = np.zeros((count, count))

  for row, first in enumerate(sentences):
    for col, second in enumerate(sentences):
      if row == col:
        continue
      matrix[row][col] = sentenceSimilarity(first, second, stopWords)

  return matrix

In [None]:
#main summarization function

def summarizer(sentences, length):
  """Build an extractive summary from the highest-ranked sentences.

  Parameters:
    sentences: list of tokenized sentences (each a list of words).
    length: maximum number of top-ranked sentences to consider.

  Returns:
    The selected sentences, each re-joined with spaces, concatenated into
    one string in descending PageRank-score order. A sentence identical to
    the one ranked directly below it is skipped. Empty input yields "".
  """
  #nothing to rank
  if not sentences:
    return ""

  #defines the list of stop words to be used
  stopWords = stopwords.words('english')

  similarityMtrx = similarityMatrix(sentences, stopWords)

  #turning the similarity matrix into a graph and scoring its nodes
  similarityGraph = nx.from_numpy_array(similarityMtrx)
  scores = nx.pagerank(similarityGraph)

  # to arrange the scores in descending order
  ranks = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

  #uncomment to view the scores of the ranked sentences
  #print("Indexes of top rank order are ", ranks)

  #a list to save the summarized sentences
  summarizedText = []

  #never read past the end of the ranking when `length` is not smaller
  #than the number of sentences
  for i in range(min(length, len(ranks))):
    isLast = i + 1 == len(ranks)
    if isLast or ranks[i][1] != ranks[i + 1][1]:
      summarizedText.append(" ".join(ranks[i][1]))

  #to combine the list of sentences into a string
  return " ".join(summarizedText)

In [None]:
#to display the final output
def displayOutput(opt, user):
  """Print the text of each label, then list requested labels without data.

  Parameters:
    opt: dict mapping a privacy label name to its (summarized) text.
    user: user type understood by customizer().
  """
  id2Label = {
      0: 'Data Retention',
      1: 'Data Security',
      2: 'Do Not Track',
      3: 'First Party Collection/Use',
      4: 'International and Specific Audiences',
      5: 'Introductory/Generic',
      6: 'Policy Change',
      7: 'Practice not covered',
      8: 'Privacy contact information',
      9: 'Third Party Sharing/Collection',
      10: 'User Access, Edit and Deletion',
      11: 'User Choice/Control',
  }

  print(user.upper() + "\n")

  wantedIds = customizer(user)

  #one sentence per line under each label heading
  for label, text in opt.items():
    print(label)
    normalized = ' '.join(text.split())
    for sentence in sent_tokenize(normalized):
      print(sentence)
    print("\n")

  #labels the user asked for that have no text in opt
  for labelId in wantedIds:
    name = id2Label[labelId]
    if name not in opt:
      print(name + "\nNo data present.\n")



In [None]:
#to count the number of words in raw scraped data of privacy policy and the final summary of the privacy policy
def countWords(stage, content):
  """Count whitespace-separated words.

  Parameters:
    stage: "ini" when `content` is a list of raw text segments; any other
      value when `content` is a dict whose values are text.
    content: the segments (list of str) or the summaries (dict of str).

  Returns:
    Total number of words over all segments or dict values.
  """
  if stage == "ini":
    texts = content
  else:
    #count the words of the dict that was passed in, not of the global `opt`
    texts = content.values()

  return sum(len(text.split()) for text in texts)

#### Main

In [None]:
##Step - 1

# Choose a user type; customizer() knows "young male", "old male", "young female" and "old female"
user = "young female"

# URL of the privacy policy to process. A few example URLs are commented out below.

#url = "https://huggingface.co/privacy"
#url = "https://stackoverflow.com/legal/privacy-policy"
#url = "https://policies.google.com/privacy?"
#url = "https://www.linkedin.com/legal/privacy-policy"
#url = "https://meta.wikimedia.org/wiki/Privacy_policy"
#url = "https://www.apple.com/legal/privacy/en-ww/"
#url = "https://www.samsung.com/au/info/privacy/"
#url = "https://legal.yahoo.com/au/en/yahoo/privacy/products/searchservices/index.html"
url = "https://docs.github.com/en/site-policy/privacy-policies/github-privacy-statement"

In [None]:
##Step - 2

# Scrape the policy page; rawData is the list of <p>/<li> texts returned by scrapeUrl

rawData = scrapeUrl(url)

In [None]:
##Step - 3

# Remove duplicate and very short segments from the scraped data
dataForClassification = minPreProcess(rawData)

In [None]:
##Step - 4

# Classify the policy segments; the result is a dict mapping label ids to lists of segments
dataAfterClassification = classificationModel(dataForClassification, user)

In [None]:
##Step - 5

# Join the segments of each label into one string, keyed by the label name

joinedData = storeAsDict(dataAfterClassification)

In [None]:
# Label names for which storeAsDict produced text
joinedData.keys()

In [None]:
# Inspect one label; .get avoids a KeyError when no segment was classified as 'Data Security'
joinedData.get('Data Security', 'No data present.')

In [None]:
##Step - 6

# Summarize every label whose text has more than five sentences;
# shorter texts are kept unchanged

opt = {}
for key in joinedData:
  seg = segmentToTokens(joinedData[key])
  opt[key] = summarizer(seg,5) if len(seg) > 5 else joinedData[key]


In [None]:
##Step - 7

# Print the text of each label for the selected user type

displayOutput(opt, user)

In [None]:
##Step - 8

# to display the number of words and the compression ratio

initialCount = countWords("ini", rawData)
finalCount = countWords("", opt)
print("Initial count = " + str(initialCount))
print("\nFinal count = " + str(finalCount))
if initialCount:
  print("\nCompression ratio = {cr:0.4f}".format(cr=finalCount/initialCount))
else:
  #nothing was scraped, so the ratio is undefined
  print("\nCompression ratio = n/a (no words were scraped)")