In [1]:
import os
import pandas as pd
import numpy as np
from glob import glob
from ast import literal_eval

In [2]:
path = r'C:\Users\IsmailKaraman\workspace\data\privacy_policy_data\OPP-115_v1'

col_names = ['Data Security', 'Do Not Track', 'First Party Collection/Use',
           'International and Specific Audiences', 'Policy Change',
           'Third Party Sharing/Collection', 'User Choice/Control', 
             'User Access, Edit and Deletion','Data Retention', 'Other']

'''
not used: 

First Party Collection/Use
Privacy practice describing data collection or data use by the company/organization owning the website or mobile app.

Third Party Sharing/Collection
Privacy practice describing data sharing with third parties or data collection by third parties. A third party is a company/organization other than the first party  company/organization that owns the website or mobile app.

User Choice/Control
Practice that describes general choices and control options available to users.

User Access, Edit and Deletion
Privacy practice that allows users to access, edit or delete the data that the company/organization has about them.

Data Retention
Privacy practice specifying the retention period for collected user information.

Data Security
Practice that describes how users’ information is secured and protected, e.g., from confidentiality, integrity, or availability breaches. Common practices include the encryption of stored data and online communications.

Policy Change
The company/organization’s practices concerning if and how users will be informed of changes to its privacy policy, including any choices offered to users.

Do Not Track
Practices that explain if and how Do Not Track signals (DNT) for online tracking and advertising are honored.

International and Specific Audiences
Specific audiences mentioned in the company/organization’s privacy policy, such as children or international users, for which the company/organization may provide special provisions.

Other
Another aspect not covered in the other categories is discussed in the text segment.

'''

In [3]:
def load(path):
    if not os.path.exists('./opp115.csv'):
        generate_dataset(path).to_csv('./opp115.csv', sep=',', index=False)

    return pd.read_csv('./opp115.csv', sep=',', header=0)

In [4]:
def load_policies(path):
    policies = []

    for f in glob(path+'/sanitized_policies/*.html'):
        with open(f, 'r') as policy:
            text = policy.read()
            segments = text.split('|||')

            p = pd.DataFrame(columns=['policy_id', 'segment_id', 'text'])
            p['segment_id'] = np.arange(len(segments))
            policy_url = f.split('\\')[-1].split('_')
            p['policy_id'] = policy_url[0]
            p['url'] = policy_url[1].replace('.html', '')
            p['text'] = segments

            policies.append(p)

    p = pd.concat(policies)
    p.reset_index(inplace=True, drop=True)
    
    return p

In [5]:
def load_annotations(path):        
    annotations = []

    for f in glob(path+'/annotations/*.csv'): 
        a = pd.read_csv(f, sep=',', header=None, names=['annotation_id', 'batch_id', 'annotator_id', 'policy_id', 'segment_id', 'data_practice', 'attributes', 'date', 'url'])
        a['policy_id'] = f.split('\\')[-1].split('_')[0]
        a.drop(['annotation_id', 'batch_id', 'date', 'url', 'attributes'], axis=1, inplace=True)
        annotations.append(a)

    a = pd.concat(annotations)
    a.reset_index(inplace=True, drop=True)
    
    return a

In [6]:
def generate_dataset(path):

    p = load_policies(path)
    a = load_annotations(path)
    
    merged = pd.merge(a, p, on=['policy_id', 'segment_id'], how='outer')
    merged = pd.concat([merged, pd.get_dummies(merged['data_practice'])], axis=1)
    merged = merged.groupby(['policy_id', 'segment_id', 'annotator_id']).max()
    merged = merged.groupby(['policy_id', 'segment_id']).max().reset_index()

    return merged

In [7]:
def attribute_counts(data):
    attributes = data['attributes'].to_list()
    counts = {}

    for a in attributes:
        d = literal_eval(a)

        for k, v in d.items():
            if not k in counts:
                counts[k] = {}
            elif not v['value'] in counts[k]:
                counts[k][v['value']] = 1
            else:
                counts[k][v['value']] += 1

    return counts

In [8]:
df = generate_dataset(path)

In [9]:
df

Unnamed: 0,policy_id,segment_id,data_practice,text,url,Data Retention,Data Security,Do Not Track,First Party Collection/Use,International and Specific Audiences,Other,Policy Change,Third Party Sharing/Collection,"User Access, Edit and Deletion",User Choice/Control
0,1017,0,Policy Change,Privacy Policy <br> <br> Sci-News.com is commi...,sci-news.com,0,0,0,0,0,1,1,0,0,0
1,1017,1,First Party Collection/Use,Information that Sci-News.com May Collect Onli...,sci-news.com,0,0,0,1,0,0,0,0,0,0
2,1017,2,First Party Collection/Use,"- if you contact us, we may keep a record of t...",sci-news.com,1,0,0,1,0,0,0,0,0,0
3,1017,3,First Party Collection/Use,- details of your visits to our site including...,sci-news.com,0,0,0,1,0,0,0,0,0,0
4,1017,4,Other,Sci-News.com does not knowingly collect or sol...,sci-news.com,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3787,995,2,Third Party Sharing/Collection,Information Collected at Mohegan Sun <br> <br>...,mohegansun.com,0,0,0,1,0,0,0,1,0,0
3788,995,3,"User Access, Edit and Deletion",Management of User Information <br> <br> Certa...,mohegansun.com,0,0,0,0,0,1,0,0,1,0
3789,995,4,International and Specific Audiences,Special Note About Children <br> <br> This sit...,mohegansun.com,0,0,0,1,1,0,0,0,0,0
3790,995,5,Other,If You Have a Question <br> <br> If you have a...,mohegansun.com,0,0,0,0,0,1,0,0,0,0


In [12]:
df.sum()

policy_id                               1017101710171017101710171017101710171017101710...
segment_id                                                                          84912
data_practice                           Policy ChangeFirst Party Collection/UseFirst P...
text                                    Privacy Policy <br> <br> Sci-News.com is commi...
url                                     sci-news.comsci-news.comsci-news.comsci-news.c...
Data Retention                                                                        156
Data Security                                                                         375
Do Not Track                                                                           32
First Party Collection/Use                                                           1522
International and Specific Audiences                                                  353
Other                                                                                1763
Policy Cha

In [None]:
for col in col_names:
    labels[col] = labels[col].apply(lambda x: 1 if x>1 else 0)