<h1>Preparing Data</h1>

In [1]:
from collections import Counter
from scripts.groups_iot import groups
from scripts.lda import LDA
from scripts.functions import files


# Build the corpus: one entry per policy file, each entry being the list of
# that file's newline-separated chunks that are at least 100 characters long.
policies = []

# NOTE(review): `files` presumably returns the paths under the directory whose
# names match the regex — confirm in scripts.functions.
fs = files("resources/datasets/plain_policies", r".*")
for policy_path in fs:
    with open(policy_path, "r", encoding="utf-8") as policy_file:
        raw_lines = policy_file.read().split("\n")
    # Short lines are dropped; only chunks of 100+ characters are kept.
    policies.append([line for line in raw_lines if len(line) >= 100])

In [2]:
# Peek at the first policy: the list of its chunks that passed the length filter.
policies[0]

['Thank you for visiting this website (the Website). Vornado Air LLC (the Company, we, or us) is committed to protecting your privacy. We strive to keep your Personal Information (as defined below) confidential.',
 'This Privacy Policy describes: (i) the customer information we collect, including Personal Information, how long we retain it and why; (ii) how we use information and Personal Information and when (and if) we share it with third parties; (iii) the choices you can make about how your Personal Information is collected, used and shared; (iv) how to correct, delete or transfer your Personal Information; and (v) the data security, accuracy and access measures we have adopted to protect Personal Information under our control from loss, misuse or alteration.',
 'BY USING THE WEBSITE, YOU ARE CONSENTING TO THIS PRIVACY POLICY. THIS PRIVACY POLICY IS INCORPORATED INTO AND MADE PART OF OUR {REMOVED HREF} TERMS OF SERVICE. PLEASE READ THE ENTIRE PRIVACY POLICY AND ALSO OUR TERMS OF SE

In [3]:
import numpy as np

# Topic model over every retained paragraph of every policy.
# NOTE(review): `saved=` presumably makes the project's LDA wrapper load an
# already-trained model instead of training a new one — confirm in scripts.lda.
iot_tfidf = LDA(
    [par for policy in policies for par in policy],
    freq="tf-idf", topics_count=15, saved="model_20220127_160914"
)

# One counter per aspect group, indexed by the group's "id".
# Sized from `groups` instead of a hard-coded 14 so the list follows the
# group definitions (assumes ids run 0..len(groups)-1 — TODO confirm).
scores = [0] * len(groups)

for policy in policies:
    for paragraph in policy:
        # Tokenisation here is a plain lowercase + whitespace split.
        # NOTE(review): minimum_probability=.7 presumably drops topics whose
        # probability is below 0.7 — confirm against the LDA wrapper.
        topics = iot_tfidf.get_document_topics(
            paragraph.lower().split(), minimum_probability=.7
        )

        for g in groups:
            for t in topics:
                # t[0] is presumably the topic id of a (topic_id, probability)
                # pair; a hit is counted when that id belongs to the group.
                if t[0] in g["topics"]:
                    scores[g["id"]] += 1

# Total number of hits across all groups; used later to turn counts into percentages.
aspects_sum = sum(scores)

[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Show the per-group hit counts (list position = group id).
scores

[56, 131, 4, 40, 16, 4, 12, 2, 65, 204, 1, 2, 60, 248]

In [5]:
# Names of the aspect groups, in the order they appear in `groups`.
keys = [
    group["name"]
    for group in groups
]

In [6]:
# Show the group names collected from `groups`.
keys

['Organization contact information',
 'Policy change',
 'Legal basis & Data protection',
 'First party collection: Opt-in, opt-out<br>for marketing purposes',
 'First party collection: tracking data (cookies)',
 'First party collection: Data about employees',
 'First party collection: Account information',
 'Special audience: California residents',
 'Special audience: Children',
 'First party collection: Generic information',
 'Third party sharing in case of<br>merge and acquisition',
 'First party collection: Other',
 'Data security',
 'Third party sharing: Advertising and analytics']

In [23]:
import plotly.express as px
import plotly.graph_objects as go
from scripts.functions import resolve_group_name

# Bar chart with one trace per aspect group; each bar's height is that group's
# share of all counted hits, expressed in percent.
fig = go.Figure()

fig.update_layout(
    font=dict(
        size=18,
        color="#000000",
    ),
    # Palette: Dark24 entries 0-4 and 7-15, in reverse order. The traces below
    # are also added in reverse order.
    colorway=list(reversed([*px.colors.qualitative.Dark24[:5], *px.colors.qualitative.Dark24[7:16]])),
    showlegend=True,
    legend=dict(
        yanchor="top",
        xanchor="left",
        x=1,
        y=1
    ),
    barmode="group",
    width=1000,
    height=650
)

# Walk the groups from the last id to the first, adding one single-value bar each.
# NOTE(review): raises ZeroDivisionError if aspects_sum is 0, i.e. when no
# paragraph matched any group.
for i, v in reversed(list(enumerate(scores))):
    fig.add_bar(
        name=resolve_group_name(i, groups),
        y=[v / aspects_sum * 100],
    )

# The x-axis title previously used a backtick in place of the possessive
# apostrophe; a double-quoted string lets the apostrophe be written directly.
fig.update_xaxes(showgrid=False, title_text="Policies' aspects", showticklabels=False)
fig.update_yaxes(showgrid=True, title_text='Percent of aspect in corpus')

# End the cell on a statement so the value of the previous call is not
# echoed as the cell output.
pass

In [24]:
# Export the chart as a static PNG, then show the figure as the cell output.
# NOTE(review): the "pictures/" directory presumably has to exist already — confirm.
fig.write_image("pictures/iot_ClusterizeBars.png")
fig