<h1>Preparing Data</h1>

In [1]:
from collections import Counter
from scripts.groups_opp import groups
from scripts.lda import LDA
from scripts.functions import files
import os
import json


# Read every annotated segment. Each record is accessed below through its
# "file_id" and "paragraphs" keys.
with open("resources/datasets/all_segments.json", "r", encoding="utf-8") as f:
    segments = json.load(f)

# Map each policy (file id) to the flat list of its paragraphs.
# The dict is filled in file order: plain dicts keep insertion order, so the
# corpus flattened from `policies.values()` is the same on every run. Seeding
# the keys from a set gives no such guarantee (for string ids the iteration
# order can differ between kernel sessions).
policies = {}
for segment in segments:
    # setdefault also registers a policy whose segments hold no paragraphs.
    policies.setdefault(segment["file_id"], []).extend(segment["paragraphs"])

# The set of distinct policy ids; still defined so the cell exposes the same
# names as before.
ids = set(policies)

In [2]:
import numpy as np

# Topic model over every paragraph of every policy.
# NOTE(review): `saved` presumably names a previously trained model to reuse
# instead of retraining - confirm in scripts.lda.
iot_tfidf = LDA(
    [par for policy in policies.values() for par in policy],
    freq="tf-idf", topics_count=15, saved="model_20220127_155118"
)

# One counter per aspect group; a group's "id" is used as the list index, so
# ids must fall in 0..8.
scores = [0] * 9

# Iterate the policies directly: the enumerate() index was never used and was
# bound to `id`, shadowing the builtin for the rest of the kernel session.
for policy in policies.values():
    for paragraph in policy:
        # Lower-cased, whitespace-split tokens are what the model is queried with.
        # NOTE(review): minimum_probability=.5 is presumably a probability floor
        # on the returned topics - confirm in scripts.lda.
        topics = iot_tfidf.get_document_topics(
            paragraph.lower().split(), minimum_probability=.5
        )

        for g in groups:
            # t[0] is compared against the group's "topics"; every match adds
            # one hit to that group's counter.
            scores[g["id"]] += sum(1 for t in topics if t[0] in g["topics"])

# Total number of hits across all groups; used later to turn counts into percentages.
aspects_sum = sum(scores)

[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Show every topic as a (topic id, weighted top-terms string) pair; the
# returned list is rendered as this cell's output.
iot_tfidf.print_topics()

[(0,
  '0.007*"communication" + 0.006*"receipt" + 0.006*"device" + 0.005*"party" + 0.005*"third" + 0.005*"email" + 0.005*"opt" + 0.004*"promotional" + 0.004*"personal" + 0.004*"instruction"'),
 (1,
  '0.009*"healthcare" + 0.005*"secure" + 0.005*"security" + 0.005*"social" + 0.005*"opt" + 0.005*"mail" + 0.005*"network" + 0.004*"data" + 0.004*"personal" + 0.004*"successor"'),
 (2,
  '0.008*"collect" + 0.006*"job" + 0.004*"identifiable" + 0.004*"site" + 0.004*"personally" + 0.004*"cancel" + 0.003*"service" + 0.003*"information" + 0.003*"passive" + 0.003*"use"'),
 (3,
  '0.005*"personal" + 0.004*"service" + 0.004*"health" + 0.004*"party" + 0.004*"conversation" + 0.004*"third" + 0.004*"poll" + 0.004*"promotion" + 0.003*"policy" + 0.003*"linked"'),
 (4,
  '0.008*"child" + 0.007*"email" + 0.006*"personal" + 0.006*"u" + 0.006*"service" + 0.006*"website" + 0.006*"collect" + 0.005*"user" + 0.005*"communication" + 0.005*"information"'),
 (5,
  '0.008*"cooky" + 0.008*"personal" + 0.007*"website" +

In [4]:
# Inspect the raw hit counts per aspect group (list position = group "id").
scores

[6, 27, 33, 304, 475, 904, 2, 87, 17]

In [11]:
import plotly.express as px
import plotly.graph_objects as go
from scripts.functions import resolve_group_name

# Bar chart: each aspect group's share of all aspect hits, in percent.
fig = go.Figure()

# Nine colours taken from Dark24 (entries 0-4 and 7-10), one per group.
palette = [
    *px.colors.qualitative.Dark24[:5],
    *px.colors.qualitative.Dark24[7:11],
]

fig.update_layout(
    font=dict(size=18, color="#000000"),
    # Bars are added from the last group to the first (see loop below), so the
    # palette is reversed to keep colour k attached to group k.
    colorway=palette[::-1],
    showlegend=True,
    # Anchor the legend's top-left corner at the right edge of the plot area.
    legend=dict(yanchor="top", xanchor="left", x=1, y=1),
    barmode="group",
    width=1000,
    height=650,
)

# One single-bar trace per group, added in descending group order.
for group_id in reversed(range(len(scores))):
    # Guard the percentage: with no hits at all every share is reported as 0
    # instead of raising ZeroDivisionError.
    share = scores[group_id] / aspects_sum * 100 if aspects_sum else 0.0
    fig.add_bar(
        name=resolve_group_name(group_id, groups),
        y=[share],
    )

# Axis title uses a real apostrophe (the original string had a stray backtick).
fig.update_xaxes(showgrid=False, title_text="Policies' aspects", showticklabels=False)
# Trailing semicolon keeps the figure returned by update_yaxes from being
# rendered as this cell's output.
fig.update_yaxes(showgrid=True, title_text='Percent of aspect in corpus');

In [12]:
# Create the output folder first so the export does not fail on a checkout
# where "pictures/" is missing.
os.makedirs("pictures", exist_ok=True)
fig.write_image("pictures/opp_ClusterizeBars.png")

# Last expression of the cell: render the figure inline.
fig