In [1]:

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import nltk
import os
import json

from scripts.groups_opp import groups
from scripts.lda import LDA
from scripts.functions import files


# Fetch the NLTK resources one after another; packages that are already
# present are simply reported as up-to-date.
for resource in ("punkt", "wordnet", "stopwords", "omw-1.4"):
    nltk.download(resource)

# Number of clusters requested from K-Means later in the notebook.
clusters = 25
# Placeholder; replaced by the file_id -> paragraphs mapping once the
# segments file has been read.
policies = {}

# fs = files("resources/datasets/plain_policies", r".*")
# for f in fs:
#     with open(f, "r", encoding="utf-8") as fl:
#         policies.extend([p for p in fl.read().split("\n") if len(p) >= 50])

# Read the segmented policies; the file is closed as soon as parsing is done.
with open("resources/datasets/all_segments.json", "r", encoding="utf-8") as f:
    segments = json.load(f)

# Map every file_id to the list of paragraphs of all its segments.
# Keys are inserted in first-seen order instead of by iterating a set: set
# iteration order is hash-dependent (and changes between interpreter runs for
# string keys), which would reorder the rows of every per-policy matrix built
# from `policies.values()` and make downstream clustering non-reproducible.
policies = {}
for s in segments:
    policies.setdefault(s["file_id"], []).extend(s["paragraphs"])

# Still exposed because the original cell defined it.
ids = set(policies)


[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Bounded preview: paragraph count of the first five policies, rather than
# dumping every paragraph of every policy into the cell output.
{file_id: len(paragraphs) for file_id, paragraphs in list(policies.items())[:5]}



In [3]:
# Flatten the per-policy paragraph lists into a single corpus, keeping the
# iteration order of `policies`.
corpus_paragraphs = [
    paragraph
    for policy_paragraphs in policies.values()
    for paragraph in policy_paragraphs
]

# NOTE(review): `saved` presumably names a previously trained model to reuse
# rather than retrain — confirm against scripts.lda.
iot_tfidf = LDA(
    corpus_paragraphs,
    freq="tf-idf",
    topics_count=15,
    saved="model_20220127_155118",
)

[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Show the topics of the model; the output is a list of
# (topic_id, '"weight*term" + ...') tuples, one per topic.
iot_tfidf.print_topics()

[(0,
  '0.007*"communication" + 0.006*"receipt" + 0.006*"device" + 0.005*"party" + 0.005*"third" + 0.005*"email" + 0.005*"opt" + 0.004*"promotional" + 0.004*"personal" + 0.004*"instruction"'),
 (1,
  '0.009*"healthcare" + 0.005*"secure" + 0.005*"security" + 0.005*"social" + 0.005*"opt" + 0.005*"mail" + 0.005*"network" + 0.004*"data" + 0.004*"personal" + 0.004*"successor"'),
 (2,
  '0.008*"collect" + 0.006*"job" + 0.004*"identifiable" + 0.004*"site" + 0.004*"personally" + 0.004*"cancel" + 0.003*"service" + 0.003*"information" + 0.003*"passive" + 0.003*"use"'),
 (3,
  '0.005*"personal" + 0.004*"service" + 0.004*"health" + 0.004*"party" + 0.004*"conversation" + 0.004*"third" + 0.004*"poll" + 0.004*"promotion" + 0.003*"policy" + 0.003*"linked"'),
 (4,
  '0.008*"child" + 0.007*"email" + 0.006*"personal" + 0.006*"u" + 0.006*"service" + 0.006*"website" + 0.006*"collect" + 0.005*"user" + 0.005*"communication" + 0.005*"information"'),
 (5,
  '0.008*"cooky" + 0.008*"personal" + 0.007*"website" +

In [5]:

# scores[row, column] counts, for the policy at position `row` in
# `policies.values()`, how many (paragraph, topic) hits fall into the group
# whose "id" is `column`.
# NOTE(review): the 9 columns presumably match the number of entries in
# `groups` — confirm against scripts.groups_opp.
scores = np.zeros((len(policies), 9), dtype="int")

# The loop variable is deliberately not called `id`: at notebook top level that
# would overwrite the builtin id() for every later cell.
for policy_idx, policy in enumerate(policies.values()):
    for paragraph in policy:
        # Paragraphs are tokenised by lower-casing and whitespace splitting.
        # NOTE(review): minimum_probability presumably drops topics below 0.5,
        # and the model may expect the same preprocessing as at training time
        # — confirm in scripts.lda.
        topics = iot_tfidf.get_document_topics(paragraph.lower().split(), minimum_probability=.5)
        # First element of every returned entry is the topic id.
        topic_ids = [t[0] for t in topics]

        for g in groups:
            # Add one per returned topic that belongs to this group.
            scores[policy_idx, g["id"]] += sum(
                topic_id in g["topics"] for topic_id in topic_ids
            )

In [6]:
# Inspect the score matrix: one row per policy, one column per group id.
scores

array([[0, 0, 0, ..., 0, 3, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 3, 1],
       ...,
       [0, 0, 0, ..., 0, 2, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
# K-Means with randomly chosen initial centroids and a fixed seed so the
# clustering is repeatable for a given input row order.
# n_init is pinned explicitly: 10 is the historical default and what "auto"
# resolves to for init="random", so results are unchanged, while the
# FutureWarning of scikit-learn 1.2/1.3 about the changing default is avoided.
kmeans = KMeans(
    init="random",
    n_clusters=clusters,
    n_init=10,
    max_iter=500,
    random_state=42,
)

In [8]:
# Cluster the policies on the raw per-group counts; no feature scaling is
# applied to `scores` beforehand.
kmeans.fit(scores)

KMeans(init='random', max_iter=500, n_clusters=25, random_state=42)

In [9]:
# Peek at the first five centroids; each row is one cluster centre expressed
# over the same columns as `scores`.
kmeans.cluster_centers_[:5]

array([[ 0.00000000e+00,  6.00000000e-01,  2.00000000e-01,
         8.60000000e+00,  6.40000000e+00,  1.16000000e+01,
         0.00000000e+00,  1.20000000e+00,  2.00000000e-01],
       [ 0.00000000e+00,  2.77555756e-17,  0.00000000e+00,
         2.00000000e+00,  7.33333333e+00,  6.66666667e+00,
         0.00000000e+00,  1.66666667e+00,  0.00000000e+00],
       [ 1.00000000e+00,  1.33333333e+00,  1.33333333e+00,
         7.33333333e+00,  1.40000000e+01,  2.43333333e+01,
         3.33333333e-01,  2.66666667e+00,  1.00000000e+00],
       [ 5.00000000e-01,  1.00000000e+00,  5.00000000e-01,
         3.50000000e+00,  1.85000000e+01,  1.10000000e+01,
         0.00000000e+00,  1.00000000e+00,  5.00000000e-01],
       [ 0.00000000e+00, -2.77555756e-17,  0.00000000e+00,
         2.20000000e+00,  8.80000000e+00,  1.24000000e+01,
         0.00000000e+00,  8.00000000e-01,  0.00000000e+00]])

In [10]:
from collections import Counter

# Total of all centroid coordinates, keyed by cluster index.
sums = dict(enumerate(map(sum, kmeans.cluster_centers_)))

# Number of policies assigned to each cluster, indexed by cluster label.
count = Counter(kmeans.labels_)
widths = np.array([count[label] for label in range(clusters)])

# Cluster indices ordered by ascending centroid total (stable sort, so equal
# totals keep their index order).
sorted_keys = sorted(sums, key=sums.get)

# Same data rearranged so the cluster with the largest total comes first.
sorted_widths = list(widths[sorted_keys[::-1]])
sorted_clusters = kmeans.cluster_centers_[sorted_keys[::-1]]

In [11]:
import plotly.express as px
import plotly.graph_objects as go
from scripts.functions import resolve_group_name

fig = go.Figure()

# Variable-width stacked bars: every cluster is one bar whose width is the
# number of policies in it, and bars are laid end to end along the x axis.
# The left edges are the same for every trace, so compute them once.
bar_starts = np.cumsum(sorted_widths) - sorted_widths

# One trace per score column; transposing yields that column's centroid value
# for each cluster. `group_id` is used instead of `id` so the builtin id() is
# not overwritten in the notebook namespace.
for group_id, group_heights in enumerate(sorted_clusters.transpose()):
    fig.add_trace(go.Bar(
        name=resolve_group_name(group_id, groups),
        y=group_heights,
        x=bar_starts,
        width=sorted_widths,
        offset=0,
    ))

fig.update_layout(
    font=dict(
        size=20,
        color="#000000",
    ),
    barmode="stack",
    # Dark24 palette with entries 5 and 6 left out.
    colorway=[*px.colors.qualitative.Dark24[:5], *px.colors.qualitative.Dark24[7:]],
    width=1600,
    height=900
)

# NOTE(review): axis ranges are hard-coded; 115 presumably equals the number of
# policies — confirm. The titles use a backtick where an apostrophe was
# probably intended; left untouched here.
fig.update_xaxes(showgrid=True, range=[0,115], title_text='Policies` documents')
# Trailing semicolon keeps the returned figure from being rendered by this cell.
fig.update_yaxes(showgrid=True, range=[1,55], title_text='Documents` paragraphs');

In [12]:
# write_image does not create missing folders, so make sure the target
# directory exists before exporting (no-op when it is already there).
os.makedirs("pictures", exist_ok=True)
fig.write_image("pictures/opp_ClusterizeBy1KMeans.png")
# Render the figure as the cell output.
fig