In [25]:
# Parameters
# Notebook path string; the config cell below uses it as the lookup key into
# MAIN_EXPERIMENT.nb_args to fetch this notebook's argument.
# NOTE(review): presumably injected by the pipeline runner — confirm before editing by hand.
product = "notebooks/structure_by_1_kmeans_wo_hp.ipynb"


In [26]:
import __path__

In [27]:
from config import MAIN_EXPERIMENT


# Per-notebook argument keyed by `product`; later used as the exclusive upper
# bound on a policy's summed structural-element counts when filtering rows.
MAX_SCORE = MAIN_EXPERIMENT.nb_args[product]
# Keyword arguments unpacked directly into the KMeans constructor.
KMEANS_HYPERPARAMS = MAIN_EXPERIMENT.kmeans_hyperparams
# Path of the JSON descriptor file that is opened and parsed below.
DESCRIPTOR = MAIN_EXPERIMENT.dataset.descriptor
# Directory prefix under which the final figure is written.
PICTURES = MAIN_EXPERIMENT.pictures

In [28]:
import json

from src.utils import resolve_stats


# Russian display labels used as trace names in the chart; they are listed in
# the same order as ALLOWED (table, unordered list, ordered list, heading).
KEYS = ['Таблица', 'Ненум. список', 'Нум. список', 'Заголовок']
# Statistic keys read from every policy record; their order fixes the column
# order of the score matrix built later.
ALLOWED = ['tables', 'unordered lists', 'ordered lists', 'headings']

with open(DESCRIPTOR, 'r', encoding='utf-8') as f:
    stats = json.load(f)

# Keep the first record that has statistics for each distinct policy hash.
# `hashes` preserves first-seen order (as before); `seen_hashes` mirrors it as
# a set so the membership test is O(1) instead of a linear scan of the list.
# NOTE(review): assumes policy_hash values are hashable (e.g. strings) — confirm.
hashes = []
unique_stats = []
seen_hashes = set()
for s in stats:
    policy_hash = s['policy_hash']
    # A record without statistics does not claim its hash, so a later record
    # with the same hash and real statistics is still accepted.
    if policy_hash in seen_hashes or s['statistics'] is None:
        continue
    seen_hashes.add(policy_hash)
    hashes.append(policy_hash)
    unique_stats.append(resolve_stats(s['statistics']))

print(DESCRIPTOR)
unique_stats[:5]

/mnt/Source/kuznetsovmd/ppr-sanitization/resources/finalized/output.json


[{'length': 8472,
  'tables': 0,
  'ordered lists': 0,
  'unordered lists': 6,
  'headings': 0,
  'paragraphs': 10},
 {'length': 2510,
  'tables': 0,
  'ordered lists': 0,
  'unordered lists': 0,
  'headings': 0,
  'paragraphs': 2},
 {'length': 8493,
  'tables': 0,
  'ordered lists': 0,
  'unordered lists': 0,
  'headings': 9,
  'paragraphs': 34},
 {'length': 12282,
  'tables': 0,
  'ordered lists': 2,
  'unordered lists': 12,
  'headings': 10,
  'paragraphs': 4},
 {'length': 12597,
  'tables': 0,
  'ordered lists': 0,
  'unordered lists': 1,
  'headings': 10,
  'paragraphs': 67}]

In [29]:
import numpy as np


# Score matrix: one row per unique policy, one integer column per feature in
# ALLOWED (column order follows ALLOWED). The width is derived from ALLOWED so
# the matrix stays in sync if the feature list changes.
scores = np.zeros((len(unique_stats), len(ALLOWED)), dtype='int')

for row, policy_stats in enumerate(unique_stats):
    for col, feature in enumerate(ALLOWED):
        scores[row, col] = policy_stats[feature]

# Keep only policies whose total element count is strictly below MAX_SCORE.
# A boolean mask preserves the 2-D shape and integer dtype even when no row
# passes, whereas rebuilding from a Python list collapses to an empty 1-D array.
scores = scores[scores.sum(axis=1) < MAX_SCORE]

In [30]:
from sklearn.cluster import KMeans


# Build the (not yet fitted) estimator; every setting comes from the experiment config.
# NOTE(review): confirm KMEANS_HYPERPARAMS pins random_state, otherwise the
# clustering (and the figure) may differ between runs.
kmeans = KMeans(**KMEANS_HYPERPARAMS)

In [31]:
# Cluster the filtered per-policy feature counts; the fitted labels_ and
# cluster_centers_ attributes are read by the cells below.
kmeans.fit(scores)

In [32]:
from collections import Counter


# Number of policies assigned to each cluster label.
# NOTE(review): the same value is recomputed in the cluster-ordering cell further down.
count = Counter(kmeans.labels_)

In [33]:
# Peek at the first five cluster centers; columns follow the order of ALLOWED.
kmeans.cluster_centers_[:5]

array([[0.05226481, 0.21602787, 0.13937282, 5.48780488],
       [0.08536585, 9.17073171, 0.31707317, 8.17073171],
       [0.02867384, 0.43010753, 8.45519713, 0.39784946],
       [0.08424697, 0.16156953, 0.03981535, 1.        ],
       [0.025     , 0.25      , 1.45      , 2.5       ]])

In [34]:
from collections import Counter


# Total of all feature means per cluster, keyed by cluster index.
sums = {cluster_idx: sum(center) for cluster_idx, center in enumerate(kmeans.cluster_centers_)}

# Cluster sizes (number of policies per label), indexed by cluster index.
count = Counter(kmeans.labels_)
widths = np.array([count[label] for label in range(kmeans.n_clusters)])

# Cluster indices ordered by ascending total; the stable sort keeps ties in
# index order. Sizes and centers are then laid out in the reverse of that
# order, i.e. from the "richest" cluster to the "poorest".
sorted_keys = sorted(sums, key=sums.get)
sorted_widths = [widths[cluster_idx] for cluster_idx in sorted_keys[::-1]]
sorted_clusters = np.array([kmeans.cluster_centers_[cluster_idx] for cluster_idx in sorted_keys[::-1]])

In [35]:
import plotly.graph_objects as go

from config import COLORS_D


fig = go.Figure()

# Variable-width stacked bars: each cluster's bar starts where the previous
# one ends, so its left edge is the running total of the preceding widths.
bar_starts = np.cumsum(sorted_widths) - sorted_widths

# One trace per structural feature; each trace holds that feature's mean
# count in every cluster (a column of the centers matrix).
for feature_idx, feature_means in enumerate(sorted_clusters.transpose()):
    feature_bar = go.Bar(
        x=bar_starts,
        y=feature_means,
        name=KEYS[feature_idx].capitalize(),
        width=sorted_widths,
        offset=0,
    )
    fig.add_trace(feature_bar)

# Title font and grid/tick styling are identical for both axes, so they are
# defined once and reused.
axis_title_font = dict(family='Times New Roman', size=25, color='black')
axis_style = dict(
    zerolinecolor='lightgrey',
    gridcolor='lightgrey',
    tickformat='000',
    showgrid=True,
    tickangle=0,
    ticklen=10,
)

# Global figure look: stacked bars on a transparent plot area, fixed canvas size.
fig.update_layout(
    width=1600,
    height=900,
    barmode='stack',
    colorway=COLORS_D,
    margin=dict(pad=10),
    plot_bgcolor='rgba(0, 0, 0, 0)',
    font=dict(family='Times New Roman', size=20, color='black'),
)

fig.update_xaxes(
    title=dict(text='Кластеры политик безопасности', font=axis_title_font),
    **axis_style,
)

# The y-axis is capped at the largest per-policy total found in `scores`.
largest_policy_total = max(sum(policy_row) for policy_row in scores)
fig.update_yaxes(
    title=dict(text='Количество структурных элементов', font=axis_title_font),
    range=[0, largest_policy_total],
    **axis_style,
)

fig

In [36]:
# Save the chart as a PNG under the PICTURES directory.
# NOTE(review): plotly static export needs an image engine (e.g. kaleido)
# installed in the kernel environment — confirm it is available.
fig.write_image(f'{PICTURES}/structure_by_1_kmeans_wo_hp.png')