In [24]:
# Parameters
# Path of this notebook; it is also the lookup key into MAIN_EXPERIMENT.nb_args below.
# NOTE(review): looks like a cell injected by a notebook runner — confirm before editing by hand.
product = "notebooks/structure_by_1_kmeans.ipynb"


In [25]:
import __path__

In [26]:
from config import MAIN_EXPERIMENT


# Exclusive upper bound on a policy's total element count; looked up per notebook via `product`.
MAX_SCORE = MAIN_EXPERIMENT.nb_args[product]
# Keyword arguments unpacked into the KMeans constructor.
KMEANS_HYPERPARAMS = MAIN_EXPERIMENT.kmeans_hyperparams
# Path of the JSON file holding the per-policy statistics records.
DESCRIPTOR = MAIN_EXPERIMENT.dataset.descriptor
# Path prefix under which the rendered figure is written.
PICTURES = MAIN_EXPERIMENT.pictures

In [27]:
import json

from src.utils import resolve_stats


# Legend labels shown on the chart (Table, Unordered list, Ordered list, Heading, Paragraph).
# KEYS[i] labels column i of the score matrix, so it must stay in the same order as ALLOWED.
KEYS = ['Таблица', 'Ненум. список', 'Нум. список', 'Заголовок', 'Параграф']
# Statistics keys used as clustering features; their order defines the score-matrix columns.
ALLOWED = ['tables', 'unordered lists', 'ordered lists', 'headings', 'paragraphs']


# Load the raw per-policy records from the descriptor file.
with open(DESCRIPTOR, 'r', encoding='utf-8') as f:
    stats = json.load(f)

# Keep one record per distinct policy hash: the first one that carries statistics.
# `hashes` remains an ordered list of the accepted hashes; `seen_hashes` mirrors it
# as a set so the membership test is O(1) instead of a scan over a growing list.
# Assumes policy hashes are hashable JSON scalars (e.g. strings).
hashes = []
unique_stats = []
seen_hashes = set()
for s in stats:
    policy_hash = s['policy_hash']
    # A hash is only marked as seen once a record with statistics is accepted, so
    # a later duplicate can still fill in for an earlier record without statistics.
    if policy_hash not in seen_hashes and s['statistics'] is not None:
        seen_hashes.add(policy_hash)
        hashes.append(policy_hash)
        unique_stats.append(resolve_stats(s['statistics']))

# Preview the first two resolved records.
unique_stats[:2]

[{'length': 8472,
  'tables': 0,
  'ordered lists': 0,
  'unordered lists': 6,
  'headings': 0,
  'paragraphs': 10},
 {'length': 2510,
  'tables': 0,
  'ordered lists': 0,
  'unordered lists': 0,
  'headings': 0,
  'paragraphs': 2}]

In [28]:
import numpy as np


# Score matrix: one row per unique policy, one integer column per structural
# element type, in ALLOWED order. The reshape keeps the matrix two-dimensional
# even when `unique_stats` is empty.
scores = np.array(
    [[policy_stats[key] for key in ALLOWED] for policy_stats in unique_stats],
    dtype='int',
).reshape(-1, len(ALLOWED))

# Drop outliers: keep only policies whose total element count is below MAX_SCORE.
# A boolean row mask preserves the (n, len(ALLOWED)) shape even when no row passes,
# whereas rebuilding the array from a filtered list would collapse it to shape (0,).
scores = scores[scores.sum(axis=1) < MAX_SCORE]

In [29]:
from sklearn.cluster import KMeans


# Build the clusterer from the experiment-wide hyperparameters.
# NOTE(review): reproducible clusters need a fixed `random_state` — confirm KMEANS_HYPERPARAMS sets one.
kmeans = KMeans(**KMEANS_HYPERPARAMS)

In [30]:
# Cluster the filtered score matrix (rows: policies, columns: ALLOWED order).
kmeans.fit(scores)

In [31]:
from collections import Counter


# Number of policies assigned to each cluster label.
count = Counter(kmeans.labels_)

In [32]:
# Inspect the first five centroids; columns follow the ALLOWED order.
kmeans.cluster_centers_[:5]

array([[3.84328358e-01, 1.20522388e+00, 3.91791045e-01, 6.00746269e-01,
        4.16753731e+01],
       [1.66666667e-01, 1.03333333e+00, 1.10606061e+00, 6.60606061e-01,
        7.01212121e+01],
       [1.15853659e-01, 4.75609756e-01, 8.84146341e+00, 9.15243902e+00,
        3.15853659e+00],
       [0.00000000e+00, 1.25000000e-01, 7.50000000e-01, 6.46250000e+01,
        1.06250000e+01],
       [1.25000000e-01, 6.60000000e+00, 8.25000000e-01, 6.90000000e+00,
        1.55875000e+02]])

In [33]:
from collections import Counter


# Total of each centroid's coordinates, keyed by cluster index.
sums = {i: sum(c) for i, c in enumerate(kmeans.cluster_centers_)}

# Cluster sizes, indexed by cluster label.
count = Counter(kmeans.labels_)
widths = np.array([count[p] for p in range(kmeans.n_clusters)])

# Cluster indices ordered by ascending centroid total. `sorted` is stable, so
# clusters with equal totals keep their index order.
sorted_keys = sorted(sums, key=sums.get)

# The chart shows the cluster with the largest centroid total first, so widths
# and centroids are both taken in the reverse of `sorted_keys`.
descending_keys = sorted_keys[::-1]
sorted_widths = [widths[k] for k in descending_keys]
sorted_clusters_cnt = kmeans.cluster_centers_[descending_keys]

In [34]:
import plotly.graph_objects as go

from config import COLORS_D


def _axis_style(title_text):
    """Return the styling keyword arguments shared by both axes, with the given title."""
    return dict(
        title=dict(
            font=dict(family='Times New Roman', size=25, color='black'),
            text=title_text),
        zerolinecolor='lightgrey',
        gridcolor='lightgrey',
        tickformat='000',
        showgrid=True,
        tickangle=0,
        ticklen=10)


# Left edge of every bar: each cluster starts where the previous ones end, so
# bar widths are proportional to cluster sizes along the x axis.
bar_left_edges = np.cumsum(sorted_widths) - sorted_widths

# Tallest possible stack: the largest total element count among the clustered policies.
y_upper = max(sum(row) for row in scores)

fig = go.Figure()

# One stacked trace per structural element type; each row of the transposed
# centroid matrix holds that element's value in every cluster.
for element_idx, element_heights in enumerate(sorted_clusters_cnt.transpose()):
    element_bar = go.Bar(
        x=bar_left_edges,
        y=element_heights,
        name=KEYS[element_idx].capitalize(),
        width=sorted_widths,
        offset=0)
    fig.add_trace(element_bar)

fig.update_layout(
    font=dict(family='Times New Roman', size=20, color='black'),
    plot_bgcolor='rgba(0, 0, 0, 0)',
    margin=dict(pad=10),
    colorway=COLORS_D,
    barmode='stack',
    width=1600,
    height=900)

fig.update_xaxes(**_axis_style('Кластеры политик безопасности'))

fig.update_yaxes(
    range=[0, y_upper],
    **_axis_style('Количество структурных элементов'))

fig

In [35]:
# Save the chart as a PNG under PICTURES; the file name mirrors this notebook's name.
# NOTE(review): static image export presumably needs an export engine (e.g. kaleido) installed — confirm.
fig.write_image(f'{PICTURES}/structure_by_1_kmeans.png')