In [14]:
# add default values for parameters here

In [15]:
# Parameters
# `product` is used below as the lookup key into MAIN_EXPERIMENT.nb_args.
# NOTE(review): this cell looks like an injected pipeline parameter — confirm
# before editing the value by hand.
product = "notebooks/structure_by_1_kmeans.ipynb"


In [16]:
import __path__

In [17]:
from config import MAIN_EXPERIMENT


# --- Experiment-wide settings taken from the shared configuration object ---

# Path of the JSON descriptor file that is opened and parsed further down.
DESCRIPTOR = MAIN_EXPERIMENT.dataset.descriptor

# Directory prefix under which the final figure is written.
PICTURES = MAIN_EXPERIMENT.pictures

# Keyword arguments forwarded verbatim to the KMeans constructor.
KMEANS_HYPERPARAMS = MAIN_EXPERIMENT.kmeans_hyperparams

# --- Notebook-specific argument, keyed by this notebook's `product` ---

# Exclusive upper bound on a document's total element count; rows whose
# sum reaches it are filtered out before clustering.
MAX_SCORE = MAIN_EXPERIMENT.nb_args[product]

In [18]:
import json

from src.utils import resolve_stats


# Display labels for the structural element types. KEYS[i] is used as the
# legend name of the plot trace built from the i-th feature column, so this
# list must stay aligned index-by-index with ALLOWED.
KEYS = ['Таблица', 'Ненум. список', 'Нум. список', 'Заголовок', 'Параграф']
# Keys read from each resolved statistics dict; their order defines the
# column order of the score matrix that is clustered below.
ALLOWED = ['tables', 'unordered lists', 'ordered lists', 'headings', 'paragraphs']


# Load the dataset descriptor: a JSON sequence of records, each carrying a
# 'policy_hash' and a 'statistics' entry.
with open(DESCRIPTOR, 'r', encoding='utf-8') as f:
    stats = json.load(f)

# Keep the statistics of the first usable record for every policy hash.
# A record whose 'statistics' is None does not claim its hash, so a later
# record with the same hash and real statistics is still picked up.
#
# Membership is tested against a set (O(1) per lookup) instead of scanning
# the `hashes` list, which made the de-duplication quadratic. The list is
# still maintained so the first-seen order of hashes remains available.
seen_hashes = set()
hashes = []
unique_stats = []
for s in stats:
    policy_hash = s['policy_hash']
    if policy_hash in seen_hashes or s['statistics'] is None:
        continue
    seen_hashes.add(policy_hash)
    hashes.append(policy_hash)
    unique_stats.append(resolve_stats(s['statistics']))

# Show a small sample as a sanity check.
unique_stats[:2]

[{'length': 8472,
  'tables': 0,
  'ordered lists': 0,
  'unordered lists': 6,
  'headings': 0,
  'paragraphs': 10},
 {'length': 8493,
  'tables': 0,
  'ordered lists': 0,
  'unordered lists': 0,
  'headings': 9,
  'paragraphs': 34}]

In [19]:
import numpy as np


# Feature matrix: one row per unique policy, one column per structural
# element type, in ALLOWED order. The column count is derived from ALLOWED
# rather than hard-coded, and the explicit reshape keeps the matrix 2-D
# even when `unique_stats` is empty.
all_scores = np.array(
    [[doc_stats[key] for key in ALLOWED] for doc_stats in unique_stats],
    dtype='int',
).reshape(len(unique_stats), len(ALLOWED))

# Keep only documents whose total element count is below MAX_SCORE.
# A boolean mask preserves the (n_docs, n_features) shape and integer dtype
# even if no row passes the filter.
scores = all_scores[all_scores.sum(axis=1) < MAX_SCORE]

In [20]:
from sklearn.cluster import KMeans


# Build the estimator from the experiment's hyperparameters.
# NOTE(review): cluster assignments are only reproducible across runs if
# KMEANS_HYPERPARAMS fixes `random_state` — confirm in config.
kmeans = KMeans(**KMEANS_HYPERPARAMS)

In [21]:
# Cluster the filtered per-document element counts.
kmeans.fit(scores)

In [22]:
from collections import Counter


# Tally how many rows of `scores` were assigned to each cluster label.
count = Counter(kmeans.labels_)

In [23]:
# Peek at up to the first five centroids; columns follow the ALLOWED order
# that was used to build `scores`.
kmeans.cluster_centers_[:5]

array([[4.45205479e-02, 3.80136986e-01, 1.71232877e-02, 9.75684932e+00,
        5.60171233e+01],
       [9.75609756e-02, 6.17073171e+00, 9.26829268e-01, 6.87804878e+00,
        1.61560976e+02],
       [2.40963855e-02, 7.28915663e-01, 1.62650602e-01, 6.62650602e-01,
        4.27289157e+01],
       [1.24096386e-01, 2.10843373e-01, 2.08433735e-01, 3.92771084e-01,
        1.21566265e+00],
       [6.85920578e-02, 5.48736462e-01, 8.14801444e+00, 5.09025271e+00,
        2.49097473e+00]])

In [24]:
from collections import Counter


centers = kmeans.cluster_centers_

# Cluster index -> sum of that centroid's coordinates.
sums = {idx: sum(center) for idx, center in enumerate(centers)}

# Number of samples assigned to each cluster, indexed by cluster label.
count = Counter(kmeans.labels_)
widths = np.array([count[label] for label in range(kmeans.n_clusters)])

# Cluster indices ordered by ascending centroid sum (stable for ties).
sorted_keys = sorted(sums, key=sums.get)

# Widths and centroids are presented in the opposite (descending) order,
# obtained by walking the ascending order backwards.
descending = sorted_keys[::-1]
sorted_widths = [widths[cluster] for cluster in descending]
sorted_clusters_cnt = np.array([centers[cluster] for cluster in descending])

In [25]:
import plotly.graph_objects as go

from config import COLORS_D


fig = go.Figure()

# Variable-width stacked bars: each cluster gets one bar whose width is the
# number of samples in it, and bars are laid side by side, so the left edge
# of a bar is the running total of the widths before it.
bar_starts = np.cumsum(sorted_widths) - sorted_widths

# One trace per structural element type (a column of the centroid matrix).
for element_idx, centroid_values in enumerate(sorted_clusters_cnt.T):
    bar = go.Bar(
        x=bar_starts,
        y=centroid_values,
        name=KEYS[element_idx].capitalize(),
        width=sorted_widths,
        offset=0,
    )
    fig.add_trace(bar)

font_family = 'Times New Roman'
font_color = 'black'

fig.update_layout(
    font=dict(family=font_family, size=20, color=font_color),
    plot_bgcolor='rgba(0, 0, 0, 0)',
    margin=dict(pad=10),
    colorway=COLORS_D,
    barmode='stack',
    width=1600,
    height=900,
)

# Both axes share the same grid/tick styling and title font.
title_font = dict(family=font_family, size=25, color=font_color)
axis_style = dict(
    zerolinecolor='lightgrey',
    gridcolor='lightgrey',
    tickformat='000',
    showgrid=True,
    tickangle=45,
    ticklen=10,
)

fig.update_xaxes(
    title=dict(font=title_font, text='Кластеры структуры документов'),
    **axis_style,
)

# The y-axis spans up to the largest per-document total in `scores`.
fig.update_yaxes(
    title=dict(font=title_font, text='Количество структурных элементов'),
    range=[0, max(sum(row) for row in scores)],
    **axis_style,
)

fig

In [26]:
# Export the figure as a PNG under the experiment's pictures directory.
# NOTE(review): assumes the PICTURES directory already exists and that a
# static-image export backend for plotly is installed — confirm.
fig.write_image(f'{PICTURES}/structure_by_1_kmeans.png')