In [1]:
# add default values for parameters here

In [2]:
# Parameters
product = "notebooks/structure_heatmap.ipynb"


In [3]:
import __path__

In [4]:
from config import MAIN_EXPERIMENT


DESCRIPTOR = MAIN_EXPERIMENT.dataset.descriptor
DOCUMENTS = MAIN_EXPERIMENT.dataset.documents
PICTURES = MAIN_EXPERIMENT.pictures

In [5]:
import plotly.graph_objects as go
import numpy as np

from config import COLORS_D

i = 0
def plot(z, m, s, t, type_='linear'):
    fig = go.Figure()

    fig.add_trace(
        go.Heatmap(
            x=[int((i+1)*s+m) for i in range(len(z))], y=[''], z=[z],
            colorscale=[[0.0, COLORS_D[4]], [1.0, '#f0f0f0']],
            colorbar=dict(
                lenmode='fraction', 
                len=2.25, 
                thickness=20,
                ticks="outside",
                tick0=0,
                tickformat='000',
                dtick=max(z)//5)))

    fig.update_layout(
        font=dict(family='Times New Roman', size=30, color='black'),
        plot_bgcolor='rgba(0, 0, 0, 0)',
        title=t,
        margin=dict(pad=10),
        showlegend=False,
        height=250,
        width=800)

    fig.update_xaxes(
        tickformat='000',
        tickmode='array',
        tick0=0,
        tickvals=np.arange(m, len(z)*s+m, s*5),
        tickangle=65,
        type=type_)

    fig.update_yaxes(
        side="left",
        title=dict(
            font=dict(family='Times New Roman', size=25, color='black')),
        tickformat='000',
        tickangle=0,)

    global i
    i += 1
    fig.write_image(f'{PICTURES}/structure_heatmap_{type_}{i}.png')
    return fig

In [6]:
from src.utils import check_files


fs = check_files(DOCUMENTS, r'.*')

In [7]:
documents_lengths = []
for f in fs:
    with open(f, 'r', encoding='utf-8') as fl:
        documents_lengths.append(len(fl.read()))

In [8]:
paragraphs_lengths = []
for f in fs:
    with open(f, 'r', encoding='utf-8') as fl:
        paragraphs_lengths.extend([len(p) for p in fl.read().split('\n\n')])

In [9]:
import json

from src.utils import resolve_stats



with open(DESCRIPTOR, 'r', encoding='utf-8') as f:
    stats = json.load(f)

hashes = []
unique_stats = []
for s in stats:
    if s['policy_hash'] not in hashes and s['statistics'] is not None:
        hashes.append(s['policy_hash'])
        unique_stats.append(resolve_stats(s['statistics']))

In [10]:
doc_len = sum(documents_lengths) / len(fs)
par_len = sum(paragraphs_lengths) / len(fs)
heads = sum([stat['headings'] for stat in unique_stats]) / len(fs)
pars = sum([stat['paragraphs'] for stat in unique_stats]) / len(fs)
tables = sum([stat['tables'] for stat in unique_stats]) / len(fs)
ols = sum([stat['ordered lists'] for stat in unique_stats]) / len(fs)
uls = sum([stat['unordered lists'] for stat in unique_stats]) / len(fs)

print(f'{doc_len=}')
print(f'{par_len=}')
print(f'{heads=}')
print(f'{pars=}')
print(f'{tables=}')
print(f'{ols=}')
print(f'{uls=}')

s = heads + pars + tables + ols + uls

aheads = heads / s
apars = pars / s
atables = tables / s
aols = ols / s
auls = uls / s

print(f'{aheads=}')
print(f'{apars=}')
print(f'{atables=}')
print(f'{aols=}')
print(f'{auls=}')

print(9.5 + 85.1 + 0.5 + 2.2 + 2.7)

doc_len=11053.301864181092
par_len=10928.099733688416
heads=3.5287616511318243
pars=36.87829560585885
tables=0.16524633821571239
ols=0.9952063914780293
uls=1.196005326231691
aheads=0.08251804430273328
apars=0.8623775509568619
atables=0.0038641897656575977
aols=0.02327232418092255
auls=0.027967890793824772
100.0


In [11]:
from src.utils import make_hist3


heats = make_hist3(documents_lengths, 100, 500)
fig = plot(*heats, 'Длины документов')
fig

In [12]:
heats = make_hist3(paragraphs_lengths, 100, 10)
fig = plot(*heats, 'Длины параграфов')
fig

In [13]:
heats = make_hist3([stat['headings'] for stat in unique_stats][1:], 75, 1)
fig = plot(*heats, 'Заголовки')
fig

In [14]:
heats = make_hist3([stat['paragraphs'] for stat in unique_stats][1:], 75, 10)
fig = plot(*heats, 'Параграфы')
fig

In [15]:
heats = make_hist3([stat['tables'] for stat in unique_stats][1:], 75, 1)
fig = plot(*heats, 'Таблицы')
fig

In [16]:
heats = make_hist3([stat['unordered lists'] for stat in unique_stats][1:], 75, 1)
fig = plot(*heats, 'Ненумерованные списки')
fig

In [17]:
heats = make_hist3([stat['ordered lists'] for stat in unique_stats][1:], 75, 1)
fig = plot(*heats, 'Нумерованные списки')
fig