In [1]:
import json
import plotly.express as px
import plotly.graph_objects as go

from env import env
from html_sanitization.config import conf as html_conf
from html_sanitization.main import main as html_sanitization
from utils.fsys import make_paths


def rgba(c, a):
    """Build a CSS ``rgba(r, g, b, a)`` string from a hex colour and an alpha.

    Args:
        c: Colour as six hex digits, optionally prefixed with ``#``
            (e.g. ``'#3f51b5'``).
        a: Alpha value; inserted into the result with its default string
            formatting.

    Returns:
        A string of the form ``'rgba(R, G, B, A)'`` with decimal channels.

    Raises:
        ValueError: If any two-character channel slice is not valid hex
            (this includes inputs shorter than six digits).
    """
    hex_digits = c.lstrip('#')
    # Channels sit at fixed two-character offsets: rr, gg, bb.
    red, green, blue = (int(hex_digits[pos:pos + 2], 16) for pos in (0, 2, 4))
    return f'rgba({red}, {green}, {blue}, {a})'


# Fully opaque rgba() strings for a 16-colour palette; the plotting code
# picks individual entries by index (e.g. COLORS_D[4]).
COLORS_D = [
    rgba(hex_code, 1.00)
    for hex_code in (
        '#f44336', '#e81e63', '#9c27b0', '#673ab7', '#3f51b5', '#2196f3',
        '#03a9f4', '#00bcd4', '#009688', '#4caf50', '#8bc34a', '#cddc39',
        '#ffeb3b', '#ffc107', '#ff9800', '#ff5722',
    )
]

# Sanitization configuration built from the environment. It supplies the
# statistics file path read below and the per-filter thresholds that the
# plotting cells look up (e.g. CONFIG['short_threshold']).
CONFIG = html_conf(**env())
RESOURCES = 'resources/sanitization'

stats_file = CONFIG['stats_file']

# Read the JSON explicitly as UTF-8 so that loading does not depend on the
# platform's default locale encoding.
with open(stats_file, 'r', encoding='utf-8') as stats_fh:
    stats = json.load(stats_fh)

# Target directory for the images written by figure(); presumably created
# here if missing -- confirm against utils.fsys.make_paths.
make_paths([f'{RESOURCES}/figures'])

[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: 'resources/sanitization/statistics.json'

# FIGURE CODE

In [None]:
def figure(scores, threshold, xname, yname, picture_path, invert=False):
    """Plot scores split at ``threshold``, save the image and return the figure.

    The values of ``scores`` are taken in the mapping's iteration order and
    divided into those strictly below ``threshold`` and those at or above it.
    The two groups are drawn as consecutive line traces (the lower group
    first), with dotted red guide lines and an annotation at the split point,
    plus an annotation showing the total count and the maximum value.

    NOTE(review): the values are not sorted here, so the curve is only
    monotone if ``scores`` is already ordered by value -- confirm that the
    statistics file is written in sorted order.

    Args:
        scores: Mapping whose values are numeric scores; keys are not used.
        threshold: Cut-off separating the two groups.
        xname: Title of the x axis.
        yname: Title of the y axis.
        picture_path: Destination passed to ``fig.write_image``.
        invert: When False, the below-threshold group is labelled
            'Невалидные' (dotted, grey) and the rest 'Валидные' (solid,
            ``COLORS_D[4]``). When True, the labels and styles are swapped.

    Returns:
        The ``plotly.graph_objects.Figure`` that was written to disk.
    """
    values = list(scores.values())
    start = [v for v in values if v < threshold]
    end = [v for v in values if v >= threshold]
    split = len(start)
    total = split + len(end)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=list(range(split)),
            y=start,
            name='Валидные' if invert else 'Невалидные',
            line_dash=None if invert else "dot",
            mode='lines'))
    fig.add_trace(
        go.Scatter(
            x=list(range(split, total)),
            y=end,
            line_dash="dot" if invert else None,
            name='Невалидные' if invert else 'Валидные',
            mode='lines'))

    # Guide lines and a label marking where the threshold splits the data.
    fig.add_vline(x=split, line_width=2, line_dash="dot", line_color="red")
    fig.add_hline(y=threshold, line_width=2, line_dash="dot", line_color="red")
    fig.add_annotation(
        x=split,
        y=threshold,
        text=f'({split}; {threshold})',
        showarrow=True,
        xanchor="right",
    )

    # Take the maximum over one combined list: unpacking the values into
    # max() raises TypeError when there is exactly one value (or none).
    # With no values at all there is nothing to annotate.
    if total:
        peak = max(start + end)
        fig.add_annotation(
            x=total,
            y=peak,
            text=f'({total}; {round(peak, 2)})',
            showarrow=True,
            xanchor="right",
        )

    fig.update_layout(
        font=dict(family='Times New Roman', size=20, color='black'),
        plot_bgcolor='rgba(0, 0, 0, 0)',
        margin=dict(pad=10),
        # colorway is applied in trace order, so the first entry colours the
        # below-threshold trace.
        colorway=[COLORS_D[4], '#919191'] if invert else ['#919191', COLORS_D[4]],
        height=600,
        width=900)

    fig.update_xaxes(
        title=dict(
            font=dict(family='Times New Roman', size=25, color='black'),
            text=xname),
        zerolinecolor='lightgrey',
        gridcolor='lightgrey',
        tickformat='000',
        showgrid=True,
        tickangle=45,
        ticklen=10)

    fig.update_yaxes(
        title=dict(
            font=dict(family='Times New Roman', size=25, color='black'),
            text=yname),
        zerolinecolor='lightgrey',
        gridcolor='lightgrey',
        tickformat='000',
        showgrid=True,
        tickangle=45,
        ticklen=10)

    fig.write_image(picture_path)
    return fig

# SHORTS

In [None]:
# Document lengths against the short-document threshold.
figure(
    scores=stats['short']['stats'],
    threshold=CONFIG['short_threshold'],
    xname='Документы отсортированные по длине',
    yname='Длина документа в символах',
    picture_path=f'{RESOURCES}/figures/short.png',
)

# UPPERCASE

In [None]:
# Share of uppercase letters per document; inverted, so values below the
# threshold are the ones labelled valid.
figure(
    scores=stats['uppercase']['stats'],
    threshold=CONFIG['uppercase_threshold'],
    xname='Документы отсортированные по проценту заглавных литер',
    yname='Процент заглавных литер в документе',
    picture_path=f'{RESOURCES}/figures/uppercase.png',
    invert=True,
)

# FOREIGN

In [None]:
# Share of foreign-language letters per document; inverted, so values below
# the threshold are the ones labelled valid.
figure(
    scores=stats['foreign']['stats'],
    threshold=CONFIG['foreign_threshold'],
    xname='Документы отсортированные по проценту иноязычных литер',
    yname='Процент иноязычных литер',
    picture_path=f'{RESOURCES}/figures/foreign.png',
    invert=True,
)

# FREQUENT N-GRAMS

In [None]:
# Per-document n-gram weight against the frequent-n-gram threshold.
figure(
    scores=stats['frequent']['stats'],
    threshold=CONFIG['frequent_threshold'],
    xname='Документы отсортированные по чистоте целевых n-грамм',
    yname='Суммарный вес документа по n-граммам',
    picture_path=f'{RESOURCES}/figures/frequent.png',
)