In [1]:
# add default values for parameters here

In [2]:
# Parameters
# NOTE(review): this cell looks like it is injected by the pipeline runner
# (papermill-style parameter cell) — confirm before editing it by hand.
# `product` is used further down as the lookup key into MAIN_EXPERIMENT.nb_args.
product = "notebooks/cluster_by_1_kmeans.ipynb"


In [3]:
import __path__

In [4]:
from config import MAIN_EXPERIMENT

# Notebook-specific arguments, looked up by this notebook's `product` path.
# MAX_SCORE is the cutoff used when filtering policies by total score;
# WIDTH is the wrap width applied to legend labels in the figure.
MAX_SCORE, WIDTH = MAIN_EXPERIMENT.nb_args[product]
# Keyword arguments that are unpacked into the KMeans constructor.
KMEANS_HYPERPARAMS = MAIN_EXPERIMENT.kmeans_hyperparams
# Minimum topic weight for a paragraph to count towards a topic.
# NOTE(review): the config attribute is spelled `affiliation_theshold` (sic);
# it has to match the definition in `config`, so do not "fix" it here alone.
AFFILIATION_TH = MAIN_EXPERIMENT.affiliation_theshold
# Path of the JSON file holding the preprocessed policies.
PREPROCESSED_FILE = MAIN_EXPERIMENT.preprocessed_file
# Identifier handed to `load` to restore the LDA model, dictionary and corpus.
MAIN_MODEL = MAIN_EXPERIMENT.main_model
# Directory prefix under which the resulting figure is written.
PICTURES = MAIN_EXPERIMENT.pictures
# Topic groups; each entry is indexed with the keys 'id' and 'topics' below.
GROUPS = MAIN_EXPERIMENT.groups

In [5]:
import json


# Read the preprocessed policies from disk. The encoding is pinned to UTF-8
# (the JSON interchange encoding) so the result does not depend on the
# locale of the machine the notebook happens to run on.
with open(PREPROCESSED_FILE, 'r', encoding='utf-8') as f:
    preprocessed = json.load(f)

# Number of top-level entries in the loaded data; used as the row count of
# the score matrix.
FILES_CNT = len(preprocessed)

In [6]:
from gensim.models import TfidfModel

from src.utils import load


# Restore the three artefacts returned by the project's `load` helper for the
# main model: the LDA model, its dictionary and the corpus.
lda, dictionary, corpus = load(MAIN_MODEL)
# Build a TF-IDF model over that corpus; paragraphs are TF-IDF weighted
# before being passed to the LDA model below.
tfidf = TfidfModel(corpus, id2word=dictionary)

In [7]:
import numpy as np


# Hit counters: one row per policy, columns addressed by each group's 'id'.
scores = np.zeros((FILES_CNT, len(GROUPS)), dtype='int')

for row, policy in enumerate(preprocessed):
    for paragraph in policy:
        bag_of_words = dictionary.doc2bow(paragraph)
        # Keep only the topics whose weight for this paragraph is above the
        # affiliation threshold.
        confident_topics = [
            topic_id
            for topic_id, weight in lda[tfidf[bag_of_words]]
            if weight > AFFILIATION_TH
        ]

        for group in GROUPS:
            # Each confident topic that belongs to the group is one hit.
            hits = sum(
                1 for topic_id in confident_topics
                if topic_id in group['topics']
            )
            # Touch the matrix only when something was found, so groups
            # without hits are never indexed.
            if hits:
                scores[row, group['id']] += hits

In [8]:
# Drop policies whose total score reaches MAX_SCORE. A boolean row mask is
# used instead of rebuilding the array from a Python list: the result stays
# two-dimensional and integer-typed even if no row passes the filter, which
# the list-based version would collapse into an empty 1-D float array.
scores = scores[scores.sum(axis=1) < MAX_SCORE]

# Preview the first rows of the filtered matrix.
scores[:5]

array([[ 4,  6,  6,  1, 23,  4,  2,  1,  0,  2],
       [ 6, 10,  7,  6, 35,  2, 10,  2,  6,  1],
       [ 7,  1,  1,  1, 15,  5,  4,  1,  3,  3],
       [ 4,  5,  6,  5, 12,  2,  8,  1,  2,  3],
       [ 1,  3,  6,  1, 23,  3,  6,  0,  3,  3]])

In [9]:
from sklearn.cluster import KMeans


# Configure the clustering model entirely from the experiment's settings.
# NOTE(review): run-to-run reproducibility depends on whether
# KMEANS_HYPERPARAMS carries a `random_state` — confirm in `config`.
kmeans = KMeans(**KMEANS_HYPERPARAMS)

In [10]:
# Cluster the policies on their filtered per-group score rows.
kmeans.fit(scores)

In [11]:
# Preview the first two fitted centroids (one value per score column).
kmeans.cluster_centers_[:2]

array([[ 4.        ,  1.        ,  0.        ,  0.        , 13.        ,
         3.        , 69.        ,  1.        ,  4.        ,  2.        ],
       [ 2.35135135,  1.02702703,  1.18918919,  0.45945946,  3.67567568,
         2.24324324,  1.94594595,  0.48648649,  0.37837838,  0.91891892]])

In [12]:
from collections import Counter


# Total of every centroid's coordinates, keyed by cluster index.
sums = dict(enumerate(map(sum, kmeans.cluster_centers_)))

# How many samples were assigned to each cluster, laid out by cluster index.
count = Counter(kmeans.labels_)
widths = np.array([count[label] for label in range(kmeans.n_clusters)])

# Cluster indices ordered by ascending centroid total.
sorted_keys = sorted(sums, key=sums.get)

# Walk that ordering backwards so the cluster with the largest total is first.
descending_keys = sorted_keys[::-1]
sorted_widths = [widths[k] for k in descending_keys]
sorted_clusters = np.array([kmeans.cluster_centers_[k] for k in descending_keys])

In [13]:
# Preview the sizes of the first ten clusters in plotting order.
sorted_widths[:10]

[4, 8, 5, 8, 1, 8, 11, 6, 3, 3]

In [14]:
import plotly.graph_objects as go
from textwrap import wrap

from src.utils import resolve_group_name
from config import COLORS_B, COLORS_D


# Typography shared by the figure body and both axis titles.
base_font = dict(family='Times New Roman', size=30, color='black')

# Grid and tick styling applied identically to the x and y axes.
axis_style = dict(
    zerolinecolor='lightgrey',
    gridcolor='lightgrey',
    tickformat='000',
    showgrid=True,
    tickangle=0,
    ticklen=10)

# Left edge of every cluster's bar: the combined width of the bars before it.
bar_starts = np.cumsum(sorted_widths) - sorted_widths

# After transposing, each row holds one score column's value in every cluster.
group_rows = sorted_clusters.T

fig = go.Figure()

# Traces are added from the last score column down to the first.
for group_idx in range(len(group_rows) - 1, -1, -1):
    label = resolve_group_name(group_idx, reversed(GROUPS))
    fig.add_trace(
        go.Bar(
            x=bar_starts,
            y=group_rows[group_idx],
            # Long group names are wrapped onto several legend lines.
            name='<br>'.join(wrap(label, width=WIDTH)),
            # Bar width equals the number of samples in the cluster.
            width=sorted_widths,
            offset=0))

fig.update_layout(
    legend={'traceorder': 'normal'},
    font=base_font,
    plot_bgcolor='rgba(0, 0, 0, 0)',
    margin=dict(pad=10),
    colorway=COLORS_D,
    barmode='stack',
    width=1600,
    height=950)

fig.update_xaxes(
    title=dict(font=base_font, text='Кластеры политик безопасности'),
    **axis_style)

# The y axis is capped at the largest total score of any single policy.
highest_total = max(sum(policy_scores) for policy_scores in scores)

fig.update_yaxes(
    title=dict(font=base_font, text='Распределение аспектов политик'),
    range=[0, highest_total],
    **axis_style)

fig

In [15]:
# Export the figure as a PNG under the experiment's pictures directory.
# NOTE(review): assumes the PICTURES directory already exists and that a
# static-image export backend for plotly is installed — confirm.
fig.write_image(f'{PICTURES}/cluster_by_1.png')