In [91]:
import artm

import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from tqdm import tqdm

sns.set(style="darkgrid", palette="Set2")

from topicnet.dataset_manager import api

# topicnet imports
from topicnet.cooking_machine.cubes import (
    CubeCreator,
    GreedyStrategy,
    PerplexityStrategy,
    RegularizationControllerCube,
    RegularizersModifierCube,
)
from topicnet.cooking_machine.dataset import Dataset
from topicnet.cooking_machine.experiment import Experiment
from topicnet.cooking_machine.models import BaseScore
from topicnet.cooking_machine.models.topic_model import TopicModel
from topicnet.cooking_machine.model_constructor import add_standard_scores
from topicnet.cooking_machine.model_constructor import init_simple_default_model
from topicnet.cooking_machine.pretty_output import make_notebook_pretty
from topicnet.viewers.top_documents_viewer import TopDocumentsViewer
from topicnet.viewers.top_tokens_viewer import TopTokensViewer

In [20]:
EXPERIMENT_PATH = './data/exp/'

DATASET_PATH = './data/20_News_dataset/train_preprocessed.csv'

### Обучаем тематическую модель

In [21]:
if not os.path.exists(EXPERIMENT_PATH):
    os.mkdir(EXPERIMENT_PATH)
else:
    ! rm -rf $EXPERIMENT_PATH
    os.mkdir(EXPERIMENT_PATH)

In [22]:
%%time

from topicnet.cooking_machine.recipes import BaselineRecipe

training_pipeline = BaselineRecipe()

training_pipeline.format_recipe(
    dataset_path=DATASET_PATH,
    topic_number=20,
    background_topic_number=1,
)
experiment, dataset = training_pipeline.build_experiment_environment(save_path=EXPERIMENT_PATH,)


CPU times: user 17.7 s, sys: 485 ms, total: 18.2 s
Wall time: 13.9 s


In [24]:
%%time

models = experiment.run(dataset)

CPU times: user 56min 25s, sys: 9min 24s, total: 1h 5min 49s
Wall time: 31min 8s


In [27]:
final_model = list(models)[0]

### Реализум и вычисляем когерентность Ньюмана с PPMI

Метрики реализованные ниже не до конца отвечабт идеологии BigARTM.

In [142]:
from topicnet.cooking_machine.models.base_score import BaseScore
from math import log

class NewmanCoherence(BaseScore):
    def __init__(self, dataset, windowSize = 10):
        super().__init__()
        self.windowSize = windowSize
        
        def lemmatizedField2List(s):
            return s[2:-2].split("', '")

        self._documents  = [
            lemmatizedField2List(doc) for doc in dataset.get_dataset()['lemmatized']
        ]
        self.n = windowSize * sum([len(d) for d in self._documents])

    def _get_topics(self, model):
        return list(model.get_phi().columns)

    def _coh(self, u, v):
        def getWordPositions(u, text):
            return [i for i, w in enumerate(text) if u == w]

        nUV, nU, nV = 0, 0, 0
        for d in self._documents:
            uPositions = getWordPositions(u, d)
            vPositions = getWordPositions(v, d)

            for i in uPositions:
                nU += min(len(d), i + self.windowSize) - max(0, i - self.windowSize)

            for i in vPositions:
                nV += min(len(d), i + self.windowSize) - max(0, i - self.windowSize)
            
            for i in uPositions:
                for j in vPositions:
                    nUV += abs(i - j) <= self.windowSize

        return 0 if self.n * nUV <= (nU * nV) else log(self.n * nUV / (nU * nV)) #Calculating PPMI

    def _calculate_topic_coherence(self, topic, phi):
        def getTopWords(distribution, k=10):
            order = np.argsort(distribution)[::-1]
            return distribution[order][:k]
        
        distribution = phi[topic]['@lemmatized']
        topWords = getTopWords(distribution)
        # print(topWords)

        cohSum = 0
        for u, pu in topWords.items():
            for v, pv in topWords.items():
                cohSum += self._coh(u, v)
        return cohSum
                

    def call(self, model):
        topics = self._get_topics(model)
        phi = model.get_phi()
        for t in topics:
            print(t, self._calculate_topic_coherence(t, phi))


In [122]:
%%time
score = NewmanCoherence(dataset)

CPU times: user 230 ms, sys: 27.1 ms, total: 257 ms
Wall time: 148 ms


In [123]:
%%time
score.call(final_model)

topic_0 60.044523191944485
topic_1 94.73637995520384
topic_2 122.65889943512931
topic_3 29.067204408256398
topic_4 127.21952188160549
topic_5 213.71734243693126
topic_6 196.7035738379648
topic_7 94.72148751563833
topic_8 64.32479947188925
topic_9 26.843900707918813
topic_10 104.70641318908643
topic_11 72.28746655695996
topic_12 93.1461288502546
topic_13 25.10779868294018
topic_14 72.14282077516681
topic_15 53.36126222314763
topic_16 154.88312767987208
topic_17 150.53446814216437
topic_18 90.34491375863998
topic_19 59.14516968328842
bcg_20 34.15284011652685
CPU times: user 8min 18s, sys: 2min 6s, total: 10min 25s
Wall time: 5min 49s


### Реализум и вычисляем интерпретируемость

In [183]:
from topicnet.cooking_machine.models.base_score import BaseScore
from math import log

class ChainsInterpretability(BaseScore):
    def __init__(self, chains, windowSize = 10):
        super().__init__()
        self.chains = chains

    def _get_topics(self, model):
        return list(model.get_phi().columns)
    
    def _calculate_interpretability(self, topics, phi):
        result = pd.Series(np.zeros(len(topics)), index=topics)
        for chain in self.chains:
            probs = phi.loc['@lemmatized'].loc[chain].to_numpy()
            likelyhood = np.log(probs).sum(axis=0)
            optimalTopic = topics[np.argmax(likelyhood)]
            result[optimalTopic] += np.max(likelyhood)
        return result

    def call(self, model):
        topics = self._get_topics(model)
        phi = model.get_phi()
        result = self._calculate_interpretability(topics, phi)
        for t, m in result.items():
            print(t, m)

In [194]:
%%time
score = ChainsInterpretability([
    ['sport', 'car'],
    ['front', 'bumper'],
    ['floppy', 'disk'],
    ['fix', 'code']
])

CPU times: user 9 µs, sys: 17 µs, total: 26 µs
Wall time: 39.8 µs


In [195]:
%%time
score.call(final_model)

topic_0 -11.720755577087402
topic_1 0.0
topic_2 0.0
topic_3 0.0
topic_4 0.0
topic_5 0.0
topic_6 -7.897364616394043
topic_7 0.0
topic_8 0.0
topic_9 -14.651784896850586
topic_10 0.0
topic_11 -12.029800415039062
topic_12 0.0
topic_13 0.0
topic_14 0.0
topic_15 0.0
topic_16 0.0
topic_17 0.0
topic_18 0.0
topic_19 0.0
bcg_20 0.0
CPU times: user 11.8 s, sys: 2.64 s, total: 14.4 s
Wall time: 8.32 s


### Expected results

Здесь мы считаем наши метрики и строим корреляцию между ними и интерпретируемостью

In [141]:
from IPython.display import Image 
Image(url="../expected_results.png", width=500, height=500)