# Similarity Distributions

In [3]:
# Colour palettes used by the plots below: five hex shades per hue,
# listed from the lightest (index 0) to the darkest (index 4).
CLR = dict(
    blue=['#e0f3ff', '#aadeff', '#2bb1ff', '#15587f', '#0b2c40'],
    gold=['#fff3dc', '#ffebc7', '#ffddab', '#b59d79', '#5C4938'],
    red=['#ffd8e8', '#ff9db6', '#ff3e72', '#6B404C', '#521424'],
)

from ungol.wmd import wmd
from ungol.retrieval import clients

import numpy as np

import pickle
import pathlib

TODO: loading config from ...ca/ungol-models/conf/logging.conf


In [4]:
# Location of the ungol-es artefacts, relative to this notebook.
basedir = pathlib.Path('../../ungol-es/opt')

# DocReferences.from_files is handed plain string paths.
f_codemap = str(basedir.joinpath('src/codemap.bin'))
f_vocab = str(basedir.joinpath('src/fasttext.de.vocab.pickle'))

ref = wmd.DocReferences.from_files(f_codemap, f_vocab)

In [5]:
# Folder holding the frozen experiment results, relative to this notebook.
folder = pathlib.Path('../opt/frozen')

# Sub-folder and file names of the experiment to analyse.
f_ungol = 'tmp'
f_ungol_exp = 'ungol_reranking_sp_rhwmd.pickle'
f_ungol_report = 'ungol_reranking_sp_rhwmd.sum_wmd.pickle'

# NOTE: unpickling runs arbitrary code embedded in the file - only load
# reports that were produced locally / come from a trusted source.
with open(folder / f_ungol / f_ungol_report, mode='rb') as fd:
    report = pickle.load(fd)

## Raw Similarities

In [6]:
# Flatten the 'sim' column of every scored entry in the report into two
# long vectors; the first is reported as "q -> d", the second as "d -> q".
sims1, sims2 = [], []
for per_query in report.values():
    for scored in per_query.values():
        columns = dict(scored.local_columns)
        first, second = columns['sim']
        sims1.extend(first)
        sims2.extend(second)

sims = (np.array(sims1), np.array(sims2))

# Number of collected values and their mean, per direction.
for label, values in zip(('q -> d', 'd -> q'), sims):
    print(label, len(values), values.mean())

q -> d 224500 0.8550819181514476
d -> q 3762643 0.7072605539925658


## Raw Weighted Similarities

In [7]:
# Flatten the 'weight' column of every scored entry in the report into two
# long vectors; the first is reported as "q -> d", the second as "d -> q".
w1, w2 = [], []
for per_query in report.values():
    for scored in per_query.values():
        columns = dict(scored.local_columns)
        first, second = columns['weight']
        w1.extend(first)
        w2.extend(second)

w = (np.array(w1), np.array(w2))

# Number of collected values and their mean, per direction.
for label, values in zip(('q -> d', 'd -> q'), w):
    print(label, len(values), values.mean())

q -> d 224500 0.04998753531776294
d -> q 3762643 0.0025698262226867283


## Plotting Similarity Distribution

In [None]:
import matplotlib.pyplot as plt

In [None]:
def _rolling_mean(a: np.ndarray, window: int):
    """Smooth a 1-D array with a moving average of width ``window``.

    The input is padded by repeating its first value ``window // 2`` times
    in front and its last value ``window - 1 - window // 2`` times at the
    end, so the result always has exactly as many entries as ``a``.
    (For even windows this is the same padding as before; odd windows used
    to come back one element short and ``window=1`` raised.)

    Parameters
    ----------
    a : np.ndarray
        One-dimensional data to smooth.
    window : int
        Number of neighbouring samples to average over; must be >= 1.

    Returns
    -------
    np.ndarray
        Float array with the same length as ``a``.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    a = np.asarray(a)
    assert len(a.shape) == 1

    if window < 1:
        raise ValueError('window must be >= 1, got {}'.format(window))

    # Nothing to smooth (and no edge value to pad with).
    if a.size == 0:
        return a.astype(float)

    # Split the window - 1 padding samples between both ends.
    n_prev = window // 2
    n_post = window - 1 - n_prev

    a_prev = np.repeat(a[0], n_prev)
    a_post = np.repeat(a[-1], n_post)

    x = np.concatenate((a_prev, a, a_post))
    v = np.ones((window, )) / window

    # 'valid' keeps only full overlaps: len(x) - window + 1 == len(a).
    return np.convolve(x, v, mode='valid')

def _fig_before(title: str):
    """Create a new single-axes figure prepared for a histogram plot.

    The axes get ``title`` as their title, 'Similarity' as x label and
    'Count' as y label.

    Returns the ``(fig, ax)`` pair.
    """
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set(title=title, xlabel='Similarity', ylabel='Count')
    return fig, ax

def _fig_after(fig, ax, patches = None, fname = None, display = True, out_dir = None):
    """Finish a figure: add the legend, optionally show and save it, close it.

    Parameters
    ----------
    fig, ax
        Figure and axes, e.g. as returned by ``_fig_before``.
    patches : optional
        Explicit legend handles. When omitted, a legend is only drawn if the
        axes contain labelled artists (avoids matplotlib's "no handles"
        warning and an empty legend).
    fname : str, optional
        Base file name without extension. When given, the figure is written
        as both ``<fname>.png`` and ``<fname>.svg``.
    display : bool, optional
        Whether to show the figure. Defaults to True; this used to be read
        from an undefined global.
    out_dir : path-like, optional
        Directory the files are written to; defaults to the current working
        directory. This used to be read from an undefined global.
    """
    if patches is None:
        # Equivalent to legend(handles=None), but skipped if nothing is labelled.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend()
    else:
        ax.legend(handles=patches)

    if display:
        # plt.show() takes no figure argument (its only parameter is `block`).
        plt.show()

    if fname:
        target_dir = pathlib.Path('.') if out_dir is None else pathlib.Path(out_dir)
        for suffix in ('.png', '.svg'):
            out_file = str(target_dir / fname) + suffix
            print('saving to', out_file)
            fig.savefig(out_file)

    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def _plot(ax, x, data, lim):
    """Draw ``data`` over ``x`` as bars plus a smoothed trend on ``ax``.

    ax   -- axes to draw on
    x    -- x positions, one per entry in ``data``
    data -- bar heights (histogram counts in this notebook)
    lim  -- right x-axis limit; also scales the bar width

    The trend is a rolling mean over 10 samples, drawn as a line with a
    translucent fill underneath.
    """
    shade = CLR['blue'][3]

    ax.set_xlim(right=lim)

    trend = _rolling_mean(data, 10)
    ax.fill_between(x, 0, trend, color=shade, alpha=0.2)
    ax.plot(x, trend, color=shade, alpha=0.5)

    # Bar width grows with the number of bars and the visible x range.
    bar_width = 0.00025*len(data)*lim
    ax.bar(x, data, color=shade, alpha=0.7, width=bar_width)
    
# Histogram (50 bins) of the raw similarities, one figure per direction.
# The helpers are named _fig_before / _fig_after; the un-prefixed names
# used here before do not exist and raised a NameError.
for i, name in ((0, 'Similarity Query > Document'), (1, 'Similarity Document > Query')):
    data, bin_edges = np.histogram(sims[i], bins=50)
    fig, ax = _fig_before(name)
    # bars are positioned at the right edge of each bin
    _plot(ax, bin_edges[1:], data, lim=1.1)
    _fig_after(fig, ax)


# Histogram (50 bins) of the collected weights, one figure per direction.
weighted_titles = ('IDF Weighted Query > Document', 'IDF Weighted Document > Query')
for i, name in enumerate(weighted_titles):
    counts, edges = np.histogram(w[i], bins=50)
    fig, ax = _fig_before(name)
    # bars are positioned at the right edge of each bin
    _plot(ax, edges[1:], counts, lim=0.2)
    _fig_after(fig, ax)

## Unique Tokens per Document

In [6]:
# Length of the second 'token' entry of every scored item in the report
# (presumably the document side - verify against the report producer).
tokens = []
for per_query in report.values():
    for scored in per_query.values():
        columns = dict(scored.local_columns)
        _, doc_tokens = columns['token']
        tokens.append(len(doc_tokens))

a_tokens = np.array(tokens)
print('mean', a_tokens.mean(), 'std', a_tokens.std())

mean 268.7602142857143 std 196.32228423825327
