In [None]:
import os
import json
import numpy as np
from scipy.stats import kendalltau
from scipy.spatial import ConvexHull
from scipy.spatial.qhull import QhullError
from scipy.spatial.distance import cdist, pdist
from operator import itemgetter
from itertools import chain
from lexrank import degree_centrality_scores

from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

import ray

In [None]:
# Name of the dataset under evaluation; also used as the plot title.
DATASET = 'TAC2008'

# Topic identifiers. Each one names a `<topic>_encoded.json` file in DATA_DIR.
# The order of this list fixes the order of the per-topic scores and of the
# bars in the plot, so keep it stable.
TOPICS = [
    'D0841', 'D0804', 'D0802', 'D0809', 'D0819', 'D0825', 'D0828', 'D0826',
    'D0843', 'D0829', 'D0813', 'D0807', 'D0812', 'D0820', 'D0835', 'D0823',
    'D0847', 'D0848', 'D0810', 'D0822', 'D0845', 'D0844', 'D0839', 'D0814',
    'D0824', 'D0821', 'D0827', 'D0846', 'D0818', 'D0834', 'D0805', 'D0817',
    'D0831', 'D0815', 'D0836', 'D0806', 'D0808', 'D0837', 'D0803', 'D0830',
    'D0838', 'D0840', 'D0842', 'D0832', 'D0816', 'D0801', 'D0833', 'D0811',
]

# Directory holding the encoded topic files of DATASET.
DATA_DIR = f'/scratch/korunosk/data/{DATASET}'

In [None]:
# Start the local Ray runtime that executes the `@ray.remote` tasks below.
# `ignore_reinit_error=True` keeps this cell re-runnable: executing it again in
# a kernel where Ray is already initialised no longer raises.
ray.init(num_cpus=40, ignore_reinit_error=True)

In [None]:
@ray.remote
def load_and_extract(t):
    '''Loads one encoded topic and flattens it into arrays.

    Reads `<t>_encoded.json` from DATA_DIR and returns the tuple
    `(documents, summaries, indices, pyr_scores)` where:

    - documents:  the items of every entry of `topic['documents']`, chained
                  into a single array,
    - summaries:  the items of every annotation's `text`, chained into a
                  single array,
    - indices:    one `(start, stop)` pair per annotation, locating that
                  annotation's items inside `summaries`,
    - pyr_scores: the `pyr_score` of every annotation, in annotation order.
    '''
    topic_path = os.path.join(DATA_DIR, t + '_encoded.json')
    with open(topic_path, mode='r') as fp:
        topic = json.load(fp)

    documents = np.array([item for document in topic['documents'] for item in document])
    annotations = topic['annotations']

    texts = [annotation['text'] for annotation in annotations]
    summaries = np.array([item for text in texts for item in text])

    # Running total of the text lengths; consecutive totals delimit one text.
    offsets = np.cumsum([0] + [len(text) for text in texts])
    indices = np.array(list(zip(offsets[:-1], offsets[1:])))

    pyr_scores = np.array([annotation['pyr_score'] for annotation in annotations])

    return documents, summaries, indices, pyr_scores


@ray.remote
def experiment_average_pairwise_distance(data):
    '''Correlates the average pairwise distance of each summary with its pyramid score.

    Args:
        data: tuple `(document_embs, summary_embs, indices, pyr_scores)` as
            returned by `load_and_extract`. `indices` holds one `(start, stop)`
            pair per summary, slicing that summary's rows out of `summary_embs`.

    Returns:
        The Kendall tau correlation between `pyr_scores` and the per-summary
        average pairwise distance.
    '''
    _, summary_embs, indices, pyr_scores = data

    def average_pairwise_distance(embs: np.ndarray) -> float:
        '''Calculates the average Euclidean distance over all distinct pairs of embeddings'''
        # Fewer than two embeddings form no pair; report zero distance instead
        # of taking the mean of an empty array (NaN).
        if embs.shape[0] < 2:
            return 0.0
        # pdist yields each unordered pair exactly once and skips the zero
        # self-distances. Averaging the full cdist matrix would scale the result
        # by (n - 1) / n, which depends on the number of embeddings per summary.
        return float(np.mean(pdist(embs, metric='euclidean')))

    distances = np.array([average_pairwise_distance(summary_embs[start:stop])
                          for start, stop in indices])

    return kendalltau(pyr_scores, distances)[0]

@ray.remote
def experiment_semantic_volume(data):
    '''Correlates the semantic volume of each summary with its pyramid score.

    Document and summary embeddings are projected together onto their first two
    principal components. The semantic volume of a summary is the `volume` of
    the convex hull of its projected points (for 2-D input SciPy reports the
    enclosed area there).

    Args:
        data: tuple `(document_embs, summary_embs, indices, pyr_scores)` as
            returned by `load_and_extract`. `indices` holds one `(start, stop)`
            pair per summary, slicing that summary's rows out of `summary_embs`.

    Returns:
        The Kendall tau correlation between `pyr_scores` and the per-summary
        semantic volume.
    '''
    document_embs, summary_embs, indices, pyr_scores = data

    # Fit one PCA on documents and summaries together so that every summary of
    # the topic is projected into the same 2-D space.
    embs = np.concatenate((document_embs, summary_embs))
    num_document_embs = document_embs.shape[0]
    pca = PCA(n_components=2, random_state=42)
    summary_pts = pca.fit_transform(embs)[num_document_embs:]

    def semantic_volume(pts: np.ndarray) -> float:
        '''Calculates the semantic volume of the projected summary points'''
        # A hull in d dimensions needs at least d + 1 points. With fewer
        # (including an empty summary) the volume is zero, so skip Qhull.
        if pts.shape[0] <= pts.shape[1]:
            return 0.0
        try:
            return ConvexHull(pts).volume
        except QhullError:
            # Degenerate input (e.g. all points collinear) spans no volume.
            return 0.0

    volumes = np.array([semantic_volume(summary_pts[start:stop])
                        for start, stop in indices])

    return kendalltau(pyr_scores, volumes)[0]

@ray.remote
def experiment_semantic_spread(data):
    '''Correlates the semantic spread of each summary with its pyramid score.

    `data` is the tuple `(document_embs, summary_embs, indices, pyr_scores)`;
    every row of `indices` is a `(start, stop)` pair slicing one summary out of
    `summary_embs`. Returns the Kendall tau correlation between `pyr_scores`
    and the per-summary semantic spread.
    '''
    document_embs, summary_embs, indices, pyr_scores = data

    def semantic_spread(embs: np.ndarray) -> float:
        '''Determinant of the Gram matrix `embs @ embs.T` of the given embeddings'''
        gram = embs @ embs.T
        return np.linalg.det(gram)

    spreads = np.array([semantic_spread(summary_embs[start:stop])
                        for start, stop in indices])

    return kendalltau(pyr_scores, spreads)[0]


def execute_experiment(experiment):
    '''Runs `experiment` on every topic in TOPICS and gathers the results.

    `experiment` is a Ray remote function taking the output of
    `load_and_extract`. One load task and one experiment task are submitted per
    topic; the load task's object ref is passed straight to the experiment
    task. Blocks until all tasks finish and returns their results as an array
    ordered like TOPICS.
    '''
    data_refs = list(map(load_and_extract.remote, TOPICS))
    score_refs = list(map(experiment.remote, data_refs))

    return np.array(ray.get(score_refs))

# One Kendall tau per topic, in TOPICS order, for the semantic spread metric.
scores = execute_experiment(experiment_semantic_spread)

# Average correlation across all topics.
print(scores.mean())

In [None]:
# Bar chart of the per-topic Kendall tau, one bar per topic in TOPICS order.
x = np.arange(len(TOPICS))

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 5))
ax.bar(x, scores, width=0.2, label='Semantic Spread')

# Label every bar with its topic id; rotated so the ids do not overlap.
ax.set_xticks(x)
ax.set_xticklabels(TOPICS, rotation=90)

ax.set(title=DATASET, xlabel='topic', ylabel='kendalltau')
ax.legend()
plt.show()