## Run in Google Colab

1. Upload `config.py`.
2. Create a `data` directory.
3. Upload `TAC2008.json` and `TAC2009.json` into the `data` directory.

In [1]:
# !pip install bert-serving-client
# !pip install -U bert-serving-server[http]

In [2]:
# !wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
# !mkdir models
# !unzip uncased_L-12_H-768_A-12.zip -d models

In [None]:
# !nohup bert-serving-start -model_dir models/uncased_L-12_H-768_A-12/ -num_worker=4 -max_seq_len=160 -max_batch_size=50 -show_tokens_to_client > out.file 2>&1 &

In [3]:
import os
import json
import numpy as np
from operator import itemgetter
from itertools import chain

from bert_serving.client import BertClient

from config import *

In [4]:
def load_data(dataset):
    """Load ``<DATA_DIR>/<dataset>.json`` and return the parsed JSON value.

    Args:
        dataset: Base name of the file, without the ``.json`` extension.

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    path = os.path.join(DATA_DIR, dataset + '.json')
    # JSON text is UTF-8 by specification; pin the encoding so decoding does
    # not depend on the platform's locale default.
    with open(path, mode='r', encoding='utf-8') as fp:
        return json.load(fp)

def store_tac(dataset, tac):
    """Write ``tac`` as indented JSON to ``<DATA_DIR>/<dataset>_encoded.json``.

    Args:
        dataset: Base name used to build the output file name.
        tac: JSON-serializable object to store.
    """
    file_name = dataset + '_encoded.json'
    target = os.path.join(DATA_DIR, file_name)
    with open(target, mode='w') as out:
        json.dump(tac, out, indent=4)

def encode(docs):
    """Embed every sentence of every document with the global BERT client.

    All sentences are sent to ``bc.encode`` in a single call and the result
    is then split back up so the output lines up with ``docs``.

    Args:
        docs: Iterable of documents, each an iterable of sentences.
            Sentences of length zero are dropped before encoding.

    Returns:
        A list with one entry per document; each entry is the slice of
        ``bc.encode``'s result that belongs to that document's remaining
        sentences (an empty list if the document has none).
    """
    # Remove empty sentences (e.g. in summaries).
    docs = [[sent for sent in doc if len(sent)] for doc in docs]

    sentences = list(chain.from_iterable(docs))
    if not sentences:
        # Nothing to embed. BertClient.encode rejects an empty list of texts,
        # so skip the server round-trip and keep one empty entry per document.
        return [[] for _ in docs]

    embs = bc.encode(sentences)

    # Walk the flat result, handing each document its own contiguous slice.
    per_doc = []
    start = 0
    for doc in docs:
        stop = start + len(doc)
        per_doc.append(embs[start:stop])
        start = stop
    return per_doc

def make_annotations(scrs, embs):
    """Pair scores with embeddings, one dict per pair.

    Args:
        scrs: Iterable of scores, stored under the key ``'pyr_score'``.
        embs: Iterable of embeddings, stored under the key ``'embedding'``.

    Returns:
        A list of ``{'pyr_score': ..., 'embedding': ...}`` dicts. Pairing
        stops at the shorter of the two inputs.
    """
    annotations = []
    for score, embedding in zip(scrs, embs):
        annotations.append({'pyr_score': score, 'embedding': embedding})
    return annotations

def make_tac(data):
    """Build the encoded counterpart of a loaded dataset.

    For each topic, the ``'documents'`` entry and the ``'text'`` of every
    annotation are passed through ``encode``; each annotation's
    ``'pyr_score'`` is kept alongside its embedding.

    Args:
        data: Mapping of topic id to a dict with ``'documents'`` and
            ``'annotations'`` keys; every annotation has ``'text'`` and
            ``'pyr_score'``.

    Returns:
        A dict mapping each topic id to
        ``{'documents': <encoded documents>, 'annotations': [...]}``.
    """
    encoded_topics = {}

    for topic_id, topic in data.items():
        # Progress output: one line per topic.
        print('  ',topic_id)

        documents, annotations = topic['documents'], topic['annotations']

        summaries = [annotation['text'] for annotation in annotations]
        pyr_scores = [annotation['pyr_score'] for annotation in annotations]

        document_embeddings = encode(documents)
        summary_embeddings = encode(summaries)

        encoded_topics[topic_id] = {
            'documents': document_embeddings,
            'annotations': make_annotations(pyr_scores, summary_embeddings),
        }

    return encoded_topics

In [5]:
# Global BERT client used by encode(). No host/port is given, so the client's
# defaults apply. output_fmt='list' is presumably chosen so embeddings come
# back as plain lists that json.dump can write -- confirm against the
# bert-serving-client docs.
bc = BertClient(output_fmt='list')

In [6]:
# Encode every dataset named in DATASETS (not defined in this file;
# presumably provided by `from config import *`) and store each result
# as <dataset>_encoded.json in DATA_DIR.
for dataset in DATASETS:
    print(dataset)
    data = load_data(dataset)  # parsed contents of <dataset>.json
    tac = make_tac(data)  # per-topic embeddings plus scores
    store_tac(dataset, tac)

In [None]:
# !zip data/TAC2008_encoded.zip data/TAC2008_encoded.json
# !zip data/TAC2009_encoded.zip data/TAC2009_encoded.json