In [1]:
import json
from impresso_commons.path.path_s3 import IMPRESSO_STORAGEOPT
from impresso_images.data_utils import fixed_s3fs_glob
from dask import bag as db
# from tabulate import tabulate
#import matplotlib
#from matplotlib import pyplot as plt

In [2]:
import os
from dask_k8 import DaskCluster
from dask import bag as db
from impresso_commons.path.path_s3 import read_s3_issues
from impresso_commons.text.rebuilder import rebuild_issues, compress, upload, cleanup

In [3]:
import dask

In [4]:
dask.__version__

'1.1.5'

## Set up dask kubernetes cluster

In [49]:
# Worker pod spec template (YAML text) later handed to
# `DaskCluster(worker_pod_spec=...)`.
# The two `{}` placeholders are filled positionally by `.format()` with the
# SE_ACCESS_KEY / SE_SECRET_KEY environment variables, in that order; a missing
# variable raises KeyError right here. Because `.format()` is used, any other
# literal brace added to the template has to be doubled (`{{` / `}}`).
# NOTE(review): the credentials end up as plain-text env values in the pod
# spec — consider a Kubernetes secret reference instead; confirm cluster policy.
kube_cfg = """
  containers:
    - image: daskdev/dask:1.1.5
      args: [dask-worker, $(DASK_SCHEDULER_ADDRESS), --nthreads, '1', --no-bokeh, --memory-limit, 20GB, --death-timeout, '120']
      imagePullPolicy: Always
      name: dask-worker
      env:
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: EXTRA_PIP_PACKAGES
          value: https://github.com/impresso/impresso-pycommons/archive/master.zip
        - name: EXTRA_CONDA_PACKAGES
          value:
        - name: SE_ACCESS_KEY
          value: {}
        - name: SE_SECRET_KEY
          value: {}
      resources:
        requests:
          cpu: 1
          memory: "20G"
        limits:
          cpu: 1
          memory: "20G"
      volumeMounts:
        - mountPath: /scratch
          name: scratch
          subPath: romanell
  volumes:
    - name: scratch
      persistentVolumeClaim:
        claimName: dhlab-scratch
""".format(
    os.environ["SE_ACCESS_KEY"],
    os.environ["SE_SECRET_KEY"]
)

In [53]:
cluster = DaskCluster(namespace="dhlab", cluster_id="matteo-entity-linking", worker_pod_spec=kube_cfg)

In [54]:
cluster.create()

Scheduler: tcp://10.90.47.20:30758
Dashboard: http://10.90.47.20:7596


In [51]:
cluster.close()

In [55]:
cluster.scale(100, blocking=False)

In [56]:
dask_client = cluster.make_dask_client()

In [69]:
dask_client

0,1
Client  Scheduler: tcp://10.90.47.20:30758  Dashboard: http://10.90.47.20:8787/status,Cluster  Workers: 50  Cores: 50  Memory: 1000.00 GB


## Process named entity mentions

In [70]:
# Mention files of the public release. Earlier location, kept for reference:
#   s3://processed-canonical-data/ne-mentions/public_release/*.bz2
files = fixed_s3fs_glob("s3://processed-canonical-data/mentions_public/*.bz2")

In [71]:
len(files)

2197

In [72]:
# One JSON document per text line; the parsed bag is persisted on the cluster
# so that the following cells can reuse it.
mentions_bag = (
    db.read_text(files, storage_options=IMPRESSO_STORAGEOPT)
    .map(json.loads)
    .persist()
)

In [27]:
dask_client.cancel(mentions_bag)

In [74]:
mentions_bag.take(1)

({'id': 'LSE-1897-09-25-a-i0032',
  's3v': None,
  'ts': '2019-06-17T11:00:30Z',
  'sys_id': 'nerb-0001',
  'nes': [{'type': 'Person',
    'surface': 'RUE DU GRENIER',
    'name': 'RUE DU GRENIER',
    'firstname': 'RUE',
    'surname': 'DU GRENIER',
    'lOffset': 146,
    'rOffset': 160,
    'mrule': 'person1_basic_6_passed',
    'confidence': 'high',
    'id': 'LSE-1897-09-25-a-i0032:146:160:person:nerb-0001'}]},)

In [76]:
mentions_bag.count().compute()

30233580

### Try a join between dataframes

Merging two dataframes is more performant than joining two bags.

In [77]:
files = fixed_s3fs_glob(f"s3://processed-canonical-data/mentions_public/*.bz2")

In [78]:
# Same mention files as above, but loaded as a dataframe indexed by the
# content item `id` so that it can be merged on the index.
mentions_df = (
    db.read_text(files, storage_options=IMPRESSO_STORAGEOPT)
    .map(json.loads)
    .to_dataframe()
    .set_index('id')
)

In [79]:
mentions_df.index

Dask Index Structure:
npartitions=2197
BDC-1839-01-20-a-i0001          object
CDV-1843-03-04-a-i0001             ...
                                 ...  
waeschfra-1884-01-06-a-i0016       ...
waeschfra-1884-07-05-a-i0017       ...
Name: id, dtype: object
Dask Name: sort_index, 123032 tasks

In [80]:
# Rebuilt content items from s3://evenized-light-canonical-rebuilt-pubrelease,
# parsed from JSON lines into a dataframe indexed by `id`.
rebuilt_files = fixed_s3fs_glob("s3://evenized-light-canonical-rebuilt-pubrelease/*.bz2")
rebuilt_df = (
    db.read_text(rebuilt_files, storage_options=IMPRESSO_STORAGEOPT)
    .map(json.loads)
    .to_dataframe()
    .set_index('id')
)

In [81]:
rebuilt_df.index

Dask Index Structure:
npartitions=2197
BDC-1839-01-20-a-i0001          object
CDV-1843-02-22-a-i0019             ...
                                 ...  
waeschfra-1884-02-24-a-i0021       ...
waeschfra-1884-07-05-a-i0021       ...
Name: id, dtype: object
Dask Name: sort_index, 123032 tasks

In [82]:
# Join mentions and rebuilt text on their index (both frames were indexed with
# `set_index('id')`). No `how=` is given, so the merge default applies —
# presumably an inner join; verify against the dask version in use.
merged_dfs = mentions_df.merge(rebuilt_df, left_index=True, right_index=True)

In [22]:
merged_dfs.head()

KeyboardInterrupt: 

In [83]:
# Keep only the columns `prepare_input` needs. `index=True` makes each bag
# element carry the index value as well, which is what lets
# `starmap(prepare_input)` fill its four positional parameters — presumably in
# the order (id, ft, nes, sys_id); verify against the dask `to_bag` docs.
merged_bag = merged_dfs[['ft', 'nes', 'sys_id']].to_bag(index=True)

In [84]:
def prepare_input(ci_id, fulltext, mentions, system_id):
    """Build the AIDA input of one content item by bracketing its mentions.

    Every mention span ``fulltext[lOffset:rOffset]`` is wrapped in ``[[...]]``
    and the text before, between and *after* the mentions is copied verbatim,
    so the bracketed string always covers the whole full text.

    Parameters
    ----------
    ci_id : content item identifier, copied to the ``id`` field.
    fulltext : the item's text. Anything that is not a ``str`` (e.g. the NaN
        float produced for missing values) yields an empty ``input``.
    mentions : iterable of mention dicts with ``lOffset``/``rOffset`` keys,
        processed in the order given. ``None`` or a NaN float is treated as
        "no mentions"; entries that are not dicts are skipped.
    system_id : identifier of the system that produced the mentions, copied
        to the ``sys_id`` field.

    Returns
    -------
    dict with keys ``id``, ``ft``, ``input``, ``nes`` and ``sys_id``; ``ft``
    and ``nes`` are the arguments exactly as received.
    """
    # A missing `nes` cell may arrive as None/NaN, which cannot be iterated.
    if mentions is None or isinstance(mentions, float):
        mention_list = []
    else:
        mention_list = mentions

    pieces = []
    if isinstance(fulltext, str):
        cursor = 0
        for mention in mention_list:
            if not isinstance(mention, dict):
                continue
            left, right = mention['lOffset'], mention['rOffset']
            pieces.append(fulltext[cursor:left])
            pieces.append(f"[[{fulltext[left:right]}]]")
            cursor = right
        # Keep the text that follows the last mention; dropping it would
        # truncate the context handed to the disambiguation step.
        pieces.append(fulltext[cursor:])

    return {
        "id": ci_id,
        "ft": fulltext,
        "input": "".join(pieces),
        "nes": mentions,
        "sys_id": system_id
    }
        

In [85]:
# Bracket the mentions of every merged row, serialise each result to JSON and
# write the AIDA input shards to S3. Note that `prepared_input` is bound to
# whatever `to_textfiles` returns, not to the bag itself.
prepared_input = (
    merged_bag
    .starmap(prepare_input)
    .map(json.dumps)
    .to_textfiles(
        "s3://processed-canonical-data/entities/aida-input/*.bz2",
        storage_options=IMPRESSO_STORAGEOPT,
    )
)

In [30]:
len(prepared_input)

3198

In [35]:
cluster.close()

tornado.application - ERROR - Exception in callback <bound method Client._heartbeat of <Client: scheduler='tcp://10.233.78.196:8786' processes=0 cores=0>>
Traceback (most recent call last):
  File "/home/romanell/.local/share/virtualenvs/impresso-ne-linking-3S6qaXXb/lib/python3.6/site-packages/tornado/ioloop.py", line 907, in _run
    return self.callback()
  File "/home/romanell/.local/share/virtualenvs/impresso-ne-linking-3S6qaXXb/lib/python3.6/site-packages/distributed/client.py", line 1055, in _heartbeat
    self.scheduler_comm.send({"op": "heartbeat-client"})
  File "/home/romanell/.local/share/virtualenvs/impresso-ne-linking-3S6qaXXb/lib/python3.6/site-packages/distributed/batched.py", line 119, in send
    raise CommClosedError
distributed.comm.core.CommClosedError
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.client - 

## Try disambiguating

**NB**: `findExtractedMentions()` does not seem to detect a mention when the character immediately preceding it is an apostrophe (e.g. `l'[[Egypte]]`), so such mentions are missing from the disambiguation output.

In [5]:
from py4j.java_gateway import JavaGateway
gateway = JavaGateway()

aida_server = gateway.entry_point.getAida()

In [214]:
x = prepared_input.take(21, npartitions=100)[-1]

In [215]:
test_sentence = x['input']

In [216]:
x['input']

"RÉPUBLIQUE [[SUISSE]]. De St. Gall, le 28 juillet. L'on recommence de nouveau, d'après le système ifincamération trichienne-, à procéder contre toutes les propriétés helvétiques situées de l'autre côté du Rhin, & déjà les possessions des communes du Rheinthal, qui sont d'une valeur très-considérable, viennent d'être séquestrées. De Berne, le 29 juillet. Parmi le grand nombre de voyageurs de marque qui ont passé ici, l'on distingue le général Sebastiani, connu par son rapport sur l'[[Egypte]]. Il n'a fait qu'un très-petit séjour dans notre ville, qu'il a en majeure partie . passé chez son excellence le ministre français. Il y a eu en outre plusieurs suédois, danois & allemands, qui tous sont allés voir l'institut de Pestalozzi, à Bouchsée. — Dans la séance de la diète du 14, à l'occasion de la discussion sur les troubles du canton de [[Zurich]], la proposition de faire une loi organique en explication du 20. art. de fafte de médiation, a été mise à l'ordre du jour. Cet article est de l

In [217]:
mentions = gateway.entry_point.findExtractedMentions(test_sentence)
annotations = aida_server.disambiguate(test_sentence, mentions, "fullSettings")

In [218]:
for mention in annotations.keySet():
    wikipedia_entity_id = f"http://en.wikipedia.org/wiki/{annotations.get(mention).getName()}"
    print(mention.getMention(), wikipedia_entity_id)

Zurich http://en.wikipedia.org/wiki/Zürich
SUISSE http://en.wikipedia.org/wiki/Suisse,_Moselle
Fribourg http://en.wikipedia.org/wiki/Fribourg


In [166]:
prepared_input.count().compute()

4990

In [220]:
cluster.close()

## Disambiguation test

In [5]:
from dask.distributed import Client, progress
from py4j.java_gateway import JavaGateway
from time import strftime

In [6]:
dask_client = Client()

In [7]:
dask_client

0,1
Client  Scheduler: tcp://127.0.0.1:36657,Cluster  Workers: 4  Cores: 8  Memory: 33.65 GB


In [17]:
gateway = JavaGateway()

In [46]:
def _find_matching_mention(named_entities, char_offset, char_length):
    """Return the input mention covering exactly the given span, or None."""
    if not isinstance(named_entities, (list, tuple)):
        return None
    span_end = char_offset + char_length
    for ne in named_entities:
        if (
            isinstance(ne, dict)
            and ne.get('lOffset') == char_offset
            and ne.get('rOffset') == span_end
        ):
            return ne
    return None


def aida_disambiguate_documents(documents, gateway=None):
    """Link the bracketed mentions of a batch of documents with AIDA via py4j.

    Parameters
    ----------
    documents : iterable of dicts with at least the keys ``id``, ``input``
        (text with ``[[...]]``-bracketed mentions) and ``nes`` (the input
        mentions with ``id``, ``surface``, ``lOffset`` and ``rOffset``).
    gateway : optional, already connected py4j ``JavaGateway``. When omitted a
        gateway is opened for this call and closed again before returning; a
        gateway passed in by the caller is left open.

    Returns
    -------
    list of dicts, one per input document, with the keys ``id``, ``sys_id``
    (always ``"aidalight-fullSettings"``), ``cdt`` (local timestamp of the
    run) and ``ne_links``. An AIDA annotation whose character span matches no
    input mention exactly is reported on stdout and left out of ``ne_links``.
    """
    owns_gateway = gateway is None
    if owns_gateway:
        gateway = JavaGateway()

    try:
        aida_server = gateway.entry_point.getAida()
        output = []

        for document in documents:

            prepared_input = document['input']
            mentions = gateway.entry_point.findExtractedMentions(prepared_input)
            annotations = aida_server.disambiguate(prepared_input, mentions, "fullSettings")

            output_doc = {
                "id": document['id'],
                "sys_id": "aidalight-fullSettings",
                "cdt": strftime("%Y-%m-%d %H:%M:%S")
            }
            linked_entities = []

            for mention in annotations.keySet():

                annotation = annotations.get(mention)
                entity_id = annotation.getName()

                # AIDA's span is matched against the input mentions' offsets to
                # recover the original mention id and surface form.
                matching_mention = _find_matching_mention(
                    document.get('nes'),
                    mention.getCharOffset(),
                    mention.getCharLength()
                )
                if matching_mention is None:
                    print(f"error in {document['id']}")
                    continue

                linked_entities.append(
                    {
                        "mention_id": matching_mention['id'],
                        "surface": matching_mention['surface'],
                        "entity_id": entity_id,
                        "entity_link": f"http://en.wikipedia.org/wiki/{entity_id}",
                        "normalized_name": annotation.getNMEnormalizedName(),
                        "mention_entity_similarity": annotation.getMentionEntitySimilarity()
                    }
                )
            output_doc['ne_links'] = linked_entities
            output.append(output_doc)

        return output
    finally:
        # Only release connections this call opened itself.
        if owns_gateway:
            gateway.close()

In [66]:
!s3cmd ls s3://processed-canonical-data/entities/aida-output/

In [11]:
aida_input_files = fixed_s3fs_glob("s3://processed-canonical-data/entities/aida-input/000*.bz2")

In [12]:
aida_input_files

['s3://processed-canonical-data/entities/aida-input/0000.bz2',
 's3://processed-canonical-data/entities/aida-input/0001.bz2',
 's3://processed-canonical-data/entities/aida-input/0002.bz2',
 's3://processed-canonical-data/entities/aida-input/0003.bz2',
 's3://processed-canonical-data/entities/aida-input/0004.bz2',
 's3://processed-canonical-data/entities/aida-input/0005.bz2',
 's3://processed-canonical-data/entities/aida-input/0006.bz2',
 's3://processed-canonical-data/entities/aida-input/0007.bz2',
 's3://processed-canonical-data/entities/aida-input/0008.bz2',
 's3://processed-canonical-data/entities/aida-input/0009.bz2']

In [13]:
len(aida_input_files)

10

In [19]:
aida_input_bag = db.read_text(
    aida_input_files,
    storage_options=IMPRESSO_STORAGEOPT
).map(json.loads).persist()

In [29]:
test = aida_input_bag.take(10)

In [31]:
aida_disambiguate_documents(test, gateway)

[{'id': 'BDC-1839-01-20-a-i0001',
  'sys_id': 'aidalight-fullSettings',
  'cdt': '2019-06-20 18:12:12',
  'ne_links': [{'mention_id': 'BDC-1839-01-20-a-i0001:0:9:person:nerb-0001',
    'surface': 'AVIS Dans',
    'entity_id': '--NME--',
    'entity_link': 'http://en.wikipedia.org/wiki/--NME--',
    'normalized_name': '--NME--',
    'mention_entity_similarity': -1.0},
   {'mention_id': 'BDC-1839-01-20-a-i0001:1438:1444:location:nerb-0001',
    'surface': 'Suisse',
    'entity_id': 'Suisse,_Moselle',
    'entity_link': 'http://en.wikipedia.org/wiki/Suisse,_Moselle',
    'normalized_name': 'Suisse,_Moselle',
    'mention_entity_similarity': -1.0}]},
 {'id': 'BDC-1839-01-20-a-i0002',
  'sys_id': 'aidalight-fullSettings',
  'cdt': '2019-06-20 18:12:12',
  'ne_links': [{'mention_id': 'BDC-1839-01-20-a-i0002:2691:2698:location:nerb-0001',
    'surface': 'Maurice',
    'entity_id': 'Maurice_Cox',
    'entity_link': 'http://en.wikipedia.org/wiki/Maurice_Cox',
    'normalized_name': 'Maurice_Cox

In [32]:
local_input_dir = "/media/romanell/4T/matteo/impresso-entities/aida-input/"
local_output_dir = "/media/romanell/4T/matteo/impresso-entities/aida-output/"

In [62]:
range_start = 0
range_end = 1000

In [66]:
# Select the local shards whose number lies in [range_start, range_end).
# Directory entries that are not named `<number>.bz2` are skipped instead of
# crashing `int()`.
aida_input_files = [
    os.path.join(local_input_dir, file)
    for file in os.listdir(local_input_dir)
    if file.endswith(".bz2")
    and file[:-len(".bz2")].isdecimal()
    and range_start <= int(file[:-len(".bz2")]) < range_end
]

In [67]:
# One output path per input shard: same file name, placed in the configured
# `local_output_dir` (rather than derived by a substring replace on the whole
# input path, which silently ignored that setting).
aida_output_files = [
    os.path.join(local_output_dir, os.path.basename(file))
    for file in aida_input_files
]

In [68]:
len(aida_input_files)

1000

In [69]:
aida_input_files[:10]

['/media/romanell/4T/matteo/impresso-entities/aida-input/0013.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-input/0128.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-input/0973.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-input/0058.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-input/0095.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-input/0601.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-input/0099.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-input/0433.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-input/0721.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-input/0756.bz2']

In [71]:
aida_output_files[:10]

['/media/romanell/4T/matteo/impresso-entities/aida-output/0013.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-output/0128.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-output/0973.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-output/0058.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-output/0095.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-output/0601.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-output/0099.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-output/0433.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-output/0721.bz2',
 '/media/romanell/4T/matteo/impresso-entities/aida-output/0756.bz2']

In [None]:
# Local run: parse the selected shards, link each partition's documents with
# AIDA and write the JSON-serialised results to `aida_output_files`.
# Note that the name `aida_input_bag` ends up bound to what `to_textfiles`
# returns.
aida_input_bag = (
    db.read_text(aida_input_files, storage_options=IMPRESSO_STORAGEOPT)
    .map(json.loads)
    .map_partitions(aida_disambiguate_documents)
    .map(json.dumps)
    .to_textfiles(aida_output_files)  # local paths: no storage_options passed
)

## Disambiguation (s3 input)

In [17]:
# GDL shards prepared on S3 are read as JSON lines and linked with AIDA.
# `aida_disambiguate_documents` expects a batch of documents, so it is applied
# per partition; it opens its py4j gateway once per call, i.e. once per
# partition rather than once per document.
aida_input = "s3://processed-canonical-data/entities/aida-input/GDL/*.bz2"
aida_input_bag = (
    db.read_text(aida_input, storage_options=IMPRESSO_STORAGEOPT)
    .map(json.loads)
    .map_partitions(aida_disambiguate_documents)
)

In [19]:
%%time
# Serialise every linked document to JSON and write the results back to S3;
# `aida_results` holds whatever `to_textfiles` returns.
aida_results = (
    aida_input_bag
    .map(json.dumps)
    .to_textfiles(
        "s3://processed-canonical-data/entities/aida-output/GDL/*.bz2",
        storage_options=IMPRESSO_STORAGEOPT,
    )
)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/romanell/.local/share/virtualenvs/impresso-ne-linking-3S6qaXXb/lib/python3.6/site-packages/py4j/java_gateway.py", line 1188, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/romanell/.local/share/virtualenvs/impresso-ne-linking-3S6qaXXb/lib/python3.6/site-packages/py4j/java_gateway.py", line 1014, in send_command
    response = connection.send_command(command)
  File "/home/romanell/.local/share/virtualenvs/impresso-ne-linking-3S6qaXXb/lib/python3.6/site-packages/py4j/java_gateway.py", line 1193, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving


CPU times: user 1h 23min 39s, sys: 9min 14s, total: 1h 32min 54s
Wall time: 19h 11min 15s
