In [1]:
import json
import os

from dask import dataframe as dd
from dask_k8 import DaskCluster
from impresso_commons.path.path_s3 import IMPRESSO_STORAGEOPT

## Set up the Dask cluster on Kubernetes

In [2]:
# Worker pod spec (YAML text) passed to DaskCluster as `worker_pod_spec`.
# The two `{}` placeholders receive the SE_ACCESS_KEY / SE_SECRET_KEY values
# read from this process's environment; os.environ[...] raises KeyError when
# either variable is unset.
# Each value is rendered with json.dumps so that it lands in the YAML as a
# double-quoted string. Substituted bare, a value made only of digits, or one
# containing sequences such as ": " or " #", would be re-typed or cut short by
# the YAML parser.
# NOTE(review): the credentials end up in clear text inside the pod spec;
# consider a Kubernetes Secret referenced via `valueFrom.secretKeyRef`.
kube_cfg = """
  containers:
    - image: daskdev/dask:1.1.5
      args: [dask-worker, $(DASK_SCHEDULER_ADDRESS), --nthreads, '1', --no-bokeh, --memory-limit, 5GB, --death-timeout, '120']
      imagePullPolicy: Always
      name: dask-worker
      env:
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: EXTRA_PIP_PACKAGES
          value: s3fs
        - name: EXTRA_CONDA_PACKAGES
          value:
        - name: SE_ACCESS_KEY
          value: {}
        - name: SE_SECRET_KEY
          value: {}
      resources:
        requests:
          cpu: 1
          memory: "5G"
        limits:
          cpu: 1
          memory: "5G"
      volumeMounts:
        - mountPath: /scratch
          name: scratch
          subPath: romanell
  volumes:
    - name: scratch
      persistentVolumeClaim:
        claimName: dhlab-scratch
""".format(
    json.dumps(os.environ["SE_ACCESS_KEY"]),
    json.dumps(os.environ["SE_SECRET_KEY"]),
)

In [3]:
# Build the cluster object from the namespace, a cluster identifier and the
# worker pod spec held in `kube_cfg`.
cluster = DaskCluster(
    worker_pod_spec=kube_cfg,
    cluster_id="matteo-dask",
    namespace="dhlab",
)

In [4]:
# Create the cluster, then request 10 workers; the log printed below shows the
# worker count rising until all 10 have been reached.
cluster.create()
cluster.scale(10)

Scheduler: tcp://10.90.47.35:25803
Dashboard: http://10.90.47.35:24360
Currently 3 workers out of the 10 required, waiting...
Currently 8 workers out of the 10 required, waiting...
Reached the desired 10 workers!


## Read in the passim text-reuse output

In [5]:
# Obtain a Dask client from the cluster.
# NOTE(review): `client` is not referenced again in the cells shown here;
# presumably creating it makes later dask calls run on this cluster — confirm.
client = cluster.make_dask_client()

In [13]:
# S3 location of the data to load: the bucket prefix, then the sub-path under it.
input_bucket = "s3://processed-canonical-data/text-reuse/"
path_to_data = "test/out.json/"

In [14]:
# Build the glob with explicit "/" separators instead of os.path.join:
# os.path.join applies local-filesystem rules (it discards the bucket prefix
# when the second part starts with "/"), which is not what an S3 URL needs.
# Stripping the slashes first makes the result independent of whether the two
# parts were written with leading/trailing "/".
json_glob = f"{input_bucket.rstrip('/')}/{path_to_data.strip('/')}/*.json"

# Read every matching JSON file into one Dask dataframe, using the
# impresso S3 storage options for access.
df = dd.read_json(json_glob, storage_options=IMPRESSO_STORAGEOPT)

In [15]:
# Display the dataframe's structure (column dtypes and partition count).
df

Unnamed: 0_level_0,begin,cc,cluster,date,end,gid,id,pages,series,size,text,title,uid
npartitions=705,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,int64,bool,int64,datetime64[ns],int64,int64,object,object,object,int64,object,object,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...


In [10]:
# Unpack the shape; the row count is lazy and needs .compute() to yield a number.
n_rows_future, n_cols = df.shape

In [11]:
# Evaluate the lazy row count.
n_rows_future.compute()

1170211

In [14]:
# Preview the first rows of the dataframe.
df.head()

Unnamed: 0,begin,cc,cluster,date,end,gid,id,pages,series,size,text,title,uid
0,1670,True,32,1937-10-09,2151,7980028469543413257,IMP-1937-10-09-a-i0064,"[{'id': 'IMP-1937-10-09-a-p0005', 'seq': 5, 'r...",IMP,80757,"Dimanche 10 octobre\nRadio Suisse romande : 9,...",r^T ftAD/OPJJOMQÙE,-1539987974230298909
1,1968,True,32,1938-05-28,2484,7980028469543413257,IMP-1938-05-28-a-i0110,"[{'id': 'IMP-1938-05-28-a-p0009', 'seq': 9, 'r...",IMP,80757,"Causerie religieuse protestante. 19,30 Inter-\...",'Q&^ CHRONIQUE ^7 XAD/OPUOM/QUE,7517102616924565393
2,1931,True,32,1938-11-19,2643,7980028469543413257,IMP-1938-11-19-a-i0054,"[{'id': 'IMP-1938-11-19-a-p0005', 'seq': 5, 'r...",IMP,80757,Dimanche 20 novembre\nRadio Suisse romande : 9...,^M^ ^ CHRONIQUE 'rS? AAD/OP/JON/QUE,6256214283358341920
3,1787,True,32,1938-12-03,2424,7980028469543413257,IMP-1938-12-03-a-i0053,"[{'id': 'IMP-1938-12-03-a-p0005', 'seq': 5, 'r...",IMP,80757,danse.\nDimanche 4 décembre\nRadio Suisse roma...,£fe£ûl| CHRONIQUE r&lt;&gt;% RADIOPUONIQUE,5223345348506941832
4,2293,True,32,1938-12-10,3084,7980028469543413257,IMP-1938-12-10-a-i0089,"[{'id': 'IMP-1938-12-10-a-p0007', 'seq': 7, 'r...",IMP,80757,"gelées- 12,10 Le disque préféré de\nl'auditeur...",Q^^ CHRONIQUt r *7 P/ \ MO PU ON IQ Ut,-6034121963586273416


In [22]:
# Number of partitions the dataframe is split into.
df.npartitions

705

In [23]:
# Summary of the dataframe's columns and dtypes.
df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 13 entries, begin to uid
dtypes: datetime64[ns](1), object(5), bool(1), int64(6)

## Release resources

In [None]:
# Close the cluster once the analysis is finished (the "Release resources" step).
cluster.close()