In [207]:
import json
import os
import re

import numpy as np
from dask import bag as db
from dask import dataframe as dd
from dask_k8 import DaskCluster
from impresso_commons.path.path_s3 import IMPRESSO_STORAGEOPT

## Set up the dask/kube cluster

In [3]:
# Kubernetes pod spec (YAML text) for the dask workers; it is handed to
# DaskCluster as `worker_pod_spec` in the next cell. Each worker container runs
# the daskdev/dask:1.1.5 image with one thread and a 5GB memory limit.
# The two `{}` placeholders are filled, in order, with the SE_ACCESS_KEY and
# SE_SECRET_KEY values read from the local environment, so a KeyError is raised
# here if either variable is unset.
# NOTE(review): the rendered string contains the secret key in clear text --
# avoid displaying `kube_cfg` as a cell output.
kube_cfg = """
  containers:
    - image: daskdev/dask:1.1.5
      args: [dask-worker, $(DASK_SCHEDULER_ADDRESS), --nthreads, '1', --no-bokeh, --memory-limit, 5GB, --death-timeout, '120']
      imagePullPolicy: Always
      name: dask-worker
      env:
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: EXTRA_PIP_PACKAGES
          value: s3fs
        - name: EXTRA_CONDA_PACKAGES
          value:
        - name: SE_ACCESS_KEY
          value: {}
        - name: SE_SECRET_KEY
          value: {}
      resources:
        requests:
          cpu: 1
          memory: "5G"
        limits:
          cpu: 1
          memory: "5G"
      volumeMounts:
        - mountPath: /scratch
          name: scratch
          subPath: romanell
  volumes:
    - name: scratch
      persistentVolumeClaim:
        claimName: dhlab-scratch
""".format(
    os.environ["SE_ACCESS_KEY"],
    os.environ["SE_SECRET_KEY"]
)

In [205]:
cluster = DaskCluster(namespace="dhlab", cluster_id="matteo-dask", worker_pod_spec=kube_cfg)

In [206]:
# NOTE(review): in top-to-bottom order this closes the cluster before
# `cluster.create()` in the next cell has run. The execution counts suggest it
# was run out of order to tear down an earlier cluster -- confirm, or drop it.
cluster.close()

In [7]:
# Create the cluster (scheduler and dashboard addresses are printed below),
# then request 40 workers.
# presumably `blocking=False` returns without waiting for the worker pods --
# confirm against the dask_k8 documentation.
cluster.create()
cluster.scale(40, blocking=False)

Scheduler: tcp://10.90.47.27:32079
Dashboard: http://10.90.47.27:14942


## Read in passim data

In [111]:
client = cluster.make_dask_client()

In [112]:
# S3 locations. The passim output is read from `input_bucket` + `path_to_data`;
# `path_output` is the key prefix, under the same bucket, used when writing results.
# NOTE: `path_output` is reassigned further down, before each write section.
input_bucket = "s3://processed-canonical-data/text-reuse/"
path_to_data = "test/out.json/"
path_output = "test/clusters.json/"

In [113]:
# Load every `*.json` file under the passim output prefix into one dask
# dataframe, index it by `uid`, and persist() it so the groupby cells below
# work from the same collection.
# assumes the files hold one JSON record per line -- TODO confirm
df = dd.read_json(
    f"{os.path.join(input_bucket, path_to_data)}*.json", 
    storage_options=IMPRESSO_STORAGEOPT
).set_index('uid').persist()

In [114]:
df.head()

Unnamed: 0_level_0,begin,cc,cluster,date,end,gid,id,pages,series,size,text,title
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-9223360541113828267,168,True,352187320617,1968-02-24,955,2328324961100034523,GDL-1968-02-24-a-i0240,"[{'id': 'GDL-1968-02-24-a-p0020', 'seq': 20, '...",GDL,10,TV.\n18.45 Bulletin de nouvelles.\n18.50 Publi...,• Tél é ^tr e bon plaisir
-9223360541113828267,39,True,32,1968-02-24,2017,2328324961100034523,GDL-1968-02-24-a-i0240,"[{'id': 'GDL-1968-02-24-a-p0020', 'seq': 20, '...",GDL,80757,SAMEDI 24 FÉVRIER\nProgramme suisse\n14. 00-15...,• Tél é ^tr e bon plaisir
-9223360541113828267,2027,True,32,1968-02-24,4301,2328324961100034523,GDL-1968-02-24-a-i0240,"[{'id': 'GDL-1968-02-24-a-p0020', 'seq': 20, '...",GDL,80757,DIMANCHE 25 FÉVRIER\nProgramme suisse\n10.15 P...,• Tél é ^tr e bon plaisir
-9223360541113828267,3166,True,1228360652012,1968-02-24,3457,2328324961100034523,GDL-1968-02-24-a-i0240,"[{'id': 'GDL-1968-02-24-a-p0020', 'seq': 20, '...",GDL,3,"17.15 Kiri le Clown.\n17.25 Ramuntcho (1958), ...",• Tél é ^tr e bon plaisir
-9223360541113828267,2093,True,17179874715,1968-02-24,2223,2328324961100034523,GDL-1968-02-24-a-i0240,"[{'id': 'GDL-1968-02-24-a-p0020', 'seq': 20, '...",GDL,55,"11.00 Un'ora per voi, reprise.\n12.00 Table ou...",• Tél é ^tr e bon plaisir


In [155]:
df.cluster.unique().compute()

0          352187320617
1                    32
2         1228360652012
3           17179874715
4         1314259992603
5            8589936556
6          180388630665
7          566935687689
8          566935687695
9          566935687691
10         377957127583
11         841813596481
12         695784706415
13         455266535644
14          34359742492
15         223338301704
16         283467841536
17          77309418644
18                 4326
19                 4328
20                 4329
21                 4330
22                 4331
23                 4327
24         747324317615
25         858993459200
26        1073741826470
27         773094117781
28         498216210450
29        1013612288284
              ...      
458539     592705493104
458540     206158434175
458541     678604839730
458542     231928240365
458543    1133871372599
458544      34359742823
458545     455266535639
458546     695784710894
458547      94489286979
458548      94489286978
458549     33500

In [117]:
# One row per cluster: earliest and latest `date` of its passages, plus the
# number of passages (count over the `size` column). After compute() this is a
# pandas DataFrame with two-level columns: ('date', 'min'), ('date', 'max')
# and ('size', 'count').
clusters_df = df.groupby('cluster').agg({'date': ['min', 'max'], 'size': 'count'}).compute()

In [118]:
# Time span covered by each cluster: latest date minus earliest date.
clusters_df.loc[:,'time_delta'] = clusters_df.date['max'] - clusters_df.date['min']

In [119]:
clusters_df.head()

Unnamed: 0_level_0,date,date,size,time_delta
Unnamed: 0_level_1,min,max,count,Unnamed: 4_level_1
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1982-07-24,1982-07-24,2,0 days
1,1982-07-24,1988-11-22,2,2313 days
2,1972-02-17,1972-02-17,2,0 days
3,1972-02-17,1972-02-17,2,0 days
4,1972-02-17,1972-02-17,2,0 days


In [141]:
# Simple sanity check: count the text-reuse (TR) instances in each cluster.
df.groupby('cluster')['size'].count().compute()

cluster
32               80757
89                2649
267                 65
279                 53
283                 35
287                 34
532               3256
674                642
821                152
3301               243
4326                 2
4327                 2
4328                 2
4329                 2
4330                 2
4331                 2
8589936556          35
8589938473          32
8589938915           2
8589938943           2
8589938944           2
8589939157          45
8589939175          71
8589940471         302
17179869184          3
17179869786         25
17179871344          2
17179873216          2
17179873350         27
17179874715         55
                 ...  
1159641178619        2
1271310322041        2
17179873390          2
17179873391          2
17179875608          2
34359742823          2
42949675053          2
42949679694          2
94489286978          2
94489286979          2
94489289113          2
188978564943         2
231

In [115]:
# get the list of newspapers for which a cluster contains TR instances
np = df.groupby('cluster').apply(lambda r:",".join(sorted(r['series'].unique())), meta=('np', object)).compute().sort_index()

In [120]:
clusters_df.loc[:, 'newspapers'] = np

In [121]:
clusters_df.head()

Unnamed: 0_level_0,date,date,size,time_delta,newspapers
Unnamed: 0_level_1,min,max,count,Unnamed: 4_level_1,Unnamed: 5_level_1
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,1982-07-24,1982-07-24,2,0 days,"GDL,IMP"
1,1982-07-24,1988-11-22,2,2313 days,"GDL,IMP"
2,1972-02-17,1972-02-17,2,0 days,"GDL,IMP"
3,1972-02-17,1972-02-17,2,0 days,"GDL,IMP"
4,1972-02-17,1972-02-17,2,0 days,"GDL,IMP"


In [123]:
def mint_document_ids(row):
    """Build the comma-separated list of passage identifiers for one cluster.

    Each passage is rendered as ``<id>@<begin>:<end>``, in the order in which
    the passages appear in ``row``.

    Parameters
    ----------
    row : mapping of column name to sequence
        Must provide the ``'id'``, ``'begin'`` and ``'end'`` columns. In this
        notebook it is the group passed in by ``groupby('cluster').apply``.

    Returns
    -------
    str
        The joined identifiers; an empty string when ``row`` has no passages.
    """
    return ",".join(
        "{}@{}:{}".format(doc_id, begin, end)
        for doc_id, begin, end in zip(row['id'], row['begin'], row['end'])
    )

In [124]:
doc_ids = df.groupby('cluster').apply(mint_document_ids, meta=('document_ids', object)).compute().sort_index()

In [125]:
clusters_df.loc[:, 'members'] = doc_ids

In [126]:
clusters_df.head()

Unnamed: 0_level_0,date,date,size,time_delta,newspapers,members
Unnamed: 0_level_1,min,max,count,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,1982-07-24,1982-07-24,2,0 days,"GDL,IMP","IMP-1982-07-24-a-i0012@0:3756,GDL-1982-07-24-a..."
1,1982-07-24,1988-11-22,2,2313 days,"GDL,IMP","IMP-1982-07-24-a-i0012@3262:3452,GDL-1988-11-2..."
2,1972-02-17,1972-02-17,2,0 days,"GDL,IMP","GDL-1972-02-17-a-i0078@86:1439,IMP-1972-02-17-..."
3,1972-02-17,1972-02-17,2,0 days,"GDL,IMP","GDL-1972-02-17-a-i0078@2119:2795,IMP-1972-02-1..."
4,1972-02-17,1972-02-17,2,0 days,"GDL,IMP","GDL-1972-02-17-a-i0078@1456:2030,IMP-1972-02-1..."


In [44]:
np.intersect1d?

In [30]:
import re
import numpy as np

def lexicaloverlap(row):
    """Percentage of tokens shared by all the texts of one cluster (first version).

    Every text is stripped of digits and of the punctuation listed in the
    pattern below, lower-cased and split on whitespace. The tokens common to
    all texts are then counted and expressed as a percentage of the length,
    in characters, of the longest raw text.

    Note that the numerator is a token count while the denominator is a
    character count; ``lexicaloverlap_revised`` normalises by token count.

    Parameters
    ----------
    row : mapping of column name to sequence
        Must provide the ``'text'`` column (an iterable of strings).

    Returns
    -------
    float
        The overlap percentage; ``0.0`` when there are no texts or when every
        text is empty (the original raised ValueError / ZeroDivisionError).
    """
    # Raw string: '\]' is not a valid Python string escape, so the non-raw
    # literal triggered an invalid-escape warning. The regex is unchanged.
    strip_pattern = r'[().,;:!0-9"{}\][»«]'

    texts = list(row['text'])
    if not texts:
        return 0.0

    longest_text_length = max(len(text) for text in texts)
    if longest_text_length == 0:
        return 0.0

    intersection = None
    for text in texts:
        tokens = re.sub(strip_pattern, '', text).lower().split()
        if intersection is None:
            # The first text seeds the running intersection.
            intersection = tokens
        else:
            intersection = np.intersect1d(tokens, intersection)

    return (len(intersection) * 100) / longest_text_length

In [127]:
import re
import numpy as np

def lexicaloverlap_revised(row):
    """Percentage of distinct tokens shared by all the texts of one cluster.

    Every text is stripped of digits and of the punctuation listed in the
    pattern below, lower-cased, split on whitespace and reduced to a set of
    tokens. The result is the size of the intersection of all the sets as a
    percentage of the size of the largest set.

    Parameters
    ----------
    row : mapping of column name to sequence
        Must provide the ``'text'`` column (an iterable of strings).

    Returns
    -------
    float
        The overlap percentage; ``0.0`` when there are no texts or when no
        text yields any token (the original raised TypeError /
        ZeroDivisionError in those cases).
    """
    # Raw string: '\]' is not a valid Python string escape, so the non-raw
    # literal triggered an invalid-escape warning. The regex is unchanged.
    strip_pattern = r'[().,;:!0-9"{}\][»«]'

    token_sets = [
        set(re.sub(strip_pattern, '', text).lower().split())
        for text in row['text']
    ]
    if not token_sets:
        return 0.0

    largest_set_size = max(len(tokens) for tokens in token_sets)
    if largest_set_size == 0:
        return 0.0

    shared_tokens = set.intersection(*token_sets)
    return (len(shared_tokens) * 100) / largest_set_size

In [31]:
# Overlap computed with the first version of the metric (`lexicaloverlap`).
# NOTE(review): `overlap` is not referenced by any later cell in this notebook;
# only `overlap2` ends up in `clusters_df`.
overlap = df.groupby('cluster').apply(lexicaloverlap, meta=('lexical_overlap', object)).compute().sort_index()

In [128]:
# Per-cluster lexical overlap (revised metric), as a Series sorted by cluster id.
# `meta` declares a float64 result, matching what `lexicaloverlap_revised`
# returns (the previous `object` declaration did not match the actual dtype).
overlap2 = df.groupby('cluster').apply(lexicaloverlap_revised, meta=('lexical_overlap2', 'f8')).compute().sort_index()

In [129]:
overlap2.median()

67.5

In [130]:
clusters_df.loc[:, 'lexical_overlap'] = overlap2

In [131]:
clusters_df.head()

Unnamed: 0_level_0,date,date,size,time_delta,newspapers,members,lexical_overlap
Unnamed: 0_level_1,min,max,count,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,1982-07-24,1982-07-24,2,0 days,"GDL,IMP","IMP-1982-07-24-a-i0012@0:3756,GDL-1982-07-24-a...",76.265823
1,1982-07-24,1988-11-22,2,2313 days,"GDL,IMP","IMP-1982-07-24-a-i0012@3262:3452,GDL-1988-11-2...",64.285714
2,1972-02-17,1972-02-17,2,0 days,"GDL,IMP","GDL-1972-02-17-a-i0078@86:1439,IMP-1972-02-17-...",82.857143
3,1972-02-17,1972-02-17,2,0 days,"GDL,IMP","GDL-1972-02-17-a-i0078@2119:2795,IMP-1972-02-1...",82.142857
4,1972-02-17,1972-02-17,2,0 days,"GDL,IMP","GDL-1972-02-17-a-i0078@1456:2030,IMP-1972-02-1...",65.0


In [138]:
# Collect the raw `size` values of every passage, grouped by cluster.
# NOTE(review): `size` is not referenced by any later cell in this notebook;
# the `cluster_size` column below is taken from the count already stored in
# `clusters_df` instead -- this cell can presumably be dropped.
size = df.groupby('cluster').apply(lambda x: x['size'], meta=('cluster_size', object)).compute().sort_index()

In [146]:
# Copy the per-cluster passage count, ('size', 'count'), into a `cluster_size` column.
clusters_df.loc[:, 'cluster_size'] = clusters_df['size']['count']

**TODO**: 
- for each cluster add a column `members` with IDs of content items in the cluster
    * example: `GDL-1968-02-24-a-i0240@168:955` (concatenate `id`, `begin`, `end` as `id@begin:end`)
- `lexical overlap`
    * transform into a percentage
    * lowercase text before counting
    * take the length of longest text to compute the percentage
- try to write resulting dataframe to s3
- transform into a command line script

## Write to s3 (JSON)

In [180]:
clusters_df.head()

Unnamed: 0_level_0,date,date,size,time_delta,newspapers,members,lexical_overlap,cluster_size
Unnamed: 0_level_1,min,max,count,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,1982-07-24,1982-07-24,2,0 days,"GDL,IMP","IMP-1982-07-24-a-i0012@0:3756,GDL-1982-07-24-a...",76.265823,2
1,1982-07-24,1988-11-22,2,2313 days,"GDL,IMP","IMP-1982-07-24-a-i0012@3262:3452,GDL-1988-11-2...",64.285714,2
2,1972-02-17,1972-02-17,2,0 days,"GDL,IMP","GDL-1972-02-17-a-i0078@86:1439,IMP-1972-02-17-...",82.857143,2
3,1972-02-17,1972-02-17,2,0 days,"GDL,IMP","GDL-1972-02-17-a-i0078@2119:2795,IMP-1972-02-1...",82.142857,2
4,1972-02-17,1972-02-17,2,0 days,"GDL,IMP","GDL-1972-02-17-a-i0078@1456:2030,IMP-1972-02-1...",65.0,2


In [181]:
temp = clusters_df[["time_delta", "newspapers", "members", "lexical_overlap", "cluster_size"]].reset_index()

In [182]:
clusters_ddf = dd.from_pandas(temp, chunksize=1000)

In [184]:
clusters_ddf.index

Dask Index Structure:
npartitions=459
0         int64
1000        ...
          ...  
458000      ...
458568      ...
dtype: int64
Dask Name: from_pandas, 918 tasks

In [185]:
clusters_ddf.head()

Unnamed: 0,cluster,time_delta,newspapers,members,lexical_overlap,cluster_size
,,,,,,
0.0,0.0,0 days,"GDL,IMP","IMP-1982-07-24-a-i0012@0:3756,GDL-1982-07-24-a...",76.265823,2.0
1.0,1.0,2313 days,"GDL,IMP","IMP-1982-07-24-a-i0012@3262:3452,GDL-1988-11-2...",64.285714,2.0
2.0,2.0,0 days,"GDL,IMP","GDL-1972-02-17-a-i0078@86:1439,IMP-1972-02-17-...",82.857143,2.0
3.0,3.0,0 days,"GDL,IMP","GDL-1972-02-17-a-i0078@2119:2795,IMP-1972-02-1...",82.142857,2.0
4.0,4.0,0 days,"GDL,IMP","GDL-1972-02-17-a-i0078@1456:2030,IMP-1972-02-1...",65.0,2.0


In [186]:
path_output = 'test/clusters.json/'

In [157]:
dd.read_json?

In [None]:
# Dask dataframes have a problem creating JSON output from a multi-index
# dataframe: the JSON they create is not valid and cannot be read back into a
# dask dataframe, so the format below (orient='split', no index) is a workaround.
output_files = dd.to_json(
    clusters_ddf.reset_index(),
    url_path=f"{os.path.join(input_bucket, path_output)}*.json", 
    orient='split',
    index=False,
    storage_options=IMPRESSO_STORAGEOPT
)

In [None]:
output_files

In [188]:
# Read the written JSON files back as text and parse each element with
# `read_json`, to check the output.
# NOTE(review): `read_json` is only defined in the next cell and this call is
# repeated verbatim right after it, so this copy fails on a fresh
# top-to-bottom run -- it can presumably be removed.
test = db.read_text(
    f"{os.path.join(input_bucket, path_output)}*.json", 
    storage_options=IMPRESSO_STORAGEOPT
).map(read_json).compute()

In [100]:
def read_json(string):
    """Parse one JSON document, returning ``None`` when it cannot be parsed.

    This is deliberately best-effort: it is mapped over every element of a
    bag, and one malformed element must not abort the whole computation.

    Parameters
    ----------
    string : str or bytes
        The serialized JSON document.

    Returns
    -------
    object or None
        The decoded value, or ``None`` if ``string`` is not valid JSON or is
        not a string/bytes object.
    """
    try:
        return json.loads(string)
    except (TypeError, ValueError, RecursionError):
        # json.JSONDecodeError and UnicodeDecodeError are ValueError
        # subclasses; TypeError covers non-string input. Anything else is a
        # genuine bug and is allowed to propagate.
        return None

In [101]:
test = db.read_text(
    f"{os.path.join(input_bucket, path_output)}*.json", 
    storage_options=IMPRESSO_STORAGEOPT
).map(read_json).compute()

In [109]:
json.loads?

## Write output to s3 (CSV)

In [62]:
# NOTE(review): this rebinds `clusters_ddf` (built from the column subset
# `temp` in the JSON section) to the full `clusters_df`, two-level columns
# included, split into chunks of 1000 rows.
clusters_ddf = dd.from_pandas(clusters_df, chunksize=1000)

In [77]:
path_output = 'test/clusters'

In [78]:
# Write `clusters_ddf` to S3 as CSV; the `*` in the path is replaced per output
# file, and the call returns the list of written keys (shown below).
# NOTE(review): the listed keys contain a `-` (`clusters-000.csv`) that
# `path_output` as set in the previous cell does not -- the output presumably
# comes from an earlier run with a different prefix; confirm.
dd.to_csv(
    clusters_ddf,
    url_path=f"{os.path.join(input_bucket, path_output)}*.csv", 
    storage_options=IMPRESSO_STORAGEOPT
)

['processed-canonical-data/text-reuse/test/clusters-000.csv',
 'processed-canonical-data/text-reuse/test/clusters-001.csv',
 'processed-canonical-data/text-reuse/test/clusters-002.csv',
 'processed-canonical-data/text-reuse/test/clusters-003.csv',
 'processed-canonical-data/text-reuse/test/clusters-004.csv',
 'processed-canonical-data/text-reuse/test/clusters-005.csv',
 'processed-canonical-data/text-reuse/test/clusters-006.csv',
 'processed-canonical-data/text-reuse/test/clusters-007.csv',
 'processed-canonical-data/text-reuse/test/clusters-008.csv',
 'processed-canonical-data/text-reuse/test/clusters-009.csv',
 'processed-canonical-data/text-reuse/test/clusters-010.csv',
 'processed-canonical-data/text-reuse/test/clusters-011.csv',
 'processed-canonical-data/text-reuse/test/clusters-012.csv',
 'processed-canonical-data/text-reuse/test/clusters-013.csv',
 'processed-canonical-data/text-reuse/test/clusters-014.csv',
 'processed-canonical-data/text-reuse/test/clusters-015.csv',
 'proces

In [87]:
dd.to_json?

In [84]:
test_ddf = dd.read_csv(
    urlpath=f"{os.path.join(input_bucket, path_output)}-*.csv", 
    assume_missing=True,
    storage_options=IMPRESSO_STORAGEOPT
)

In [86]:
test_ddf.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 8 entries, Unnamed: 0 to lexical_overlap
dtypes: object(7), float64(1)

## Release resources

In [193]:
cluster.close()

In [204]:
!s3cmd ls s3://processed-canonical-data/text-reuse/test/clusters.json/

2019-05-24 10:50   3379737   s3://processed-canonical-data/text-reuse/test/clusters.json/000.json
2019-05-24 10:50    295705   s3://processed-canonical-data/text-reuse/test/clusters.json/001.json
2019-05-24 10:50    312058   s3://processed-canonical-data/text-reuse/test/clusters.json/002.json
2019-05-24 10:50    267720   s3://processed-canonical-data/text-reuse/test/clusters.json/003.json
2019-05-24 10:50    229227   s3://processed-canonical-data/text-reuse/test/clusters.json/004.json
2019-05-24 10:50    215796   s3://processed-canonical-data/text-reuse/test/clusters.json/005.json
2019-05-24 10:50    240130   s3://processed-canonical-data/text-reuse/test/clusters.json/006.json
2019-05-24 10:50    211657   s3://processed-canonical-data/text-reuse/test/clusters.json/007.json
2019-05-24 10:50    169349   s3://processed-canonical-data/text-reuse/test/clusters.json/008.json
2019-05-24 10:50    181646   s3://processed-canonical-data/text-reuse/test/clusters.json/009.json
2019-05-24

2019-05-24 10:50    121938   s3://processed-canonical-data/text-reuse/test/clusters.json/282.json
2019-05-24 10:50    121849   s3://processed-canonical-data/text-reuse/test/clusters.json/283.json
2019-05-24 10:50    121750   s3://processed-canonical-data/text-reuse/test/clusters.json/284.json
2019-05-24 10:50    121992   s3://processed-canonical-data/text-reuse/test/clusters.json/285.json
2019-05-24 10:50    121065   s3://processed-canonical-data/text-reuse/test/clusters.json/286.json
2019-05-24 10:50    122769   s3://processed-canonical-data/text-reuse/test/clusters.json/287.json
2019-05-24 10:50    120794   s3://processed-canonical-data/text-reuse/test/clusters.json/288.json
2019-05-24 10:50    122466   s3://processed-canonical-data/text-reuse/test/clusters.json/289.json
2019-05-24 10:50    122916   s3://processed-canonical-data/text-reuse/test/clusters.json/290.json
2019-05-24 10:50    120361   s3://processed-canonical-data/text-reuse/test/clusters.json/291.json
2019-05-24

In [201]:
!s3cmd del s3://processed-canonical-data/text-reuse/test/clusters*.json

delete: 's3://processed-canonical-data/text-reuse/test/clusters000.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters001.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters002.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters003.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters004.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters005.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters006.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters007.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters008.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters009.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters010.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters011.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters012.json'
delete: 's3://processed-canonical-data/text-reuse/t

delete: 's3://processed-canonical-data/text-reuse/test/clusters113.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters114.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters115.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters116.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters117.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters118.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters119.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters120.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters121.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters122.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters123.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters124.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters125.json'
delete: 's3://processed-canonical-data/text-reuse/t

delete: 's3://processed-canonical-data/text-reuse/test/clusters226.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters227.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters228.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters229.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters230.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters231.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters232.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters233.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters234.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters235.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters236.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters237.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters238.json'
delete: 's3://processed-canonical-data/text-reuse/t

delete: 's3://processed-canonical-data/text-reuse/test/clusters339.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters340.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters341.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters342.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters343.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters344.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters345.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters346.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters347.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters348.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters349.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters350.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters351.json'
delete: 's3://processed-canonical-data/text-reuse/t

delete: 's3://processed-canonical-data/text-reuse/test/clusters453.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters454.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters455.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters456.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters457.json'
delete: 's3://processed-canonical-data/text-reuse/test/clusters458.json'
