In [1]:
import pandas as pd
import numpy as np
import dask
import dask.dataframe as dd

# with open("/home/ubuntu/work/therapeutic_accelerator/scripts/base.py") as f:
#     exec(f.read())

In [2]:
# Create embeddings function with specter model
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

# @dask.delayed
def tokenize_abstracts(abstracts):
    """Tokenize abstract text with the module-level SPECTER tokenizer.

    Args:
        abstracts: Text handed straight to ``tokenizer`` -- presumably a single
            string or a list of strings; confirm against callers.

    Returns:
        The tokenizer output as PyTorch tensors (``return_tensors="pt"``),
        with padding enabled and sequences truncated to at most 512 tokens.
    """
    encoding_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
        "max_length": 512,
    }
    return tokenizer(abstracts, **encoding_options)

# @dask.delayed
def get_embeddings(inputs):
    """Run the module-level model on tokenized inputs and return one vector per sequence.

    Args:
        inputs: Mapping of keyword arguments for ``model`` (for example the
            output of ``tokenize_abstracts``).

    Returns:
        ``last_hidden_state[:, 0, :]`` -- the hidden state at the first token
        position of every sequence in the batch.
    """
    # Local import: torch is not imported in the notebook's top import cell.
    import torch

    # Inference only: disable autograd so no computation graph is built or
    # kept alive by the returned tensor.
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state
    return hidden_states[:, 0, :]

# @dask.delayed
# def get_embeddings(result):
#     embeddings = result.last_hidden_state[:, 0, :]
#     return embeddings

# def full_embedding_pipeline(abstract):
#     inputs = tokenizer(abstracts, padding=True, truncation=True, return_tensors="pt", max_length=512)
#     embeddings = model(**inputs).last_hidden_state[:, 0, :]
#     return embeddings


In [3]:
from dask.distributed import Client, LocalCluster, progress

# Launch a scheduler and two local workers (2 threads and 3 GiB of memory each).
cluster = LocalCluster(
    name='local',
    n_workers=2,
    threads_per_worker=2,
    memory_limit='3GiB',
)

# Connect to the distributed cluster; this overrides the default scheduler.
client = Client(cluster)

# Alternative configuration kept for reference:
# client = Client(threads_per_worker=4, n_workers=10)

# Last expression: render the client summary as the cell output.
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 2
Total threads: 4,Total memory: 6.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:35009,Workers: 2
Dashboard: http://127.0.0.1:8787/status,Total threads: 4
Started: Just now,Total memory: 6.00 GiB

0,1
Comm: tcp://127.0.0.1:44505,Total threads: 2
Dashboard: http://127.0.0.1:36147/status,Memory: 3.00 GiB
Nanny: tcp://127.0.0.1:37307,
Local directory: /tmp/dask-scratch-space/worker-hvewfsqg,Local directory: /tmp/dask-scratch-space/worker-hvewfsqg

0,1
Comm: tcp://127.0.0.1:43129,Total threads: 2
Dashboard: http://127.0.0.1:41103/status,Memory: 3.00 GiB
Nanny: tcp://127.0.0.1:35551,
Local directory: /tmp/dask-scratch-space/worker-166nsin_,Local directory: /tmp/dask-scratch-space/worker-166nsin_


In [4]:
# Read only the `abstract` field of the first parquet shard.
df = dd.read_parquet('/home/ubuntu/work/data/abstracts/abstracts-1.parquet', columns = 'abstract', blocksize = '100MB')

# Materialize the abstracts in memory as a pandas object.
abstracts = df.compute()

# Turn back into a Dask collection, split into 10 partitions.
dd_abs = dd.from_pandas(abstracts, npartitions=10)

# Row count as the LAST expression so the notebook actually displays it
# (a bare expression in the middle of a cell is evaluated and discarded).
len(abstracts)

In [5]:
# Tokenize every abstract on the cluster, pull the results back into memory,
# then re-wrap them as a 10-partition Dask collection for the embedding step.
tokens = dd.from_pandas(
    dd_abs.apply(tokenize_abstracts, meta=('abstract', 'object')).compute(),
    npartitions=10,
)

In [6]:
# Build the lazy embedding step: apply `get_embeddings` to each tokenized
# abstract. Nothing is executed until `.compute()` is called on the result.
embeddings = tokens.apply(
    get_embeddings,
    meta=('embeddings', 'object'),
)

In [None]:
# Execute the lazy `embeddings` collection and keep the computed result.
# NOTE(review): this rebinds `embeddings` from the lazy collection to its
# result, so the cell cannot be re-run without first re-running the cell
# that builds the lazy `embeddings`.
embeddings = embeddings.compute()

In [None]:
import ipycytoscape

# Visualize the low-level Dask task graph behind `tokens`, after graph
# optimizations have been applied (`optimize_graph=True`).
tokens.visualize(optimize_graph=True)

In [None]:
# # dask read sql table

# df = dd.read_sql_table('abstracts',
#                        f'postgresql://postgres:{keys["postgres"]}@{config["database"]["host"]}:5432/postgres',
#                        index_col='id',
#                        npartitions=100)

# df.shape


# trying to write out to parquet to make things faster. Maybe it can be done in chunks?
# name_function = lambda x: f"abstracts-{x}.parquet"
# df.to_parquet('/home/ubuntu/work/data/abstracts/', name_function = name_function)

# # df = df.compute()
# # df.reset_index(drop = True, inplace = True)

In [None]:
# Tokenize the abstracts on the cluster, one future per batch.
#
# The previous version submitted `full_embedding_pipeline`, which only exists
# as commented-out code in this notebook (NameError on a fresh kernel), and it
# submitted the same full `abstracts` object 100 times instead of splitting it.
# The cells that consume `futures` treat each result as tokenizer output
# (`model(**inputs)`), so `tokenize_abstracts` is the function submitted here.
N_BATCHES = 100

# The tokenizer is given a plain list of strings per batch.
abstract_texts = abstracts.tolist()

# Ceiling division so every abstract lands in one of at most N_BATCHES batches.
batch_size = max(1, -(-len(abstract_texts) // N_BATCHES))

futures = [
    client.submit(tokenize_abstracts, abstract_texts[start:start + batch_size])
    for start in range(0, len(abstract_texts), batch_size)
]

In [None]:
# Inspect the second submitted future (displays its repr as the cell output).
futures[1]

In [None]:
# Display the whole list of submitted futures.
futures

In [None]:
# Block until the first future finishes and fetch its result locally.
# The next cell unpacks it as keyword arguments to `model`, so it is
# expected to be tokenizer output.
inputs = futures[0].result()

In [None]:
# Embed the fetched inputs with the shared helper, which evaluates
# `model(**inputs).last_hidden_state[:, 0, :]`.
test = get_embeddings(inputs)

In [None]:
# Convert the embedding tensor to nested Python lists and display it.
# NOTE(review): this renders every value; consider slicing before display.
test.tolist()

In [None]:
import boto3

In [None]:
# Open an S3 handle through the boto3 resource API.
s3 = boto3.resource('s3')

# Print the name of every bucket returned by `s3.buckets.all()`, one per line.
all_buckets = s3.buckets.all()
for bucket in all_buckets:
    print(bucket.name)

In [None]:
import torch

# Persist the embedding tensor held in `test` to the tensors directory.
tensor_path = '/home/ubuntu/work/bucket/tensors_abstracts/tensor0-0.pt'
torch.save(test, tensor_path)

In [None]:
# Tokenize every abstract while showing a Dask progress bar.
from dask.diagnostics import ProgressBar

# `df` was read with `columns = 'abstract'` (a single field name), which Dask
# returns as a Series, so apply directly on `df`; `df['abstract']` would index
# into that Series instead of selecting a column.
# NOTE(review): `dask.diagnostics.ProgressBar` may not report progress while
# the distributed Client is the active scheduler; `progress` from
# `dask.distributed` may be needed instead -- confirm.
with ProgressBar():
    tokens = df.apply(tokenize_abstracts, meta=('abstract', 'object')).compute()

In [None]:
# Futures-based pipeline: tokenize each abstract, then embed each result.
#
# Fixes relative to the previous version:
#   * `df` is a Series (it was read with a single `columns` name), so
#     `df['abstract']` is not a column selection; the in-memory `abstracts`
#     are mapped over instead.
#   * `run_inputs` is not defined anywhere in this notebook. `get_embeddings`
#     already runs the model itself, so that intermediate stage is dropped.
#   * `get_embeddings` expects one tokenizer output, not a list of them, so it
#     is mapped per item instead of being submitted once with the whole list.
abstract_texts = abstracts.tolist()

tokenized = client.map(tokenize_abstracts, abstract_texts)
inputs = tokenized  # name kept so existing references to `inputs` still resolve
embeddings = client.map(get_embeddings, tokenized)  # list with one future per abstract

In [None]:
# Embed every abstract while showing a Dask progress bar.
from dask.diagnostics import ProgressBar


def _embed_abstract(abstract):
    """Tokenize one abstract and return its embedding from `get_embeddings`."""
    return get_embeddings(tokenize_abstracts(abstract))


# The previous version read from `ddf`, which is never defined in this
# notebook, and passed raw abstract text to `get_embeddings`, which unpacks
# its argument as tokenizer output. Start from `dd_abs` (the partitioned
# abstracts) and tokenize before embedding.
with ProgressBar():
    abstract_embeddings = dd_abs.apply(_embed_abstract, meta=('abstract', 'object')).compute()

In [None]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

In [None]:
# Connection settings for the remote Chroma server (REST API).
chroma_settings = Settings(
    chroma_api_impl="rest",
    chroma_server_host="54.175.241.78",  # EC2 instance public IPv4
    chroma_server_http_port=8000,
)

# Create the Chroma client from those settings.
chroma = chromadb.Client(chroma_settings)

# The heartbeat is a nanosecond timestamp; useful for checking that the
# client is still connected to the server.
heartbeat_ns = chroma.heartbeat()
print("Nanosecond heartbeat on server", heartbeat_ns)

# Last expression: show the collections that already exist on the server.
chroma.list_collections()